In [None]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Blanket filter: every Python warning is suppressed from here on.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

<h2>Reading Data</h2>

In [None]:
# Load the CSV into a DataFrame; in addition to pandas' defaults, the strings
# listed in `missing_values` are parsed as NaN.
missing_values = ["n/a", "na", "undefined"]
df = pd.read_csv(
    "../data/AdSmartABdata.csv",
    na_values=missing_values,
)
# Preview the first rows of the loaded frame.
df.head()

<h2>General Statistics</h2>

In [None]:
# Total number of elements in df (rows x columns).
df.size

In [None]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

In [None]:
# Column names with their non-null counts and data types.
df.info()

In [None]:
# Descriptive statistics (central tendency and dispersion) for the numerical
# columns of df; NaN values are excluded.
df.describe()

<h2>Univariate Analysis</h2>

<h3>Auction Id</h3>

In [None]:
# Number of distinct auction ids, displayed next to the total row count.
# The ids are unique per row only if the two numbers are equal, so showing
# both makes the uniqueness conclusion verifiable from this cell's output.
unique_counts = df['auction_id'].nunique()
unique_counts, len(df)

In [None]:
# Comparing this count with the row count from df.shape above indicates that each auction id is unique.

### Experiment

In [None]:
# Number of rows for each distinct value of the `experiment` column.
counts_df = df['experiment'].value_counts()
counts_df

In [None]:
# Bar chart of the row count per `experiment` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='experiment', ax=ax)
ax.set_title('Unique value counts of the experiment column')
plt.show()

In [None]:
# This shows us that the control and exposed groups are balanced in terms of numbers.

In [None]:
# Date: number of rows for each distinct value of the `date` column.
counts_df = df['date'].value_counts()
counts_df

In [None]:
# Bar chart of the row count per `date` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='date', ax=ax)
ax.set_title('Unique value counts of the date column')
plt.show()

In [None]:
# From the plot we can infer that the first day received the highest number of visitors of all the days.

### Hour

In [None]:
# Number of rows for each distinct value of the `hour` column.
counts_df = df['hour'].value_counts()
counts_df

In [None]:
# Bar chart of the row count per `hour` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='hour', ax=ax)
ax.set_title('Unique value counts of the hour column')
plt.show()

In [None]:
# Hour 15 was the busiest: its visitor traffic was almost 3 times the average of the other hours.

### Device

In [None]:
# Number of rows for each distinct value of the `device_make` column.
counts_df = df['device_make'].value_counts()
counts_df

In [None]:
# Number of distinct values in the `device_make` column.
df['device_make'].nunique()

In [None]:
### Platform OS

In [None]:
# Number of rows for each distinct value of the `platform_os` column.
counts_df = df['platform_os'].value_counts()
counts_df

In [None]:
# Bar chart of the row count per `platform_os` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='platform_os', ax=ax)
ax.set_title('Unique value counts of the platform_os column')
plt.show()

### Browser

In [None]:
# Number of rows for each distinct value of the `browser` column.
counts_df = df['browser'].value_counts()
counts_df


In [None]:
# Bar chart of the row count per `browser` value; the x tick labels are
# rotated by 45 degrees.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="browser", ax=ax)
ax.set_title('Unique value counts of the browser column')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Chrome browsers are the most used among the users.

### Yes

In [None]:
# Number of rows for each distinct value of the `yes` column.
counts_df = df['yes'].value_counts()
counts_df

In [None]:
# Bar chart of the row count per `yes` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="yes", ax=ax)
ax.set_title('Unique value counts of the yes column')
plt.show()

### No

In [None]:
# Number of rows for each distinct value of the `no` column.
counts_df = df['no'].value_counts()
counts_df

In [None]:
# Bar chart of the row count per `no` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="no", ax=ax)
ax.set_title('Unique value counts of the no column')
plt.show()

## Bivariate Analysis

### Experiment and Date

In [None]:
# Row counts per `date`, split into one bar per `experiment` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="date", hue="experiment", ax=ax)
ax.set_title('Experiment vs Date')
plt.show()

### Experiment and OS

In [None]:
# Row counts per `platform_os`, split into one bar per `experiment` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="platform_os", hue="experiment", ax=ax)
ax.set_title('Experiment vs OS')
plt.show()

### Experiment and Yes

In [None]:
# Row counts per `yes` value, split into one bar per `experiment` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="yes", hue="experiment", ax=ax)
ax.set_title('Experiment vs Yes')
plt.show()

### Experiment and No

In [None]:
# Row counts per `no` value, split into one bar per `experiment` value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="no", hue="experiment", ax=ax)
ax.set_title('Experiment vs No')
plt.show()

### Experiment and Browser

In [None]:
# Row counts per `browser`, split into one bar per `experiment` value; the
# x tick labels are rotated by 45 degrees.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="browser", hue="experiment", ax=ax)
ax.set_title('Experiment vs Browser')
plt.xticks(rotation=45)
plt.show()

### Correlation Analysis

In [None]:
# Correlation heatmap restricted to the numeric (and boolean) columns.
# df also holds string columns such as 'experiment' and 'browser'; on
# pandas >= 2.0 a bare DataFrame.corr() no longer drops them silently and
# raises a ValueError instead, so the numeric subset is selected explicitly.
plt.figure(figsize=(12, 6))
corr = df.select_dtypes(include=["number", "bool"]).corr()
# annot=True writes each correlation coefficient inside its cell.
sns.heatmap(corr, annot=True)
plt.title('Heatmap of correlation for the numerical columns')
plt.show()