In [1]:
import pandas as pd
import numpy as np

# To Preproccesing our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import logging


In [None]:
import warnings
warnings.filterwarnings('ignore')

In [59]:
df = pd.read_csv('../data/AdSmartABdata.csv')
df

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
...,...,...,...,...,...,...,...,...,...
8072,ffea24ec-cec1-43fb-b1d1-8f93828c2be2,exposed,2020-07-05,7,Generic Smartphone,6,Chrome Mobile,0,0
8073,ffea3210-2c3e-426f-a77d-0aa72e73b20f,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0
8074,ffeaa0f1-1d72-4ba9-afb4-314b3b00a7c7,control,2020-07-04,9,Generic Smartphone,6,Chrome Mobile,0,0
8075,ffeeed62-3f7c-4a6e-8ba7-95d303d40969,exposed,2020-07-05,15,Samsung SM-A515F,6,Samsung Internet,0,0


In [None]:
# Print number of rows in data
print("Rows:", len(df))

In [None]:
# Prints Summary of Numerical Data
df.describe()

In [None]:
# Prints Summary of Categorical Data
df.describe(include=[np.object])

# Exploring the categorical columns

In [62]:
relevant_rows = df.query('yes == 1 | no == 1')
relevant_rows

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1
...,...,...,...,...,...,...,...,...,...
8059,ffa08ff9-a132-4051-aef5-01a9c79367bc,exposed,2020-07-05,21,Generic Smartphone,6,Chrome Mobile,1,0
8063,ffb176df-ecd2-45d3-b05f-05b173a093a7,exposed,2020-07-04,1,Generic Smartphone,6,Chrome Mobile,1,0
8064,ffb79718-6f25-4896-b6b3-e58b80a6e147,control,2020-07-09,7,Generic Smartphone,6,Chrome Mobile,0,1
8069,ffca1153-c182-4f32-9e90-2a6008417497,control,2020-07-10,16,Generic Smartphone,6,Chrome Mobile,0,1


In [158]:
top2_platform = relevant_rows['platform_os'].value_counts().index.to_list()[:2]
top2_platform

[6, 5]

In [165]:
platform_list = []
for platform in top2_platform:
    platforms = relevant_rows.loc[relevant_rows['platform_os'] == platform]
    platform_list.append(platforms)
platform_six = platform_list[0]
platform_five = platform_list[1]

In [215]:
platform_six.to_csv('../data/platform_six.csv')
platform_five.to_csv('../data/platform_five.csv')

In [155]:
top2_browser = relevant_rows['browser'].value_counts().index.to_list()[:2]
top2_browser

['Chrome Mobile', 'Chrome Mobile WebView']

In [156]:
browser_list = []
for browser in top2_browser:
    browsers = relevant_rows.loc[relevant_rows['browser'] == browser]
    browser_list.append(browsers)
chrome_mobile_data = browser_list[0]
chrome_mobile_webview_data = browser_list[1]

In [216]:
chrome_mobile_data.to_csv('../data/chrome_mobile_data.csv')
chrome_mobile_webview_data.to_csv('../data/chrome_mobile_webview_data.csv')

In [169]:
df2 = relevant_rows.drop('no', axis=1)
# df2

In [167]:
df2.rename(columns = {'yes': 'clicked_or_not'}, inplace=True)
# df2

# Splitting Numerical and categorical variables

In [170]:
numerical_column = df2.select_dtypes(exclude="object").columns.tolist()
categorical_column = df2.select_dtypes(include="object").columns.tolist()
print("Numerical Columns:", numerical_column)
print("****************")
print("Categorical Columns:", categorical_column)

Numerical Columns: ['hour', 'platform_os', 'yes']
****************
Categorical Columns: ['auction_id', 'experiment', 'date', 'device_make', 'browser']


# Splitting the columns for one hot encoding and label encoding

In [245]:
to_one_hot_encoding = [col for col in categorical_column if df2[col].nunique() <= 10 and df2[col].nunique() > 2]

# Get Categorical Column names thoose are not in "to_one_hot_encoding"
to_label_encoding = [col for col in categorical_column if not col in to_one_hot_encoding]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)


To One Hot Encoding: ['date', 'browser']
To Label Encoding: ['auction_id', 'experiment', 'device_make']


In [261]:
# We will use built in pandas function "get_dummies()" to simply to encode "to_one_hot_encoding" columns
one_hot_encoded_columns = pd.get_dummies(df2[to_one_hot_encoding])
one_hot_encoded_columns

Unnamed: 0,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10,browser_Chrome,browser_Chrome Mobile,browser_Chrome Mobile WebView,browser_Chrome Mobile iOS,browser_Facebook,browser_Mobile Safari,browser_Mobile Safari UI/WKWebView,browser_Samsung Internet
2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
16,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
20,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
23,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
27,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
8063,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
8064,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
8069,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0


In [267]:
le = LabelEncoder()
df2[to_label_encoding] = df2[to_label_encoding].apply(le.fit_transform)
df2

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes
2,0,1,2020-07-05,2,4,6,Chrome Mobile WebView,0
16,1,1,2020-07-04,16,13,6,Chrome Mobile,1
20,2,1,2020-07-06,8,13,6,Chrome Mobile,0
23,3,0,2020-07-08,4,43,6,Facebook,1
27,4,0,2020-07-03,15,13,6,Chrome Mobile,0
...,...,...,...,...,...,...,...,...
8059,1238,1,2020-07-05,21,13,6,Chrome Mobile,1
8063,1239,1,2020-07-04,1,13,6,Chrome Mobile,1
8064,1240,0,2020-07-09,7,13,6,Chrome Mobile,0
8069,1241,0,2020-07-10,16,13,6,Chrome Mobile,0


In [268]:
# Label Encoding

label_encoded_columns = []
# For loop for each columns
for col in to_label_encoding:
    # We define new label encoder to each new column
    le = LabelEncoder()
    # Encode our data and create new Dataframe of it, 
    # notice that we gave column name in "columns" arguments
    column_dataframe = pd.DataFrame(le.fit_transform(df2[col]), columns=[col] )
    # and add new DataFrame to "label_encoded_columns" list
    label_encoded_columns.append(column_dataframe)

# Merge all data frames
label_encoded_columns = pd.concat(label_encoded_columns, axis=1)
label_encoded_columns

Unnamed: 0,auction_id,experiment,device_make
0,0,1,4
1,1,1,13
2,2,1,13
3,3,0,43
4,4,0,13
...,...,...,...
1238,1238,1,13
1239,1239,1,13
1240,1240,0,13
1241,1241,0,13


In [286]:
# Copy our DataFrame to X variable
X = df2.copy()
X.drop(['date', 'browser'], axis=1, inplace=True)
# Merge DataFrames
X = pd.concat([X, one_hot_encoded_columns], axis=1)
print("All columns:", X.columns.tolist())
X

All columns: ['auction_id', 'experiment', 'hour', 'device_make', 'platform_os', 'yes', 'date_2020-07-03', 'date_2020-07-04', 'date_2020-07-05', 'date_2020-07-06', 'date_2020-07-07', 'date_2020-07-08', 'date_2020-07-09', 'date_2020-07-10', 'browser_Chrome', 'browser_Chrome Mobile', 'browser_Chrome Mobile WebView', 'browser_Chrome Mobile iOS', 'browser_Facebook', 'browser_Mobile Safari', 'browser_Mobile Safari UI/WKWebView', 'browser_Samsung Internet']


Unnamed: 0,auction_id,experiment,hour,device_make,platform_os,yes,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,...,date_2020-07-09,date_2020-07-10,browser_Chrome,browser_Chrome Mobile,browser_Chrome Mobile WebView,browser_Chrome Mobile iOS,browser_Facebook,browser_Mobile Safari,browser_Mobile Safari UI/WKWebView,browser_Samsung Internet
2,0,1,2,4,6,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
16,1,1,16,13,6,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
20,2,1,8,13,6,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
23,3,0,4,43,6,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
27,4,0,15,13,6,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,1238,1,21,13,6,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
8063,1239,1,1,13,6,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
8064,1240,0,7,13,6,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
8069,1241,0,16,13,6,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [287]:
# Define Y (This is the value we will predict)
y = X["yes"]

# Droping "class" from X
X.drop(["yes"], axis=1, inplace=True)
# X

In [288]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.23)

In [289]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

(870, 21)
(373, 21)
(258, 21)


# Logistic Regression

In [290]:
# Define Logistic Regression Model
log = LogisticRegression()
# We fit our model with our train data
log.fit(X_train, y_train)
# Then predict results from X_test data
pred_log = log.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_log[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 0 0 0 0 0 0 0 1 0]
Actual: 838     0
5127    0
2778    0
1045    0
6316    0
4991    0
3962    1
118     1
7159    0
1319    1
Name: yes, dtype: int64


# Decision Trees

In [291]:
# Define Decision Tree Model
dt = DecisionTreeClassifier()
# We fit our model with our train data
dt.fit(X_train, y_train)
# Then predict results from X_test data
pred_dt = dt.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_dt[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 0 0 1 0 0 1 0 1 1]
Actual: 838     0
5127    0
2778    0
1045    0
6316    0
4991    0
3962    1
118     1
7159    0
1319    1
Name: yes, dtype: int64


# XGBoost

In [295]:
# Define XGBoost Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
# We fit our model with our train data
xgb.fit(
    X_train, y_train,
    # That means if model don't improve it self in 5 rounds, it will stop learning
    # So you can save your time and don't overtrain your model.
    early_stopping_rounds=5,
    # We provide Test data's to evaluate model performance
    eval_set=[(X_test, y_test)],
    verbose=False
 )
# Then predict results from X_test data
pred_xgb = xgb.predict(X_test)

# See First 10 Predictions and They Actual Values
# print("Predicted:", pred_xgb[0:10])
print("Actual:", y_test[0:10])

Actual: 838     0
5127    0
2778    0
1045    0
6316    0
4991    0
3962    1
118     1
7159    0
1319    1
Name: yes, dtype: int64


# RandomForest

In [298]:
# Define Random Forest Model
rf = RandomForestClassifier(n_estimators=100)

# We fit our model with our train data
rf.fit(X_train, y_train)

# Then predict results from X_test data
pred_rf = rf.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])

Predicted: [1 0 0 0 0 1 0 0 0 0]
Actual: 838     0
5127    0
2778    0
1045    0
6316    0
4991    0
3962    1
118     1
7159    0
1319    1
Name: yes, dtype: int64


## Confusion Matrices

In [299]:
cm_log = confusion_matrix(y_test, pred_log)
print("Logistic Regression:\n", cm_log)

# Desicion Tree
cm_dt = confusion_matrix(y_test, pred_dt)
print("Desicion Tree:\n", cm_dt)

# XGBoost 
cm_xgb = confusion_matrix(y_test, pred_xgb)
print("XGBoost:\n", cm_xgb)

# Random Forest 
cm_rf = confusion_matrix(y_test, pred_rf)
print("Random Forest:\n", cm_rf)

Logistic Regression:
 [[150  53]
 [129  41]]
Desicion Tree:
 [[108  95]
 [ 82  88]]
XGBoost:
 [[179  24]
 [156  14]]
Random Forest:
 [[117  86]
 [ 95  75]]


## Accuracy scores

In [300]:
# Logistic Regression
acc_log = accuracy_score(y_test, pred_log)
print("Logistic Regression accuracy:", acc_log)

# Desicion Tree
acc_dt = accuracy_score(y_test, pred_dt)
print("Desicion Tree:", acc_dt)

# XGBoost 
acc_xgb = accuracy_score(y_test, pred_xgb)
print("XGBoost:", acc_xgb)

# Random Forest 
acc_rf = accuracy_score(y_test, pred_rf)
print("Random Forest:", acc_rf)

Logistic Regression accuracy: 0.5120643431635389
Desicion Tree: 0.5254691689008043
XGBoost: 0.517426273458445
Random Forest: 0.514745308310992
