In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys

from fast_ml.model_development import train_valid_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn import metrics 
from sklearn import tree

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any solver-convergence or deprecation
# warnings raised by the cells below are hidden as well.
warnings.filterwarnings('ignore')

### Split data by browser and platform_os, and version each split as a new version of the data in dvc.

In [3]:
# Load the response-labelled dataset and preview its first rows.
response_csv_path = '../data/data_with_response.csv'
df_split = pd.read_csv(response_csv_path)
df_split.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,user_response
0,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1
1,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1
2,018af862-486e-4da1-a85b-71872120e57c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,1
3,023ec7b5-cb8f-49a5-995f-e0d7c2f702e5,exposed,2020-07-09,13,Samsung SM-G935F,6,Facebook,1
4,02efdb70-8596-4f3f-b0b2-b91e194f61f7,exposed,2020-07-05,6,Generic Smartphone,6,Chrome Mobile,1


In [4]:
# Integer-encode every feature column and the target, one column at a time.
# The same LabelEncoder instance is re-fitted for each column, so once this cell
# has run it only remembers the classes of the last column ('user_response').
label_encoder = LabelEncoder()
columns_to_encode = [
    'experiment',
    'date',
    'hour',
    'device_make',
    'platform_os',
    'browser',
    'user_response',
]
for column_name in columns_to_encode:
    df_split[column_name] = label_encoder.fit_transform(df_split[column_name])

#### Take Data With Browsers and Platforms

In [5]:
# Platform-OS version of the data: work on a copy of the encoded frame and
# remove the 'browser' column so only 'platform_os' remains.
df_Platform_copied = df_split.copy()
df_platform = df_Platform_copied.drop(columns=['browser'])
df_platform.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,user_response
0,008aafdf-deef-4482-8fec-d98e3da054da,1,1,16,13,1,1
1,00b6fadb-10bd-49e3-a778-290da82f7a8d,0,5,4,43,1,1
2,018af862-486e-4da1-a85b-71872120e57c,0,0,15,13,1,1
3,023ec7b5-cb8f-49a5-995f-e0d7c2f702e5,1,6,13,65,1,1
4,02efdb70-8596-4f3f-b0b2-b91e194f61f7,1,2,6,13,1,1


In [6]:
# Write the platform_os version of the data to disk, without the index column.
df_platform.to_csv('../data/data_with_platform.csv', index=False)

In [7]:
# Browser version of the data: work on a copy of the encoded frame and remove
# the 'platform_os' column so only 'browser' remains.
df_os_copied = df_split.copy()
df_os = df_os_copied.drop(columns=['platform_os'])
df_os.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,browser,user_response
0,008aafdf-deef-4482-8fec-d98e3da054da,1,1,16,13,1,1
1,00b6fadb-10bd-49e3-a778-290da82f7a8d,0,5,4,43,4,1
2,018af862-486e-4da1-a85b-71872120e57c,0,0,15,13,1,1
3,023ec7b5-cb8f-49a5-995f-e0d7c2f702e5,1,6,13,65,4,1
4,02efdb70-8596-4f3f-b0b2-b91e194f61f7,1,2,6,13,1,1


In [8]:
# Write the browser version of the data to disk, without the index column.
# NOTE(review): despite the "_os" file name, df_os had 'platform_os' dropped and
# keeps the 'browser' column.
df_os.to_csv('../data/data_with_os.csv', index=False)

### Access Data From DVC

In [9]:

import dvc.api

# Read the DVC-tracked browser version of the data (stored as data_with_os.csv).
with dvc.api.open('data/data_with_os.csv', mode='rb') as data:
    df_with_browser = pd.read_csv(data)

# 'auction_id' is an identifier column and is not used as a model feature.
df_with_browser = df_with_browser.drop(columns=['auction_id'])

# Logistic Regression

### For Data Version With Browser

In [10]:
# Preview the browser dataset that the models below are trained on.
df_with_browser.head()

Unnamed: 0,experiment,date,hour,device_make,browser,user_response
0,1,1,16,13,1,1
1,0,5,4,43,4,1
2,0,0,15,13,1,1
3,1,6,13,65,4,1
4,1,2,6,13,1,1


In [11]:
# (rows, columns) of the browser dataset.
df_with_browser.shape

(1243, 6)

In [12]:
# Split the browser dataset 70/20/10 into train / validation / test partitions.
# A fixed random_state keeps the partitions, and therefore every score computed
# from them, reproducible across kernel restarts.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    df_with_browser,
    target='user_response',
    train_size=0.7,
    valid_size=0.2,
    test_size=0.1,
    random_state=42,
)

# Print the shapes one statement at a time; ending the cell with a tuple of
# print() calls would make the notebook echo a stray "(None, None)".
for part in (X_train, y_train, X_valid, y_valid, X_test, y_test):
    print(part.shape)

(870, 5)
(870,)
(248, 5)
(248,)
(125, 5)
(125,)


(None, None)

### Create Model

### Model Evaluation

In [13]:
# Fit a default logistic regression on the browser training partition and keep
# its predictions for the held-out test partition.
model = LogisticRegression().fit(X_train, y_train)
model_predictions = model.predict(X_test)

# 5-fold cross-validated accuracy, computed on the training partition only.
result = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='accuracy',
)

print(result)
print('Mean Logistic Regression score for browser data :', result.mean())



[0.48850575 0.52873563 0.52873563 0.55172414 0.56321839]
Mean Logistic Regression score for browser data : 0.5321839080459769


In [14]:
# Mean of the cross-validated logistic-regression accuracies held in `result`.
print(f"----Accuracy: {result.mean()}----")

----Accuracy: 0.5321839080459769----


### For Data Version With Platform Os

In [15]:
# Read the DVC-tracked platform_os version of the data.
with dvc.api.open('data/data_with_platform.csv', mode='rb') as data:
    df_with_platform_os = pd.read_csv(data)

# 'auction_id' is an identifier column and is not used as a model feature.
df_with_platform_os = df_with_platform_os.drop(columns=['auction_id'])

In [16]:
# Preview the platform_os dataset that the "...2" models below are trained on.
df_with_platform_os.head()

Unnamed: 0,experiment,date,hour,device_make,platform_os,user_response
0,1,1,16,13,1,1
1,0,5,4,43,1,1
2,0,0,15,13,1,1
3,1,6,13,65,1,1
4,1,2,6,13,1,1


In [17]:
# Split the platform_os dataset 70/20/10 into train / validation / test partitions.
# A fixed random_state keeps the partitions, and therefore every score computed
# from them, reproducible across kernel restarts.
X_train2, y_train2, X_valid2, y_valid2, X_test2, y_test2 = train_valid_test_split(
    df_with_platform_os,
    target='user_response',
    train_size=0.7,
    valid_size=0.2,
    test_size=0.1,
    random_state=42,
)

# Print the shapes one statement at a time; ending the cell with a tuple of
# print() calls would make the notebook echo a stray "(None, None)".
for part in (X_train2, y_train2, X_valid2, y_valid2, X_test2, y_test2):
    print(part.shape)

(870, 5)
(870,)
(248, 5)
(248,)
(125, 5)
(125,)


(None, None)

In [18]:

# Repeated K-fold splitter: 10 folds, repeated 5 times, with a fixed seed.
# NOTE(review): no cell visible in this notebook passes `cv2` to anything; the
# cross_val_score calls all use cv=5.
cv2 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)


In [19]:
# Fit a default logistic regression on the platform_os training partition and
# keep its predictions for the matching held-out test partition.
model2 = LogisticRegression()
model2.fit(X_train2, y_train2)
model_predictions2 = model2.predict(X_test2)

# 5-fold cross-validated accuracy, computed on the training partition only.
result2 = cross_val_score(estimator=model2, X=X_train2, y=y_train2, cv=5, scoring='accuracy')

print(result2)
# The label names the dataset this model was actually trained on (platform_os).
print('Mean Logistic Regression score for platform os data :', result2.mean())



[0.52873563 0.55172414 0.59195402 0.54597701 0.56321839]
Mean Logistic Regression score for Browser data : 0.5563218390804597


In [20]:
# Mean of the cross-validated logistic-regression accuracies held in `result2`.
print(f"----Accuracy: {result2.mean()}----")

----Accuracy: 0.5563218390804597----


# Decision Tree

### For Data Version With Browser

In [21]:

# Decision tree on the browser training partition (X_train / y_train); its
# test-partition predictions are kept for the final accuracy comparison.
clf = tree.DecisionTreeClassifier()
decision_tree_model = clf.fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

# 5-fold cross-validated accuracy, computed on the training partition only.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5, scoring='accuracy')

print(scores)
# The label names the model and dataset actually used in this cell.
print('Mean Decision Tree score for browser data :', scores.mean())



[0.5        0.54022989 0.53448276 0.51724138 0.51149425]
Mean Logistic Regression score for platform os data : 0.5206896551724138


In [22]:
# Mean of the cross-validated decision-tree accuracies held in `scores`.
print(f"----Accuracy: {scores.mean()}----")

----Accuracy: 0.5206896551724138----


### For Data Version With Platform Os

In [23]:

# Decision tree on the platform_os training partition (X_train2 / y_train2); its
# test-partition predictions are kept for the final accuracy comparison.
clf2 = tree.DecisionTreeClassifier()
# NOTE(review): this rebinds `decision_tree_model`, which the browser-data
# decision-tree cell also assigns.
decision_tree_model = clf2.fit(X_train2, y_train2)
clf_predictions2 = clf2.predict(X_test2)

# 5-fold cross-validated accuracy, computed on the training partition only.
scores2 = cross_val_score(estimator=clf2, X=X_train2, y=y_train2, cv=5, scoring='accuracy')

print(scores2)
# The label names the model actually used in this cell.
print('Mean Decision Tree score for platform os data :', scores2.mean())


[0.51149425 0.55172414 0.54022989 0.58045977 0.52873563]
Mean Logistic Regression score for platform os data : 0.5425287356321838


In [24]:
# Mean of the cross-validated decision-tree accuracies held in `scores2`.
print(f"----Accuracy: {scores2.mean()}----")

----Accuracy: 0.5425287356321838----


# XGBoost

### For Data Version With Browser

In [25]:
# Preview of the platform_os dataset.
# NOTE(review): the XGBoost cell that follows trains on X_train / y_train, which
# were split from df_with_browser, not from this frame.
df_with_platform_os.head()

Unnamed: 0,experiment,date,hour,device_make,platform_os,user_response
0,1,1,16,13,1,1
1,0,5,4,43,1,1
2,0,0,15,13,1,1
3,1,6,13,65,1,1
4,1,2,6,13,1,1


In [26]:

# Gradient-boosted trees on the browser training partition (X_train / y_train).
xgbs = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# Early stopping is monitored on the validation partition. Monitoring the test
# partition instead would choose the number of trees using the same rows that
# are later used to report test accuracy.
xgbs.fit(
    X_train, y_train,
    early_stopping_rounds=5,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)
xgbs_predictions = xgbs.predict(X_test)

# 5-fold cross-validated accuracy, computed on the training partition only.
scores_xgb = cross_val_score(estimator=xgbs, X=X_train, y=y_train, cv=5, scoring='accuracy')

print(scores_xgb)
# The label names the model and dataset actually used in this cell.
print('Mean XGBoost score for browser data :', scores_xgb.mean())



[0.52298851 0.54022989 0.50574713 0.54597701 0.56321839]
Mean Logistic Regression score for platform os data : 0.535632183908046


In [27]:
# Mean of the cross-validated XGBoost accuracies held in `scores_xgb`.
print(f"----Accuracy: {scores_xgb.mean()}----")

----Accuracy: 0.535632183908046----


### For Data Version With Platform Os

In [28]:

# Gradient-boosted trees on the platform_os training partition (X_train2 / y_train2).
xgbs2 = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# Early stopping is monitored on the validation partition. Monitoring the test
# partition instead would choose the number of trees using the same rows that
# are later used to report test accuracy.
xgbs2.fit(
    X_train2, y_train2,
    early_stopping_rounds=5,
    eval_set=[(X_valid2, y_valid2)],
    verbose=False,
)
xgbs_predictions2 = xgbs2.predict(X_test2)

# 5-fold cross-validated accuracy, computed on the training partition only.
scores_xgb2 = cross_val_score(estimator=xgbs2, X=X_train2, y=y_train2, cv=5, scoring='accuracy')

print(scores_xgb2)
# The label names the model actually used in this cell.
print('Mean XGBoost score for platform os data :', scores_xgb2.mean())



[0.48275862 0.54022989 0.51724138 0.51724138 0.53448276]
Mean Logistic Regression score for platform os data : 0.5183908045977011


In [29]:
# Mean of the cross-validated XGBoost accuracies held in `scores_xgb2`.
print(f"----Accuracy: {scores_xgb2.mean()}----")

----Accuracy: 0.5183908045977011----


# Random Forest

### For Browser Data

In [30]:
# Random forest (100 trees) on the browser training partition (X_train / y_train);
# its test-partition predictions are kept for the final accuracy comparison.
clf_rf = RandomForestClassifier(n_estimators=100)
clf_rf.fit(X_train, y_train)
clf_rf_predictions = clf_rf.predict(X_test)

# 5-fold cross-validated accuracy, computed on the training partition only.
result_rf = cross_val_score(estimator=clf_rf, X=X_train, y=y_train, cv=5, scoring='accuracy')

print(result_rf)
# The label names the model and dataset actually used in this cell.
print('Mean Random Forest score for browser data :', result_rf.mean())

[0.47701149 0.5        0.50574713 0.53448276 0.55747126]
Mean Logistic Regression score for platform os data : 0.5149425287356322


#### Accuracy (mean 5-fold cross-validation score from the output above): 0.514943

### For Platform Os Data

In [None]:
# Random forest (100 trees) on the platform_os training partition
# (X_train2 / y_train2); its test-partition predictions are kept for the final
# accuracy comparison.
clf_rf2 = RandomForestClassifier(n_estimators=100)
clf_rf2.fit(X_train2, y_train2)

clf_rf2_predictions2 = clf_rf2.predict(X_test2)

# 5-fold cross-validated accuracy, computed on the training partition only.
result_rf2 = cross_val_score(estimator=clf_rf2, X=X_train2, y=y_train2, cv=5, scoring='accuracy')

print(result_rf2)
# The label names the model actually used in this cell.
print('Mean Random Forest score for platform os data :', result_rf2.mean())

#### Accuracy: 0.998851

# Compute feature importance

## Logistic Regression

#### For Browser Data

In [None]:
# Table of the browser-data logistic regression coefficients, indexed by feature
# name and sorted from the largest to the smallest signed coefficient (so large
# negative weights end up at the bottom).
logistic_imp_feature = pd.DataFrame(
    {"Feature Importance": model.coef_[0]},
    index=pd.Index(['experiment', 'date', 'hour', 'device_make', 'browser'], name="Feature"),
).sort_values(by=["Feature Importance"], ascending=False)
logistic_imp_feature

#### Important Feature: hour

#### For Platform Os Data

In [None]:
# Table of the platform_os-data logistic regression coefficients, indexed by
# feature name and sorted from the largest to the smallest signed coefficient
# (so large negative weights end up at the bottom).
logistic_imp_feature_os = pd.DataFrame(
    {"Feature Importance": model2.coef_[0]},
    index=pd.Index(['experiment', 'date', 'hour', 'device_make', 'platform_os'], name="Feature"),
).sort_values(by=["Feature Importance"], ascending=False)
logistic_imp_feature_os

#### Important Feature: platform_os

## Decision Tree

#### For Browser Data

In [None]:
# Feature importances of the browser-data decision tree (`clf`), indexed by
# feature name and sorted from most to least important.
decision_imp_feature_browser = pd.DataFrame(
    {"Feature Importance": clf.feature_importances_},
    index=pd.Index(["experiment", "date", "hour", "device_make", "browser"], name="Feature"),
).sort_values(by=["Feature Importance"], ascending=False)
decision_imp_feature_browser

#### Important Feature: hour

### For Platform Os Data

In [None]:
# Feature importances of the platform_os-data decision tree (`clf2`), indexed by
# feature name and sorted from most to least important.
decision_imp_feature_platform = pd.DataFrame(
    {"Feature Importance": clf2.feature_importances_},
    index=pd.Index(["experiment", "date", "hour", "device_make", "platform_os"], name="Feature"),
).sort_values(by=["Feature Importance"], ascending=False)
decision_imp_feature_platform

#### Important Feature: hour

## XGBoost

### For Browser Data

In [None]:
# Feature importances of the browser-data XGBoost model. This must read `xgbs`
# (fitted on X_train, which has the 'browser' column); `xgbs2` was fitted on the
# platform_os data and its importances do not belong to these feature names.
xgb_imp_feature_browser = pd.DataFrame({"Feature Importance": xgbs.feature_importances_})
xgb_imp_feature_browser["Feature"] = ["experiment", "date", "hour", "device_make", "browser"]
xgb_imp_feature_browser = xgb_imp_feature_browser.set_index("Feature")
xgb_imp_feature_browser = xgb_imp_feature_browser.sort_values(by=["Feature Importance"], ascending=False)
xgb_imp_feature_browser

#### Important Feature: device_make

### For Platform Os Data

In [None]:
# Feature importances of the platform_os-data XGBoost model (`xgbs2`), indexed by
# feature name and sorted from most to least important.
xgb_imp_feature_platform = pd.DataFrame(
    {"Feature Importance": xgbs2.feature_importances_},
    index=pd.Index(["experiment", "date", "hour", "device_make", "platform_os"], name="Feature"),
).sort_values(by=["Feature Importance"], ascending=False)
xgb_imp_feature_platform

#### Important Feature: device_make

# Random Forest

### For Browser Data

In [None]:
# Feature importances of the browser-data random forest (`clf_rf`), indexed by
# feature name and sorted from most to least important.
rand_imp_feature_browser = pd.DataFrame(
    {"Feature Importance": clf_rf.feature_importances_},
    index=pd.Index(["experiment", "date", "hour", "device_make", "browser"], name="Feature"),
).sort_values(by=["Feature Importance"], ascending=False)
rand_imp_feature_browser

#### Important Feature: hour

### For Platform Os Data

In [None]:
# Feature importances of the platform_os-data random forest. This must read
# `clf_rf2` (fitted on X_train2); `clf_rf` is the browser-data forest, so using it
# here would just repeat the browser table under platform_os labels.
rand_imp_feature_platform = pd.DataFrame({"Feature Importance": clf_rf2.feature_importances_})
# The last feature is 'platform_os' (the column name in the platform_os dataset).
rand_imp_feature_platform["Feature"] = ["experiment", "date", "hour", "device_make", "platform_os"]
rand_imp_feature_platform = rand_imp_feature_platform.set_index("Feature")
rand_imp_feature_platform = rand_imp_feature_platform.sort_values(by=["Feature Importance"], ascending=False)
rand_imp_feature_platform

#### Important Feature: hour

### How do the relevant features contribute to the goal of gaining more “Yes” results?

****************************************************************************************************************************************
### Across the eight models above, the most relevant features are "hour", "platform_os" and "device_make". These features can therefore contribute to the goal of getting more "YES" responses if ad delivery is targeted on them.

### For instance, the following measures might be worthwhile given these outputs:
     1. Showing the ads to users during the hours in which most users respond "YES" to the ads.
     2. Showing the ads to users on the devices from which most users respond "YES" to the ads, or crafting dedicated strategies for users whose devices have many "NO" responses.


****************************************************************************************************************************************


# Which data features are relevant to predicting the target variable?

In [None]:

# Held-out test accuracy of every model. Each set of predictions is scored
# against the labels of the test partition it was predicted from: the browser
# models against y_test, the platform_os models ("...2") against y_test2. The two
# datasets are split separately, so y_test is not the right reference for
# predictions made on X_test2.
print("***********************")
print("Prediction Accuracy")
print("***********************")

# Logistic Regression
accuracy_logistic = accuracy_score(y_test, model_predictions)
accuracy_logistic2 = accuracy_score(y_test2, model_predictions2)  # for platform os data
# Decision Tree
accuracy_decision = accuracy_score(y_test, clf_predictions)
accuracy_decision2 = accuracy_score(y_test2, clf_predictions2)  # for platform os data
# XGBoost
accuracy_xgb = accuracy_score(y_test, xgbs_predictions)
accuracy_xgb2 = accuracy_score(y_test2, xgbs_predictions2)  # for platform os data
# Random Forest
accuracy_rf = accuracy_score(y_test, clf_rf_predictions)
accuracy_rf2 = accuracy_score(y_test2, clf_rf2_predictions2)  # for platform os data

print("Accuracy for Logistic Regression For Browser Data:", accuracy_logistic)
print("Accuracy for Logistic Regression:", accuracy_logistic2)
print(' ')
print("Accuracy for Desicion Tree For Browser Data:", accuracy_decision)
print("Accuracy for Desicion Tree:", accuracy_decision2)
print(' ')

print("Accuracy for XGBoost For Browser Data:", accuracy_xgb)
print("Accuracy for XGBoost:", accuracy_xgb2)
print(' ')

print("Accuracy for Random Forest For Browser Data:", accuracy_rf)
print("Accuracy for Random Forest:", accuracy_rf2)
