In [107]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys

from fast_ml.model_development import train_valid_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn import metrics 

from sklearn import tree


### Split data by browser and platform_os, and version each split as a new version of the data in dvc.

In [14]:
df_split = pd.read_csv('../data/data_with_response.csv')
df_split.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,user_response
0,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1
1,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1
2,018af862-486e-4da1-a85b-71872120e57c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,1
3,023ec7b5-cb8f-49a5-995f-e0d7c2f702e5,exposed,2020-07-09,13,Samsung SM-G935F,6,Facebook,1
4,02efdb70-8596-4f3f-b0b2-b91e194f61f7,exposed,2020-07-05,6,Generic Smartphone,6,Chrome Mobile,1


In [44]:
label_encoder = LabelEncoder()
# 
df_split['experiment'] = label_encoder.fit_transform(df_split['experiment'])
df_split['date'] = label_encoder.fit_transform(df_split['date'])
df_split['hour'] = label_encoder.fit_transform(df_split['hour'])
df_split['device_make'] = label_encoder.fit_transform(df_split['device_make'])
df_split['platform_os'] = label_encoder.fit_transform(df_split['platform_os'])
df_split['browser'] = label_encoder.fit_transform(df_split['browser'])
df_split['user_response'] = label_encoder.fit_transform(df_split['user_response'])

#### Take Data With Browsers and Platforms

In [45]:
df_Platform_copied = df_split.copy()
df_platform = df_Platform_copied.drop(columns='browser', axis=1)
df_platform.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,user_response
0,008aafdf-deef-4482-8fec-d98e3da054da,1,1,16,13,1,1
1,00b6fadb-10bd-49e3-a778-290da82f7a8d,0,5,4,43,1,1
2,018af862-486e-4da1-a85b-71872120e57c,0,0,15,13,1,1
3,023ec7b5-cb8f-49a5-995f-e0d7c2f702e5,1,6,13,65,1,1
4,02efdb70-8596-4f3f-b0b2-b91e194f61f7,1,2,6,13,1,1


In [46]:
df_platform.to_csv('../data/data_with_platform.csv', index=False)

In [47]:
df_os_copied = df_split.copy()
df_os = df_os_copied.drop(columns='platform_os', axis=1)
df_os.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,browser,user_response
0,008aafdf-deef-4482-8fec-d98e3da054da,1,1,16,13,1,1
1,00b6fadb-10bd-49e3-a778-290da82f7a8d,0,5,4,43,4,1
2,018af862-486e-4da1-a85b-71872120e57c,0,0,15,13,1,1
3,023ec7b5-cb8f-49a5-995f-e0d7c2f702e5,1,6,13,65,4,1
4,02efdb70-8596-4f3f-b0b2-b91e194f61f7,1,2,6,13,1,1


In [75]:
df_os.to_csv('../data/data_with_os.csv', index=False)

### Access Data From DVC

In [76]:

import dvc.api

with dvc.api.open(
    'data/data_with_os.csv',
    mode='rb',
    
) as data:
    df_with_browser = pd.read_csv(data)
    df_with_browser.head()
df_with_browser.drop(['auction_id'],axis=1,inplace=True)

# Logistic Regression

### For Data Version With Platform Os

In [77]:
df_with_browser.head()

Unnamed: 0,experiment,date,hour,device_make,browser,user_response
0,1,1,16,13,1,1
1,0,5,4,43,4,1
2,0,0,15,13,1,1
3,1,6,13,65,4,1
4,1,2,6,13,1,1


In [78]:
df_with_browser.shape

(1243, 6)

In [93]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(df_with_browser, target = 'browser', 
                                                                            train_size=0.7, valid_size=0.2, test_size=0.1)
print(X_train.shape), print(y_train.shape)
print(X_valid.shape), print(y_valid.shape)
print(X_test.shape), print(y_test.shape)

(870, 5)
(870,)
(248, 5)
(248,)
(125, 5)
(125,)


(None, None)

### Prepare cross-validation procedure

In [94]:

cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)


### Create Model

In [95]:

model = LogisticRegression()


### Model Evaluation

In [96]:
# evaluate model
# scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)


model = LogisticRegression()
model.fit(X_train, y_train)
result=cross_val_score(estimator=model,X=X_train,y=y_train,cv=5,scoring='accuracy')

print(result)
print('Mean Logistic Regression score for browser data :',result.mean())



[0.66666667 0.71264368 0.7183908  0.73563218 0.70689655]
Mean Logistic Regression score for browser data : 0.7080459770114943


### Accuracy: 0.708046

### For Data Version With Browser

In [97]:
with dvc.api.open(
    'data/data_with_platform.csv',
    mode='rb',
    
) as data:
    df_with_platform_os = pd.read_csv(data)
    df_with_platform_os.head()
df_with_platform_os.drop(['auction_id'],axis=1,inplace=True)

In [98]:
df_with_platform_os.head()

Unnamed: 0,experiment,date,hour,device_make,platform_os,user_response
0,1,1,16,13,1,1
1,0,5,4,43,1,1
2,0,0,15,13,1,1
3,1,6,13,65,1,1
4,1,2,6,13,1,1


In [99]:
X_train2, y_train2, X_valid2, y_valid2, X_test2, y_test2 = train_valid_test_split(df_with_platform_os, target = 'platform_os', 
                                                                            train_size=0.7, valid_size=0.2, test_size=0.1)
print(X_train2.shape), print(y_train2.shape)
print(X_valid2.shape), print(y_valid2.shape)
print(X_test2.shape), print(y_test2.shape)

(870, 5)
(870,)
(248, 5)
(248,)
(125, 5)
(125,)


(None, None)

In [100]:

cv2 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)


In [101]:

model2 = LogisticRegression()


In [102]:
# evaluate model
# scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)


model2 = LogisticRegression()
model2.fit(X_train2, y_train2)
result2=cross_val_score(estimator=model,X=X_train2,y=y_train2,cv=5,scoring='accuracy')

print(result2)
print('Mean Logistic Regression score for platform os data :',result2.mean())



[1.         0.99425287 1.         1.         1.        ]
Mean Logistic Regression score for platform os data : 0.9988505747126437


### Accuracy: 0.9988506

# Decision Tree

### For Data Version With Platform Os

In [112]:
model2 = LogisticRegression()

clf = tree.DecisionTreeClassifier()
decision_tree_model = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy_platform = metrics.accuracy_score(y_test, y_pred)
print(accuracy_platform)

0.744


### Accuracy: 0.744

### For Data Version With Browser

In [111]:
model2 = LogisticRegression()

clf = tree.DecisionTreeClassifier()
decision_tree_model = clf.fit(X_train2, y_train2)

y_pred2 = clf.predict(X_test)

accuracy_browser = metrics.accuracy_score(y_test2, y_pred2)
print(accuracy_browser)

0.944


### Accuracy: 0.944