### Importing Libraries

In [36]:
# import libraries
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from sklearn import preprocessing
from itertools import islice
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as stat
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from six import StringIO 
from IPython.display import Image  
from sklearn.tree import export_graphviz
from IPython.display import Image  
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
import pydotplus
from sklearn import tree

In [7]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')
import dvc.api

In [38]:
# Load the CSV into `data_frame`; the path is relative to the current working directory.
data_frame = pd.read_csv('../Data/ADSmartdata.csv')


In [39]:
# Remove the rows (not columns) in which both `yes` and `no` are 0: select them first,
# then drop them from `data_frame` by index label.
drop_mask = data_frame.query('yes==0 and no==0') # rows where yes and no are both 0
data_frame=data_frame.drop(drop_mask.index)

In [40]:
# Preview the first rows of the filtered frame.
data_frame.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1


In [41]:
# Count the missing values in each column.
data_frame.isna().sum()

auction_id     0
experiment     0
date           0
hour           0
device_make    0
platform_os    0
browser        0
yes            0
no             0
dtype: int64

In [42]:
def drop_no_responds(df):
    """Return the rows of ``df`` in which the user gave an answer.

    A row is treated as "no response" when both its ``yes`` and ``no``
    columns are 0; every other row is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``yes`` and ``no`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the no-response rows. ``df`` itself is not modified.
    """
    # Filter the frame that was passed in rather than any notebook-level frame,
    # so the result always reflects the caller's argument.
    cleaned_df = df.query("not (yes == 0 & no == 0)")
    return cleaned_df

In [43]:
# Build the modelling frame without mutating a filtered slice:
#   1. keep only the rows with a response,
#   2. add the boolean target `aware` (True when `yes` is 1),
#   3. drop the raw answer columns and the auction identifier.
# `assign` returns a new frame, which avoids writing a column into the result of a
# filter (the chained-assignment pattern pandas warns about).
cleaned_df = (
    drop_no_responds(data_frame)
    .assign(aware=lambda frame: frame['yes'].eq(1))
    .drop(columns=['yes', 'no', 'auction_id'])
)

### Splitting the data by browser and platform_os, and versioning each split as a new version of the data in DVC

In [44]:
# Two column subsets of `cleaned_df` that differ in a single column:
# one keeps `browser`, the other keeps `platform_os`.
# NOTE(review): `broweser_df` / `platfrom_df` look like misspellings of "browser" / "platform";
# the names are left unchanged here in case other cells refer to them.
broweser_df = cleaned_df[["experiment", "hour", "date", 'device_make', 'browser', 'aware']]
platfrom_df = cleaned_df[["experiment", "hour", "date", 'device_make', 'platform_os', 'aware']]

### Logistic regression

In [45]:
# Custom wrapper around sklearn's LogisticRegression that also computes p-values for the fitted coefficients

class LogisticReg:
    """Wrapper around ``LogisticRegression`` that also derives coefficient p-values.

    All positional and keyword arguments are forwarded unchanged to
    ``LogisticRegression``. After ``fit`` the instance exposes ``z_scores``,
    ``p_values``, ``sigma_estimates`` and ``F_ij``.
    """

    def __init__(self, *args, **kwargs):
        self.model = LogisticRegression(*args, **kwargs)

    def fit(self, X, y):
        """Fit the wrapped model and compute a Wald test for each coefficient.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix (a DataFrame or an ndarray).
        y : array-like of shape (n_samples,)
            Binary target.

        Returns
        -------
        tuple
            ``(fitted_model, p_values)`` where ``p_values`` is a list holding
            one two-sided p-value per feature coefficient.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the information matrix is singular and cannot be inverted.
        """
        self.model.fit(X, y)

        # Plain float array so the matrix algebra below does not depend on
        # whether X was a DataFrame or an ndarray.
        features = np.asarray(X, dtype=float)

        # 2 * (1 + cosh(z)) equals 1 / (p * (1 - p)) for p = sigmoid(z), so
        # dividing each row by it weights that sample by p * (1 - p).
        denom = 2.0 * (1.0 + np.cosh(np.asarray(self.model.decision_function(X))))
        F_ij = np.dot((features / denom[:, np.newaxis]).T, features)

        # The inverse of the information matrix bounds the covariance of the
        # estimates; its diagonal gives the per-coefficient variances.
        # NOTE(review): the matrix is built from the feature columns only, so an
        # intercept fitted by the model is not represented here - confirm intent.
        Cramer_Rao = np.linalg.inv(F_ij)
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates

        # Two-sided test: a coefficient may differ from zero in either direction,
        # so the single-tail probability is doubled.
        p_values = [2.0 * stat.norm.sf(abs(z)) for z in z_scores]

        self.z_scores = z_scores
        self.p_values = p_values
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij
        return self.model, p_values

In [46]:
# create an awareness column for the dataframe
def label_awareness(row):
    """Map one row's answer to a string label.

    Returns ``'1'`` when ``row['yes']`` is 1, otherwise ``'0'`` when
    ``row['no']`` is 1, and ``None`` when neither holds.
    """
    answered_yes = row['yes'] == 1
    if answered_yes:
        return '1'
    # `no` is only inspected once `yes` has been ruled out.
    return '0' if row['no'] == 1 else None
def calculate_metrics(y_test, y_preds):
    """Return the accuracy of ``y_preds`` measured against ``y_test``."""
    return accuracy_score(y_test, y_preds)

## Function for plotting confusion matrix

In [47]:
# Plot the confusion matrix of a model. It accepts the actual values and the predictions for the target variable
def draw_confusion_metrics(y_test, y_preds):
    """Draw the confusion matrix of ``y_preds`` against ``y_test`` as an annotated heatmap.

    Rows are labelled "Actual" and columns "Predicted"; the x-axis label is
    placed at the top of the plot.
    """
    confusion_table = pd.DataFrame(metrics.confusion_matrix(y_test, y_preds))
    heatmap_axes = sns.heatmap(
        confusion_table,
        annot=True,
        cmap="YlGnBu",
        fmt='g',
    )
    heatmap_axes.xaxis.set_label_position("top")
    # The pyplot calls act on the current axes, so their order is kept as-is.
    plt.tight_layout()
    plt.title('Confusion matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

## Function for getting the best parameters for a model

In [48]:
def _build_classifier(classifier, param):
    """Instantiate the estimator named by ``classifier`` for one parameter candidate."""
    if classifier == 'Decision Tree':
        return DecisionTreeClassifier(**param)
    if classifier == 'XGBoost':
        return XGBClassifier(**param)
    if classifier == 'Logistic':
        return LogisticReg(solver=param)
    raise ValueError('Unknown classifier: {!r}'.format(classifier))


def get_best_model(classifier, params, x=None, y=None, n_splits=5):
    """Pick the parameter candidate with the best mean K-fold validation accuracy.

    Parameters
    ----------
    classifier : str
        One of ``'Decision Tree'``, ``'XGBoost'`` or ``'Logistic'``.
    params : iterable
        Parameter candidates. For the tree and XGBoost classifiers each
        candidate is a dict of keyword arguments; for ``'Logistic'`` each
        candidate is a solver name.
    x : pandas.DataFrame, optional
        Training features. Defaults to the notebook-level ``x_train``.
    y : pandas.Series, optional
        Training target. Defaults to the notebook-level ``y_train``.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(best_model, best_param, best_avg_score, p_value)``. ``best_model``
        is the estimator fitted on the last fold of the winning candidate.
        ``p_value`` holds that fit's p-values for ``'Logistic'`` and is an
        empty list for the other classifiers.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    features = x_train if x is None else x
    target = y_train if y is None else y

    # Identifier, raw-answer and date columns are not model inputs. Missing
    # names are ignored so the same list works whether the identifier column
    # is called `id` or `auction_id`.
    non_feature_columns = ['id', 'auction_id', 'yes', 'no', 'date']

    best_avg_score = 0
    best_param = ''
    best_model = None
    p_value = []
    kf = KFold(n_splits=n_splits)

    # `candidate_num` counts parameter candidates, one printed line per candidate.
    for candidate_num, param in enumerate(params, start=1):
        p_values = []
        scores = []
        for train_idx, validation_idx in kf.split(features):
            x_training = (
                features.iloc[train_idx]
                .drop(columns=non_feature_columns, errors='ignore')
                .astype(float)
            )
            # Validation features get the same dtype as the training features.
            x_validation = (
                features.iloc[validation_idx]
                .drop(columns=non_feature_columns, errors='ignore')
                .astype(float)
            )
            y_training = target.iloc[train_idx].astype(float)
            y_validation = target.iloc[validation_idx].astype(float)

            clf = _build_classifier(classifier, param)
            if classifier == 'Logistic':
                # LogisticReg.fit returns (fitted sklearn model, p-values).
                clf, p_values = clf.fit(x_training, y_training)
            else:
                clf = clf.fit(x_training, y_training)

            y_preds = clf.predict(x_validation)
            scores.append(accuracy_score(y_validation, y_preds))

        avg = sum(scores) / len(scores)
        print('Fold', candidate_num, 'average', avg)
        if avg > best_avg_score:
            best_avg_score = avg
            best_param = param
            best_model = clf
            p_value = p_values

    return best_model, best_param, best_avg_score, p_value


In [50]:
# Label every row with label_awareness and store the result in the `aware` column.
data_frame['aware'] = data_frame.apply(label_awareness, axis=1)
data_frame.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no,awar,aware
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1,0,0
16,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,6,Chrome Mobile,1,0,1,1
20,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,6,Chrome Mobile,0,1,0,0
23,00b6fadb-10bd-49e3-a778-290da82f7a8d,control,2020-07-08,4,Samsung SM-A202F,6,Facebook,1,0,1,1
27,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,1,0,0


### Change categorical variables to numerical value

In [52]:
# Replace the categorical columns with integer codes.
# The same LabelEncoder instance is re-fitted for each column, so after this cell
# `lb` only holds the classes of the last column fitted (device_make).
lb = LabelEncoder()
data_frame['experiment'] = lb.fit_transform(data_frame['experiment'])
data_frame['browser'] = lb.fit_transform(data_frame['browser'])
data_frame['device_make'] = lb.fit_transform(data_frame['device_make'])

### Split the dataset into training (90%) and test (10%) sets
The training set will be further divided into training and validation folds.

In [54]:
# separate features and target columns
data_x = data_frame.loc[:, data_frame.columns != 'aware']
data_y = data_frame['aware']

# split the dataset to training and test sets
train_ratio = 0.90
test_ratio = 0.10
# A fixed seed keeps the split, and every score derived from it, identical across
# re-runs. `test_ratio` is passed directly instead of recomputing it as
# 1 - train_ratio, which is not exactly 0.1 in floating point.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=test_ratio, random_state=42
)

# Logistic Regression

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Candidate solvers; get_best_model passes each one to LogisticReg(solver=...).
solver_params=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# get_best_model returns (model, best parameter, best average score, p-values),
# so despite its name `best_model` holds that whole tuple.
best_model = get_best_model('Logistic', solver_params)
# Index 1 is the winning solver and index 2 its average validation accuracy.
print('Best score: ' , best_model[1],'--->', best_model[2])