In [97]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
client = bigquery.Client()
from Predicting_Purchase_Intention.utils.get_data import get_raw_data
from Predicting_Purchase_Intention.utils.clean_data import drop_cols, clean_data
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


In [98]:
test_df = pd.read_csv('../raw_data/test_3days_dataset.csv')
#Use path specified on your branch

## Basic Preprocessing(To be expanded)

In [99]:
col_drop = []
for num in test_df.columns:
    if (test_df[num].isnull().sum()/len(test_df)*100) > 95:
        col_drop.append(num)

test_df = test_df.drop(columns =col_drop)
#drop columns with more than 95% missing values

list_y = list(set(test_df['target_variable'].values)) #unique values in the target variable
list_y.remove(0)
#dependent on what we eventually agree as the main class for the precision target
test_df['target_variable'] = test_df['target_variable'].replace(to_replace=list_y, value = -1) 
test_df['target_variable'] = test_df['target_variable'].replace(to_replace=0, value = 1)
test_df['target_variable'] = test_df['target_variable'].replace(to_replace=-1, value = 0)

In [109]:
test_df['target_variable'].value_counts(normalize = False)

1    9730
0      97
Name: target_variable, dtype: int64

❓ **Question (Imbalance)** ❓
Horrible imbalance in the dataset. Hence the need for a very high precision score at the micro level

In [122]:
X = test_df.drop(columns = ['target_variable', 'user_pseudo_id', 'Unnamed: 0'])
y = test_df['target_variable']

#basic Pipeline, can be improved once we insert categorical features 
preproc_baseline = make_pipeline(
    SimpleImputer(),
    MinMaxScaler())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Modeling

❓ **Question (best models)** ❓

Attempting various models for the baseline: LinearSVC, KNC and Decision Tree Classifier. Based on our goals, we wants a very high score so that we can predict the a good base of customers who do not consummate their transactions and then target them with strategic advertisements. 

In [123]:
model1 = LinearSVC()
pipe_baseline = make_pipeline(preproc_baseline, model1)
pipe_baseline.fit(X_train,y_train)
scores = cross_validate(pipe_baseline, X_test, y_test, cv=5, scoring=['precision','f1'])


scores['test_f1'].mean(), scores['test_precision'].mean()

(0.9964173866916933, 0.9942176277446346)

In [124]:
model2 = KNeighborsClassifier(n_neighbors=5)
pipe_baseline = make_pipeline(preproc_baseline, model2)
pipe_baseline.fit(X_train,y_train)
scores = cross_validate(pipe_baseline, X_test, y_test, cv=5, scoring=['precision','f1'])


scores['test_f1'].mean(), scores['test_precision'].mean()

(0.9959102436864604, 0.9928680717612369)

In [125]:
model3 = SVC()
pipe_baseline = make_pipeline(preproc_baseline, model3)
pipe_baseline.fit(X_train,y_train)
scores = cross_validate(pipe_baseline, X_test, y_test, cv=5, scoring=['precision','f1'])


scores['test_f1'].mean(), scores['test_precision'].mean()

(0.9960851053966839, 0.9922010877384825)

In [126]:
model4 = DecisionTreeClassifier()

pipe_baseline = make_pipeline(preproc_baseline, model4)
pipe_baseline.fit(X_train,y_train)
scores = cross_validate(pipe_baseline, X_test, y_test, cv=5, scoring=['precision','f1'])

scores['test_f1'].mean(), scores['test_precision'].mean()



(0.9926338134040676, 0.9941795881791192)

❓ **Question (what to keep)** ❓
Let's expand on Linear SVC, KNC and DecisionTreeCLassifiers based on their better Precision Scores.
We will do a grid search for each and tweak the parameters. Afterwards, we can modify with XGboost


## Grid Search

In [130]:
# model5 = KNeighborsClassifier()
# k_grid = {'n_neighbors' : [1,5,10,20,50]}
# grid = GridSearchCV(model5, k_grid, n_jobs=-1,  cv = 5)
# grid.fit(X_train, y_train)
# Did Not Work

In [138]:

# model6 = LinearSVC()
# SVCpipe = make_pipeline(preproc_baseline, model6)
# param_grid = {'steps':np.arange(0,1)}
# grid = GridSearchCV(SVCpipe,param_grid,cv=5,return_train_score=True)

# grid.fit(X_train, y_train)


