In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn #remove unnecessary warnings

In [3]:
data = pd.read_csv('biddings.csv')
dfo = pd.DataFrame(data)


### Severely underbalanced
Converted accounts for 1908/999999 = .191% of our data

We will have to oversample or undersample our target. Most likely undersample to minimize noise

In [9]:
dfo = dfo.sort_values(by=['convert'])
dfo = dfo.reset_index(drop=True)
df1 = dfo.drop(dfo.index[0:996800])


In [41]:
df = df1.reset_index(drop=True) #reset index

#Convert column = shown ad was clicked: 1 = clicked ad
print(df.convert.value_counts())

df.tail()

1    1908
0    1292
Name: convert, dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,convert
3195,-0.02,2.33,-3.71,-0.1,-0.94,0.4,-7.08,-1.26,0.82,-0.46,...,-0.09,0.7,0.31,0.02,0.49,0.06,0.09,0.45,-0.12,1
3196,0.01,1.88,-1.23,-0.7,0.98,0.22,-1.39,0.05,-0.16,-0.58,...,0.15,0.22,-0.64,-0.1,-0.03,0.35,-0.02,0.56,-0.17,1
3197,-0.01,-1.18,1.99,-0.63,0.79,0.28,0.13,0.5,-0.32,0.96,...,-0.06,-1.41,0.24,0.83,-0.1,0.47,-0.17,0.42,-0.13,1
3198,0.09,-8.92,-4.29,0.38,-1.54,-1.63,2.09,-2.65,2.59,-4.74,...,0.02,0.13,-0.07,0.44,-0.01,0.05,0.09,-0.27,0.03,1
3199,-0.01,-0.76,-4.4,-0.19,-0.76,2.45,0.92,-0.5,-0.18,0.01,...,0.01,-1.22,0.24,0.22,0.31,0.1,0.13,-0.17,0.08,1


In [36]:
from sklearn.model_selection import train_test_split

x = df.drop(['convert'],axis=1) #training features
y = df.convert #target

#Set our train/test values
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                  test_size = .10)
#Test sample
y_test.value_counts()

1    194
0    126
Name: convert, dtype: int64

In [55]:
#First model using our undersampled data
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

clf = LinearSVC(random_state=42)
clf.fit(x_train, y_train) 

print('Training accuracy LinearSVC is: %0.2f' % (clf.score(x_train, y_train)))
print('Test accuracy LinearSVC is: %0.2f' % (clf.score(x_test, y_test)))

scores = cross_val_score(clf, x_train, y_train, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)\n" % (scores.mean(), scores.std() * 2))

y_pred = clf.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

print('\nThis LinearSVC runs at a speed of:')
%timeit clf.predict(df.iloc[5:6, :88] )

Training accuracy LinearSVC is: 0.67
Test accuracy LinearSVC is: 0.67
Cross Val LinearSVC Accuracy: 0.65 (+/- 0.04)

              precision    recall  f1-score   support

           0       0.60      0.44      0.51       126
           1       0.69      0.81      0.75       194

   micro avg       0.67      0.67      0.67       320
   macro avg       0.65      0.63      0.63       320
weighted avg       0.66      0.67      0.65       320


This LinearSVC runs at a speed of:
980 µs ± 80.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [53]:
from sklearn.metrics import f1_score

#Show f1 score 
print("LinearSVC F1 Score: %0.2f \n" % f1_score(y_test, y_pred))


LinearSVC F1 Score: 0.75 



In [32]:
from sklearn import ensemble

params = {'n_estimators': 100,
          'max_features': 'sqrt',
          'max_depth': 6}

rfc = ensemble.RandomForestClassifier(**params)
rfc.fit(x_train, y_train)

print('Training accuracy normal RandomForest is: {}'.format(rfc.score(x_train, y_train)))
print('Test accuracy normal RandomForest is: {}'.format(rfc.score(x_test, y_test)))

scores = cross_val_score(rfc, x_train, y_train, cv=5)
print("Cross Val RandomForest Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

y_pred = rfc.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value
print('This RandomForest runs at a speed of:')
%timeit rfc.predict(df.iloc[5:6, :88] )

Training accuracy normal RandomForest is: 0.7520833333333333
Test accuracy normal RandomForest is: 0.65625
Cross Val RandomForest Accuracy: 0.65 (+/- 0.02)
              precision    recall  f1-score   support

           0       0.59      0.38      0.46       124
           1       0.68      0.83      0.75       196

   micro avg       0.66      0.66      0.66       320
   macro avg       0.63      0.61      0.60       320
weighted avg       0.64      0.66      0.64       320

This RandomForest runs at a speed of:
9.06 ms ± 232 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [35]:
params = {'n_estimators': 250,
          'max_features': 'sqrt',
          'max_depth': 8}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(x_train, y_train)

print('Training accuracy normal GradientBoosting is: {}'.format(clf1.score(x_train, y_train)))
print('Test accuracy normal GradientBoosting is: {}'.format(clf1.score(x_test, y_test)))

scores = cross_val_score(clf1, x_train, y_train, cv=5)
print("CrossVal GradientBoosting Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(),scores.std() * 2))

y_pred = clf1.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value
print('This GradientBoosting model runs at a speed of:')
%timeit clf1.predict(df.iloc[5:6, :88] )

Training accuracy normal GradientBoosting is: 0.9923611111111111
Test accuracy normal GradientBoosting is: 0.65
CrossVal GradientBoosting Accuracy: 0.64 (+/- 0.02)
              precision    recall  f1-score   support

           0       0.56      0.45      0.50       124
           1       0.69      0.78      0.73       196

   micro avg       0.65      0.65      0.65       320
   macro avg       0.63      0.61      0.62       320
weighted avg       0.64      0.65      0.64       320

This GradientBoosting model runs at a speed of:
1.28 ms ± 41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [34]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression().fit(x_train, y_train)

print('Training accuracy Logistic is: {}'.format(log.score(x_train, y_train)))
print('Test accuracy Logistic is: {}'.format(log.score(x_test, y_test)))

scores = cross_val_score(log, x_train, y_train, cv=5)
print("Cross Val Logistic Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

y_pred = log.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value
print('This LogisticRegression runs at a speed of:')
%timeit log.predict(df.iloc[5:6, :88] )

Training accuracy Logistic is: 0.6729166666666667
Test accuracy Logistic is: 0.65
Cross Val Logistic Accuracy: 0.65 (+/- 0.04)
              precision    recall  f1-score   support

           0       0.56      0.44      0.50       124
           1       0.69      0.78      0.73       196

   micro avg       0.65      0.65      0.65       320
   macro avg       0.63      0.61      0.61       320
weighted avg       0.64      0.65      0.64       320

This LogisticRegression runs at a speed of:
1.12 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [43]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [100, 250, 500],
    'max_features': ['sqrt'],
    'max_depth' : [5,6,7,8]
}

CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 4)
CV_clf.fit(x_train, y_train)

CV_clf.best_params_

ValueError: Invalid parameter max_depth for estimator LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0). Check the list of available parameters with `estimator.get_params().keys()`.