In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
data = pd.read_csv('biddings.csv')
dfo = pd.DataFrame(data)


In [147]:
dfo = dfo.sort_values(by=['convert'])
dfo = dfo.reset_index(drop=True)
df1 = dfo.drop(dfo.index[0:997500])

In [148]:
df = df1.reset_index(drop=True)
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,convert
2495,0.02,-0.95,-1.8,-0.63,0.71,0.55,-0.88,-2.01,0.48,-4.59,...,0.3,0.37,-1.3,0.37,-0.21,0.54,-0.1,-1.66,0.51,1
2496,-0.01,0.08,2.2,-0.56,0.77,0.28,0.04,-0.28,-0.09,-0.71,...,-0.09,2.02,0.64,1.86,-0.88,-1.77,-0.19,-0.72,0.22,1
2497,-0.01,-1.18,1.99,-0.63,0.79,0.28,0.13,0.5,-0.32,0.96,...,-0.06,-1.41,0.24,0.83,-0.1,0.47,-0.17,0.42,-0.13,1
2498,0.01,-6.02,1.09,-0.6,0.56,0.45,0.34,0.63,-0.34,1.45,...,-0.22,1.32,0.47,-0.68,1.06,0.13,-0.28,0.73,-0.23,1
2499,-0.01,-0.76,-4.4,-0.19,-0.76,2.45,0.92,-0.5,-0.18,0.01,...,0.01,-1.22,0.24,0.22,0.31,0.1,0.13,-0.17,0.08,1


In [149]:
#Convert column = shown ad was clicked: 1 = clicked ad
df.convert.value_counts()

1    1908
0     592
Name: convert, dtype: int64

### Severely underbalanced

Converted accounts for 1908/999999 = .191% of our data

We will have to oversample or undersample our target. Most likely undersample to minimize our noise

I will use RandomUnderSampler, NearMiss, and EditedNearestNeighbors
to underersample our target here first to get baselines

In [150]:
from sklearn.model_selection import train_test_split

x = df.drop(['convert'],axis=1) #training features
y = df.convert #target

#Need to set our train/test values before undersampling
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                  test_size = .10)

In [151]:
from collections import Counter
from imblearn.ensemble import RUSBoostClassifier

rus = RUSBoostClassifier()
rus.fit(x_train, y_train)


RUSBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None,
          replacement=False, sampling_strategy='auto')

In [152]:
from sklearn.metrics import classification_report

print('Training accuracy LinearSVC is: {}'.format(rus.score(x_train, y_train)))
print('Test accuracy LinearSVC is: {}'.format(rus.score(x_test, y_test)))

Training accuracy LinearSVC is: 0.9644444444444444
Test accuracy LinearSVC is: 0.672


In [153]:
y_pred = rus.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

              precision    recall  f1-score   support

           0       0.41      0.29      0.34        73
           1       0.74      0.83      0.78       177

   micro avg       0.67      0.67      0.67       250
   macro avg       0.58      0.56      0.56       250
weighted avg       0.64      0.67      0.65       250



In [155]:
#First model using our undersampled data (RandomUnderSampler)
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

clf = LinearSVC()
clf.fit(x_train, y_train) 

print('Training accuracy LinearSVC is: {}'.format(clf.score(x_train, y_train)))
print('Test accuracy LinearSVC is: {}'.format(clf.score(x_test, y_test)))

scores = cross_val_score(clf, x_train, y_train, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


y_pred = clf.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

Training accuracy LinearSVC is: 0.7786666666666666
Test accuracy LinearSVC is: 0.712
Cross Val LinearSVC Accuracy: 0.77 (+/- 0.02)
              precision    recall  f1-score   support

           0       0.53      0.11      0.18        73
           1       0.72      0.96      0.83       177

   micro avg       0.71      0.71      0.71       250
   macro avg       0.63      0.54      0.50       250
weighted avg       0.67      0.71      0.64       250



In [158]:
from sklearn import ensemble

params = {'n_estimators': 750,
          'max_features': 'log2',
          'max_depth': 7}

rfc = ensemble.RandomForestClassifier(**params)
rfc.fit(x_train, y_train)

print('Training accuracy normal RandomForest is: {}'.format(rfc.score(x_train, y_train)))
print('Test accuracy normal RandomForest is: {}'.format(rfc.score(x_test, y_test)))

scores = cross_val_score(rfc, x_train, y_train, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

y_pred = rfc.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

Training accuracy normal RandomForest is: 0.7902222222222223
Test accuracy normal RandomForest is: 0.712
Cross Val LinearSVC Accuracy: 0.77 (+/- 0.01)
              precision    recall  f1-score   support

           0       0.67      0.03      0.05        73
           1       0.71      0.99      0.83       177

   micro avg       0.71      0.71      0.71       250
   macro avg       0.69      0.51      0.44       250
weighted avg       0.70      0.71      0.60       250



In [157]:
params = {'n_estimators': 750,
          'max_features': 'log2',
          'max_depth': 6}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(x_train, y_train)

print('Training accuracy normal GradientBoosting is: {}'.format(clf1.score(x_train, y_train)))
print('Test accuracy normal GradientBoosting is: {}'.format(clf1.score(x_test, y_test)))

y_pred = clf1.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

Training accuracy normal GradientBoosting is: 0.9942222222222222
Test accuracy normal GradientBoosting is: 0.684
              precision    recall  f1-score   support

           0       0.31      0.07      0.11        73
           1       0.71      0.94      0.81       177

   micro avg       0.68      0.68      0.68       250
   macro avg       0.51      0.50      0.46       250
weighted avg       0.59      0.68      0.60       250



In [22]:
scores = cross_val_score(clf1, X_resampled, y_resampled, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Val LinearSVC Accuracy: 0.94 (+/- 0.09)


In [33]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [750],
    'max_features': ['log2'],
    'max_depth' : [4,5,6]
}

CV_clf = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 6)
CV_clf.fit(X_resampled, y_resampled)

CV_clf.best_params_

{'max_depth': 6, 'max_features': 'log2', 'n_estimators': 750}

In [85]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score

y_pred = clf.predict(x_test) #predict y based on x_test
balanced_accuracy_score(y_test, y_pred)

0.5047423244557656

In [86]:
f1_score(y_test, y_pred)

0.00393927619149706

In [159]:
%timeit rfc.predict(df.iloc[5:6, 0:88] )

62.4 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


366