In [2]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
data = pd.read_csv('biddings.csv')
dfo = pd.DataFrame(data)
dfo.convert.value_counts()

0    998092
1      1908
Name: convert, dtype: int64

In [57]:
dfo = dfo.sort_values(by=['convert'])
dfo = dfo.reset_index(drop=True)
df1 = dfo.drop(dfo.index[0:997900])

df = df1.reset_index(drop=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,convert
0,-0.0,-4.14,1.88,-0.49,0.63,0.36,-0.04,0.38,-0.26,0.82,...,-0.12,-0.33,0.44,0.3,2.76,0.68,-0.27,-1.27,0.37,0
1,-0.02,1.9,2.59,-0.55,0.85,0.19,-0.14,-0.09,-0.13,-0.31,...,0.1,0.3,-0.34,0.9,-0.12,1.14,-0.22,0.29,-0.09,0
2,-0.01,0.85,2.53,-0.49,0.71,0.32,0.02,-0.83,0.11,-1.67,...,0.57,0.4,-1.75,2.77,-0.29,0.23,-0.17,-0.64,0.2,0
3,-0.01,0.52,2.28,-0.59,0.8,0.24,0.04,0.24,-0.22,0.45,...,0.13,-0.94,-0.4,0.0,-1.36,-0.06,-0.2,-0.67,0.22,0
4,-0.02,1.94,2.71,-0.51,0.81,0.21,-0.19,-0.29,-0.08,-0.71,...,-0.23,-0.29,0.81,-0.34,0.1,-1.44,-0.22,0.09,-0.03,0


In [44]:
#Convert column = shown ad was clicked: 1 = clicked ad
df.convert.value_counts()

1    1908
0     192
Name: convert, dtype: int64

### Severely underbalanced

Converted accounts for 1908/999999 = .191% of our data

We will have to oversample or undersample our target. Most likely undersample to minimize our noise

I will use RandomUnderSampler, NearMiss, and EditedNearestNeighbors
to underersample our target here first to get baselines

After doing that the accuracy of models was a bit lower than desirable, I will try a different resampling technique. That is SMOTEEN which combines under/over sampling

In [45]:
from sklearn.model_selection import train_test_split

x = df.drop(['convert'],axis=1) #training features
y = df.convert #target

#Need to set our train/test values before resampling
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                  test_size = .1)

In [46]:
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN

#Oversample

X_resampled, y_resampled = SMOTE().fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))


[(0, 1723), (1, 1723)]


In [47]:
#First model using our oversampled data (SMOTE)
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

clf = LinearSVC()
clf.fit(X_resampled, y_resampled) 

print('Training accuracy LinearSVC is: {}'.format(clf.score(X_resampled, y_resampled)))
print('Test accuracy LinearSVC is: {}'.format(clf.score(x_test, y_test)))

from sklearn.metrics import classification_report

y_pred = clf.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

# scores = cross_val_score(clf, X_resampled, y_resampled, cv=5)
# print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Training accuracy LinearSVC is: 0.6993615786419036
Test accuracy LinearSVC is: 0.5952380952380952
              precision    recall  f1-score   support

           0       0.17      0.64      0.27        25
           1       0.92      0.59      0.72       185

   micro avg       0.60      0.60      0.60       210
   macro avg       0.55      0.61      0.50       210
weighted avg       0.83      0.60      0.67       210



In [48]:
from sklearn import ensemble

params = {'n_estimators': 600,
          'max_features': 'log2',
          'max_depth': 6}

rfc = ensemble.RandomForestClassifier(**params)
rfc.fit(X_resampled, y_resampled)

print('Training accuracy normal RandomForest is: {}'.format(rfc.score(X_resampled, y_resampled)))
print('Test accuracy normal RandomForest is: {}'.format(rfc.score(x_test, y_test)))


y_pred = rfc.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

# scores = cross_val_score(rfc, X_resampled, y_resampled, cv=5)
# print("Cross Val RandomForestClassifier Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Training accuracy normal RandomForest is: 0.9158444573418456
Test accuracy normal RandomForest is: 0.7095238095238096
              precision    recall  f1-score   support

           0       0.19      0.44      0.27        25
           1       0.91      0.75      0.82       185

   micro avg       0.71      0.71      0.71       210
   macro avg       0.55      0.59      0.54       210
weighted avg       0.82      0.71      0.75       210



In [49]:
params = {'n_estimators': 500,
          'max_features': 'sqrt',
          'max_depth': 4}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X_resampled, y_resampled)

print('Training accuracy normal GradientBoosting is: {}'.format(clf1.score(X_resampled, y_resampled)))
print('Test accuracy normal GradientBoosting is: {}'.format(clf1.score(x_test, y_test)))


y_pred = clf1.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

# scores = cross_val_score(clf1, X_resampled, y_resampled, cv=5)
# print("Cross Val GradientBoostingClassifier Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Training accuracy normal GradientBoosting is: 0.9994196169471852
Test accuracy normal GradientBoosting is: 0.8761904761904762
              precision    recall  f1-score   support

           0       0.43      0.12      0.19        25
           1       0.89      0.98      0.93       185

   micro avg       0.88      0.88      0.88       210
   macro avg       0.66      0.55      0.56       210
weighted avg       0.84      0.88      0.84       210



In [50]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [400,500,600],
    'max_features': ['sqrt','log2'],
    'max_depth' : [4,5,6]
}

CV_clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3)
CV_clf.fit(X_resampled, y_resampled)

CV_clf.best_params_

{'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 400}

In [51]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average=None)


array([0.1875    , 0.93298969])

In [52]:
from sklearn.metrics import balanced_accuracy_score

y_pred = clf.predict(x_test) #predict y based on x_test
balanced_accuracy_score(y_test, y_pred)


0.6145945945945945

In [53]:
#SMOTE beats RandomOverSampling AND ADASYN 

In [54]:
%timeit clf.predict(df.iloc[5:6, 0:88] )

872 µs ± 53.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [55]:
%timeit rfc.predict(df.iloc[5:6, 0:88] )

42.2 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [56]:
%timeit clf1.predict(df.iloc[5:6, 0:88] )

1.14 ms ± 43 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
