In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [7]:
data = pd.read_csv('biddings.csv')
dfo = pd.DataFrame(data)


In [8]:
df = dfo
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,convert
999995,-0.01,-2.42,2.0,-0.5,0.55,0.21,0.11,0.15,-0.15,0.77,...,0.27,0.35,-1.18,-1.51,-0.0,-0.97,-0.0,-0.44,0.13,0
999996,-0.02,2.84,-3.7,-0.15,-0.67,2.65,2.59,-0.85,-0.21,-0.68,...,-0.1,0.88,0.5,-0.05,-0.89,-0.04,0.17,0.12,-0.01,0
999997,-0.01,-0.29,-1.9,-0.81,1.02,0.42,0.13,0.79,-0.58,0.6,...,0.03,-1.04,-0.35,0.17,0.36,0.08,-0.1,-0.05,0.01,0
999998,-0.02,2.07,2.05,0.04,0.84,0.11,-0.19,0.35,-0.17,-0.07,...,0.58,0.42,-0.69,-0.47,0.06,-0.2,-0.73,0.24,-0.07,0
999999,-0.01,2.46,-0.13,10.56,-0.05,-1.03,-0.53,-1.89,-3.49,0.34,...,0.04,-0.53,0.84,0.19,-0.08,0.04,-0.02,0.14,-0.25,0


In [9]:
#Convert column = shown ad was clicked: 1 = clicked ad
df.convert.value_counts()

0    998092
1      1908
Name: convert, dtype: int64

### Severely underbalanced

Converted accounts for 1908/999999 = .191% of our data

We will have to oversample or undersample our target. Most likely undersample to minimize our noise

I will use RandomUnderSampler, NearMiss, and EditedNearestNeighbors
to underersample our target here first to get baselines

In [42]:
from sklearn.model_selection import train_test_split

x = df.drop(['convert'],axis=1) #training features
y = df.convert #target

#Need to set our train/test values before undersampling
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                  test_size = .20)

In [43]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

#perform our undersampling
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

[(0, 1521), (1, 1521)]


In [44]:
#First model using our undersampled data (RandomUnderSampler)
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

clf = LinearSVC()
clf.fit(X_resampled, y_resampled) 

print('Training accuracy LinearSVC is: {}'.format(clf.score(X_resampled, y_resampled)))
print('Test accuracy LinearSVC is: {}'.format(clf.score(x_test, y_test)))

scores = cross_val_score(clf, X_resampled, y_resampled, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


y_pred = clf.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

Training accuracy LinearSVC is: 0.6607495069033531
Test accuracy LinearSVC is: 0.626715
Cross Val LinearSVC Accuracy: 0.63 (+/- 0.06)
              precision    recall  f1-score   support

           0       1.00      0.63      0.77    199613
           1       0.00      0.62      0.01       387

   micro avg       0.63      0.63      0.63    200000
   macro avg       0.50      0.62      0.39    200000
weighted avg       1.00      0.63      0.77    200000



In [45]:
from sklearn import ensemble

params = {'n_estimators': 750,
          'max_features': 'log2',
          'max_depth': 6}

rfc = ensemble.RandomForestClassifier(**params)
rfc.fit(X_resampled, y_resampled)

print('Training accuracy normal RandomForest is: {}'.format(rfc.score(X_resampled, y_resampled)))
print('Test accuracy normal RandomForest is: {}'.format(rfc.score(x_test, y_test)))

scores = cross_val_score(rfc, X_resampled, y_resampled, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

y_pred = rfc.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

Training accuracy normal RandomForest is: 0.7478632478632479
Test accuracy normal RandomForest is: 0.630045
Cross Val LinearSVC Accuracy: 0.63 (+/- 0.06)
              precision    recall  f1-score   support

           0       1.00      0.63      0.77    199613
           1       0.00      0.63      0.01       387

   micro avg       0.63      0.63      0.63    200000
   macro avg       0.50      0.63      0.39    200000
weighted avg       1.00      0.63      0.77    200000



In [46]:
params = {'n_estimators': 750,
          'max_features': 'log2',
          'max_depth': 6}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X_resampled, y_resampled)

print('Training accuracy normal GradientBoosting is: {}'.format(clf1.score(X_resampled, y_resampled)))
print('Test accuracy normal GradientBoosting is: {}'.format(clf1.score(x_test, y_test)))

y_pred = clf1.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

Training accuracy normal GradientBoosting is: 0.9927679158448389
Test accuracy normal GradientBoosting is: 0.603255
              precision    recall  f1-score   support

           0       1.00      0.60      0.75    199613
           1       0.00      0.60      0.01       387

   micro avg       0.60      0.60      0.60    200000
   macro avg       0.50      0.60      0.38    200000
weighted avg       1.00      0.60      0.75    200000



In [47]:
scores = cross_val_score(clf1, X_resampled, y_resampled, cv=5)
print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Val LinearSVC Accuracy: 0.60 (+/- 0.06)


In [33]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [750],
    'max_features': ['log2'],
    'max_depth' : [4,5,6]
}

CV_clf = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 6)
CV_clf.fit(X_resampled, y_resampled)

CV_clf.best_params_

{'max_depth': 6, 'max_features': 'log2', 'n_estimators': 750}

In [41]:
from sklearn.metrics import classification_report

y_pred = clf.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

              precision    recall  f1-score   support

           0       1.00      0.65      0.79     99816
           1       0.00      0.60      0.01       184

   micro avg       0.65      0.65      0.65    100000
   macro avg       0.50      0.62      0.40    100000
weighted avg       1.00      0.65      0.78    100000



In [48]:
from sklearn.metrics import balanced_accuracy_score

y_pred = clf.predict(x_test) #predict y based on x_test
balanced_accuracy_score(y_test, y_pred)

0.6234413784471402

In [51]:
%timeit clf.predict(df.iloc[5:6, 0:88] )

1.19 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
