In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [2]:
data = pd.read_csv('biddings.csv')
dfo = pd.DataFrame(data)


In [23]:
df = dfo.iloc[:250000, ] 
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,convert
249995,-0.02,2.39,-1.2,-0.69,0.94,0.57,0.19,0.03,-0.41,-0.53,...,0.08,-0.07,-0.31,-1.0,0.09,-0.76,-0.1,0.05,-0.05,0
249996,-0.01,2.03,-1.99,-0.63,0.77,-1.61,0.1,1.64,-0.28,2.22,...,-0.03,-0.02,0.04,0.12,-0.05,0.07,0.06,0.66,-0.18,0
249997,-0.02,2.04,2.11,-0.3,0.14,-0.59,0.31,-0.8,0.11,0.23,...,0.09,-1.03,-0.55,0.05,0.39,0.01,-0.68,0.26,-0.13,0
249998,-0.01,-2.39,2.04,-0.09,-0.22,-0.58,0.16,-1.47,0.37,-0.67,...,0.06,0.92,-0.62,-0.14,-1.04,2.37,-0.44,-0.57,0.12,0
249999,-0.01,0.8,-4.22,-0.21,-0.69,2.71,2.85,-0.51,-0.33,0.08,...,-0.15,0.17,0.67,0.15,2.7,0.23,0.12,1.49,-0.46,0


In [24]:
#Convert column = shown ad was clicked: 1 = clicked ad
df.convert.value_counts()

0    249523
1       477
Name: convert, dtype: int64

### Severely underbalanced

Converted accounts for 1908/999999 = .191% of our data

We will have to oversample or undersample our target. Most likely undersample to minimize our noise

I will use RandomUnderSampler, NearMiss, and EditedNearestNeighbors
to underersample our target here first to get baselines

After doing that the accuracy of models was a bit lower than desirable, I will try a different resampling technique. That is SMOTEEN which combines under/over sampling

In [25]:
from sklearn.model_selection import train_test_split

x = df.drop(['convert'],axis=1) #training features
y = df.convert #target

#Need to set our train/test values before resampling
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                  test_size = .1)

In [37]:
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN

#OVERSAMPLE
X_resampled, y_resampled = ADASYN().fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

[(0, 224578), (1, 224532)]


In [38]:
#First model using our undersampled/oversampled data (SMOTEEN)
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

clf = LinearSVC()
clf.fit(X_resampled, y_resampled) 

print('Training accuracy LinearSVC is: {}'.format(clf.score(X_resampled, y_resampled)))
print('Test accuracy LinearSVC is: {}'.format(clf.score(x_test, y_test)))

from sklearn.metrics import classification_report

y_pred = clf.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

# scores = cross_val_score(clf, X_resampled, y_resampled, cv=5)
# print("Cross Val LinearSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Training accuracy LinearSVC is: 0.6668455389548218
Test accuracy LinearSVC is: 0.65264
              precision    recall  f1-score   support

           0       1.00      0.65      0.79     24945
           1       0.00      0.51      0.01        55

   micro avg       0.65      0.65      0.65     25000
   macro avg       0.50      0.58      0.40     25000
weighted avg       1.00      0.65      0.79     25000



In [32]:
from sklearn import ensemble

params = {'n_estimators': 600,
          'max_features': 'log2',
          'max_depth': 6}

rfc = ensemble.RandomForestClassifier(**params)
rfc.fit(X_resampled, y_resampled)

print('Training accuracy normal RandomForest is: {}'.format(rfc.score(X_resampled, y_resampled)))
print('Test accuracy normal RandomForest is: {}'.format(rfc.score(x_test, y_test)))


y_pred = rfc.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

# scores = cross_val_score(rfc, X_resampled, y_resampled, cv=5)
# print("Cross Val RandomForestClassifier Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Training accuracy normal RandomForest is: 0.8323633659574847
Test accuracy normal RandomForest is: 0.82372
              precision    recall  f1-score   support

           0       1.00      0.82      0.90     24945
           1       0.00      0.35      0.01        55

   micro avg       0.82      0.82      0.82     25000
   macro avg       0.50      0.59      0.46     25000
weighted avg       1.00      0.82      0.90     25000



In [34]:
params = {'n_estimators': 500,
          'max_features': 'sqrt',
          'max_depth': 4}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf1.fit(X_resampled, y_resampled)

print('Training accuracy normal GradientBoosting is: {}'.format(clf1.score(X_resampled, y_resampled)))
print('Test accuracy normal GradientBoosting is: {}'.format(clf1.score(x_test, y_test)))


y_pred = clf1.predict(x_test) #predict y based on x_test
print(classification_report(y_test, y_pred)) #true value vs predicted value

# scores = cross_val_score(clf1, X_resampled, y_resampled, cv=5)
# print("Cross Val GradientBoostingClassifier Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Training accuracy normal GradientBoosting is: 0.9893578177737802
Test accuracy normal GradientBoosting is: 0.97668
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     24945
           1       0.01      0.07      0.01        55

   micro avg       0.98      0.98      0.98     25000
   macro avg       0.50      0.53      0.50     25000
weighted avg       1.00      0.98      0.99     25000



In [36]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [400,500,600],
    'max_features': ['sqrt','log2'],
    'max_depth' : [4,5,6]
}

CV_clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3)
CV_clf.fit(X_resampled, y_resampled)

CV_clf.best_params_

KeyboardInterrupt: 

In [10]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average=None)


array([0.82607164, 0.0026936 ])