# Book 7 - Gradient Boost

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import random
import datetime

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patheffects as path_effects
from matplotlib import cm as cm2
matplotlib.style.use('ggplot') 
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn import linear_model, datasets
from sklearn.grid_search import GridSearchCV
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, ExtraTreesClassifier, BaggingRegressor, \
    ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score \
#     confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score
from sklearn.cross_validation import train_test_split, KFold, cross_val_score


matplotlib.style.use('ggplot') 
%matplotlib inline



# Bring in X & T train & test

In [2]:
with open('X_train.pkl', 'r') as picklefile:
    X_train = pickle.load(picklefile)
    
with open('X_test.pkl', 'r') as picklefile:
    X_test = pickle.load(picklefile)
    
with open('Y_train.pkl', 'r') as picklefile:
    Y_train = pickle.load(picklefile)
    
with open('Y_test.pkl', 'r') as picklefile:
    Y_test = pickle.load(picklefile)

In [3]:
Y_test = np.transpose(Y_test.values)[0]
Y_train = np.transpose(Y_train.values)[0]
type(Y_test)

numpy.ndarray

In [4]:
with open('Disp_train.pkl', 'r') as picklefile:
    Disp_train = pickle.load(picklefile)
    
with open('Disp_test.pkl', 'r') as picklefile:
    Disp_test = pickle.load(picklefile)

In [5]:
print('X_train length: {0:,}'.format(len(X_train)))
print('X_test  length:  {0:,}'.format(len(X_test)))

X_train length: 23,179
X_test  length:  2,396


In [6]:
X = X_train.append(X_test)
print('Combined length: {0:,}'.format(len(X)))

Combined length: 25,575


In [7]:
X.columns

Index([u'pltfrm', u'state', u'cmpgn_id', u'adv_id', u'period', u'doc_topic',
       u'ad_topic'],
      dtype='object')

In [8]:
start = datetime.datetime.now()

X = pd.get_dummies(X, prefix=['pltfrm', 'state', 'cmpgn_id', 'adv_id',\
                                          'period','doc_topic', 'ad_topic'])

X.drop(['pltfrm_3','state_AK','cmpgn_id_32840','adv_id_1890','period_morning',
       'doc_topic_17.0','ad_topic_91.0'])
finish = datetime.datetime.now()
print(finish - start)

0:00:07.460484


In [9]:
X_train = X[:23179]
X_test = X[23179:]

In [10]:
print('X_train length: {0:,}'.format(len(X_train)))
print('X_test  length:  {0:,}'.format(len(X_test)))

X_train length: 23,179
X_test  length:  2,396


# Gradient Boost

Gradient Boost default settings
* correct predictions: 155
* Precent correct: 33.9%

Gradient Boost (learning rate = 0.2, n_estimators = 100 - precision gridsearch)
* correct predictions: 171
* Precent correct: 37.4%


In [11]:
start = datetime.datetime.now()

gbc = GradientBoostingClassifier()

gbc.fit(X_train,Y_train)

finish = datetime.datetime.now()
print(finish - start)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

# <font color='blue'> Post Processing Steps

In [12]:
# get probabilities of clicked/not clicked
gbc_proba = gbc.predict_proba(X_test)
gbc_Y_proba = pd.DataFrame(gbc_proba,columns = ['p_no_click','p_click'])

In [13]:
ppp = pd.concat([Disp_test, gbc_Y_proba], axis=1)
# post processing of predictor

## Reduce to 1-click per display id

In [14]:
checker = ppp
checker.head(3)

Unnamed: 0,display_id,p_no_click,p_click
0,65537,0.773755,0.226245
1,65537,0.799616,0.200384
2,65537,0.773755,0.226245


In [15]:
checker['Y_test'] = pd.Series(Y_test)
checker['prediction'] = [0 for _ in range(len(checker))] # puts zeros in place
checker.head()

Unnamed: 0,display_id,p_no_click,p_click,Y_test,prediction
0,65537,0.773755,0.226245,0,0
1,65537,0.799616,0.200384,0,0
2,65537,0.773755,0.226245,0,0
3,65537,0.773755,0.226245,1,0
4,66286,0.773755,0.226245,1,0


In [16]:
# List of unique display_id's, with each display id corresponding to an event
pages = checker['display_id'].unique()
len(pages)

457

In [17]:
# system to use probabilities to choose which page is clicked
# compensates for zero probability across the board, or muliple matching high probabilities

for d_num in pages:  
    indx = 0 # index that becomes click
    disp_list = checker[checker['display_id'] == d_num].index.tolist()

    temp = pd.DataFrame(ppp[ppp['display_id'] == d_num])

    p_max = temp['p_click'].max()

    if p_max == 0:
        indx = random.choice(disp_list) # randomly picks from all matched values
    else:
        counter = 0
        short_list =[]
        for index, row in temp.iterrows():
            if row['p_click'] == p_max:
                counter += 1
                short_list.append(index)
        indx = random.choice(short_list) # randomly picks from all matched values
    checker.set_value(indx, 'prediction', 1)


In [18]:
sum(checker['Y_test'])

457

In [19]:
sum(checker['prediction'])

457

In [20]:
count = 0
for index, row in checker.iterrows():
    if row['prediction'] == 1 and row['Y_test'] == 1:
        count +=1
print('Clicks correctly predicted: {0:,}'.format(count))
print('Percent correctly predicted: {0:%}'.format(float(count)/sum(checker['Y_test'])))

Clicks correctly predicted: 171
Percent correctly predicted: 37.417943%


# <font color='purple'> Gridsearch goes below: <font>

In [None]:
PARAMETERS = {"learning_rate":[0.1, 0.2, 0.4], "n_estimators": [50,100,500,1000]}
SCORING = "precision"

start = datetime.datetime.now()

grid = GridSearchCV(gbc, scoring = SCORING, param_grid = PARAMETERS)
grid.fit(X_train,Y_train)

finish = datetime.datetime.now()
print(finish - start)

In [None]:
grid.best_estimator_
