# Step 1: Extremely Randomized Forest #

In [3]:
import pandas as pd
import numpy as np
import os
import xlrd
import urllib

# Set your path here
cwd = os.getcwd()
print cwd
os.chdir(cwd)

# Read the data
target = 'default payment next month'
data = pd.read_excel('defaultcredit.xls', skiprows = 1)

# Define the target variable and features
target = 'default payment next month'
y = np.asarray(data[target])
features = data.columns.drop(['ID', target])
X = np.asarray(data[features])

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split

# ExtraTreesClassifier: splitting the data and fitting on training data
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.30, 
                                                    random_state=101)

clf = ExtraTreesClassifier(n_estimators=500, 
                           random_state=101)

clf.fit(X_train,y_train)
scores = cross_val_score(clf, 
                         X_train, 
                         y_train, 
                         cv=3,
                         scoring='accuracy', 
                         n_jobs=-1)

print "ExtraTreesClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))


# Make predictions on the test set
y_pred=clf.predict(X_test)
from sklearn.metrics import confusion_matrix

'''
a is the number of correct predictions that an instance is negative, (TN)
b is the number of incorrect predictions that an instance is positive, (FP)
c is the number of incorrect of predictions that an instance negative, (FN)
d is the number of correct predictions that an instance is positive. (TP)

a b 
c d
'''
confusionMatrix = confusion_matrix(y_test, y_pred)
print confusionMatrix

# Get the accuracy score on the test set
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

/Users/adarshnair/Desktop/PyML_at_scale/Classification_Regression_at_scale
ExtraTreesClassifier -> cross validation accuracy: mean = 0.812 std = 0.003
[[6610  448]
 [1238  704]]


0.81266666666666665

## Step 1.1: Fast parameter optimization with randomized search ##

In [4]:
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

# Create parameter distribution
param_dist = {"max_depth": [1,3, 7,8,12,None],
             "max_features": [8,9,10,11,16,22],
             "min_samples_split": [8,10,11,14,16,19],
             "min_samples_leaf": [1,2,3,4,5,6,7],
             "bootstrap": [True, False]}

# We use only 25 random parameter 
# valuations but we manage to keep training times in check.
rsearch = RandomizedSearchCV(clf, 
                             param_distributions = param_dist,
                             n_iter=25)  

# Fit the data
rsearch.fit(X_train,y_train)
rsearch.grid_scores_

# Choose the best estimator
bestclf = rsearch.best_estimator_
print bestclf

# Make predictions using the best estimator
y_pred = bestclf.predict(X_test)


# Get performance metrics
confusionMatrix = confusion_matrix(y_test, y_pred)
print confusionMatrix 

accuracy=accuracy_score(y_test, y_pred)
print accuracy

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=12, max_features=22, max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=101, verbose=0, warm_start=False)
[[6718  340]
 [1245  697]]
0.823888888889


We have increased our accuracy by using RandomizedSearchCV.

# Step 2: Using subsampling to fit an Extremely Randomized Forest on large datasets #

## Step 2.1: Fetch the data ##

In [5]:
from sklearn.datasets import fetch_covtype
import numpy as np
from sklearn.cross_validation import train_test_split
import os

# Set current path
cwd = os.getcwd()
print cwd
os.chdir(cwd)

dataset = fetch_covtype(random_state=111, shuffle=True)
dataset = fetch_covtype()

# Set the feature and target variables
X, y = dataset.data, dataset.target

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=0)
del(X,y)
covtrain = np.c_[X_train,y_train]
covtest = np.c_[X_test,y_test]
np.savetxt('covtrain.csv', covtrain, delimiter=",")
np.savetxt('covtest.csv', covtest, delimiter=",")

/Users/adarshnair/Desktop/PyML_at_scale/Classification_Regression_at_scale


## Step 2.2: Split the data into 3 batches ##

In [None]:
$ subsample --reservoir -n 10000 covtrain.csv > cov1.csv
$ subsample --reservoir -n 10000 covtrain.csv > cov2.csv
$ subsample --reservoir -n 10000 covtrain.csv > cov3.csv

## Step 2.3: Load each subsample and fit ##

In [8]:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
import pandas as pd
import os

#here we load sample 1 - 100 trees
df = pd.read_csv('cov1.csv')
y=df[df.columns[54]]
X=df[df.columns[0:54]]

clf1=ExtraTreesClassifier(n_estimators=100, 
                          random_state=101,
                          warm_start=True)
clf1.fit(X,y)
scores = cross_val_score(clf1, 
                         X, 
                         y, 
                         cv=3,
                         scoring='accuracy', 
                         n_jobs=-1)
print "ExtraTreesClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
print scores
print 'amount of trees in the model: %s' % len(clf1.estimators_)

ExtraTreesClassifier -> cross validation accuracy: mean = 0.802 std = 0.006
[ 0.80665468  0.79351741  0.80576403]
amount of trees in the model: 100


In [9]:
# sample 2 - 150 trees
df = pd.read_csv('cov2.csv')
y=df[df.columns[54]]
X=df[df.columns[0:54]]

clf1.set_params(n_estimators=150, 
                random_state=101,
                warm_start=True)
clf1.fit(X,y)
scores = cross_val_score(clf1, 
                         X, 
                         y, 
                         cv=3,
                         scoring='accuracy', 
                         n_jobs=-1)
print "ExtraTreesClassifier after params -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
print scores
print 'amount of trees in the model: %s' % len(clf1.estimators_)

ExtraTreesClassifier after params -> cross validation accuracy: mean = 0.798 std = 0.001
[ 0.79736211  0.80042017  0.79735815]
amount of trees in the model: 150


In [10]:
# sample 3
df = pd.read_csv('cov3.csv')
y=df[df.columns[54]]
X=df[df.columns[0:54]]

clf1.set_params(n_estimators=200, 
                random_state=101,
                warm_start=True)
clf1.fit(X,y)
scores = cross_val_score(clf1, 
                         X, 
                         y, 
                         cv=3,
                         scoring='accuracy', 
                         n_jobs=-1)
print "ExtraTreesClassifier after params -> cross validation accuracy: mean = %0.3f std = %0.3f" % (np.mean(scores), np.std(scores))
print scores
print 'amount of trees in the model: %s' % len(clf1.estimators_)

ExtraTreesClassifier after params -> cross validation accuracy: mean = 0.811 std = 0.001
[ 0.81000899  0.81332533  0.81111111]
amount of trees in the model: 200


In [11]:
# Now let’s predict our combined model on the test set and check our score.
df = pd.read_csv('covtest.csv')
X = df[df.columns[0:54]]
y = df[df.columns[54]]

pred2 = clf1.predict(X)
scores = cross_val_score(clf1, 
                         X, 
                         y, 
                         cv=3,
                         scoring='accuracy', 
                         n_jobs=-1)
print "final test score %r" % np.mean(scores)

final test score 0.92185447181058278


# Step 3: Gradient Boosting Classifier #

## Step 3.1: GBM without warm_start ##

In [12]:
import pandas
import urllib2
import urllib2
from sklearn import ensemble

# Grab data
columnNames1_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names'
columnNames1 = [
   line.strip().split(':')[0]
   for line in urllib2.urlopen(columnNames1_url).readlines()[33:]]

# Change column names
columnNames1
n = 0
for i in columnNames1:
   columnNames1[n] = i.replace('word_freq_','')
   n += 1
print columnNames1

# Data preprocessing
spamdata = pandas.read_csv(
   'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',
   header=None, names=(columnNames1 + ['spam'])
)

# Set feature and target variables
# All rows, all eclumns except column 58
X = spamdata.values[:,:57]
y = spamdata['spam']

spamdata.head()

['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet', 'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total']


Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [22]:
import numpy as np
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import train_test_split
from sklearn.metrics  import recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=22)

# Define the classifier
clf = ensemble.GradientBoostingClassifier(n_estimators = 300,
                                          random_state= 222,
                                          max_depth = 16,
                                          learning_rate= .1,
                                          subsample= .5)
# Fit the classifier on the training data
scores= clf.fit(X_train,y_train)

# Find scores for all the cross validated combinations
scores2 = cross_val_score(clf, 
                          X_train, 
                          y_train, 
                          cv = 3, 
                          scoring = 'accuracy',
                          n_jobs = -1)

print "Scores mean value: ", scores2.mean()

# Make predictions on the test set
y_pred = cross_val_predict(clf, 
                           X_test, 
                           y_test, 
                           cv = 10)


confusionMatrix = confusion_matrix(y_test, y_pred)
print confusionMatrix

print "Accuracy score: ", accuracy_score(y_test, y_pred)

# Show feature relevance
print "Show feature relevance: ", clf.feature_importances_


def featureImp_order(clf, X, k=5):
    """Return the columns of X that belong to the k most important features.

    The columns are ordered from most to least important according to
    ``clf.feature_importances_``; every row of X is kept.
    """
    ascending = clf.feature_importances_.argsort()
    top_k = ascending[::-1][:k]
    return X[:, top_k]
newX = featureImp_order(clf,X,2)

print "Show feature relevance in order:", newX

# let's order the features in amount of importance
import pprint
pprint.pprint(sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), columnNames1),
            reverse=True))

Scores mean value:  0.945030177548
[[799  36]
 [ 63 483]]
Accuracy score:  0.928312816799
Show feature relevance:  [  3.92048783e-03   3.60027473e-03   9.74413716e-03   2.65435429e-03
   1.20586557e-02   4.23505596e-03   1.32726979e-02   2.06756938e-02
   3.63957053e-03   1.49384505e-02   8.96946481e-03   3.32963571e-02
   2.33324239e-03   9.45292434e-02   3.96500865e-04   1.69613556e-02
   4.02531761e-02   1.12791975e-02   4.67209613e-02   3.68033142e-03
   2.75038519e-02   8.06681367e-04   2.59315600e-02   5.56167797e-03
   1.09143309e-02   3.81396392e-03   3.57384108e-03   4.89295950e-03
   6.71863955e-04   2.56619971e-03   1.48475132e-04   2.50652925e-04
   2.80756735e-03   2.19083910e-05   1.09440947e-03   4.69189420e-03
   5.13302322e-03   4.36303813e-04   1.94261505e-03   6.08174726e-04
   2.71902886e-04   2.27073239e-03   5.00886660e-03   4.80224592e-03
   1.00947737e-02   1.88114627e-02   1.03080853e-04   5.29809743e-04
   2.26245798e-01   2.57178324e-02   1.59664784e-03   3.9

## Step 3.2: GBM with warm_start ##

**warm_start** lets an already fitted model be extended: when `n_estimators` is raised and `fit` is called again, the existing trees are kept and only the additional trees are trained, instead of rebuilding the whole ensemble from scratch.

In [21]:
# with warm_start
gbc = GradientBoostingClassifier(warm_start = True, 
                                 learning_rate = .05, 
                                 max_depth = 20,
                                 random_state = 0)

# Build tree models incrementally
for n_estimators in range(1, 1500, 100):
    gbc.set_params(n_estimators = n_estimators)
    gbc.fit(X_train, y_train) 

# Make predictions on warm started Gradient Boosting Machine
y_pred = gbc.predict(X_test)

print "Classification results: \n", classification_report(y_test, y_pred)
print "\nGBM parameters: \n", gbc.set_params

Classification results: 
             precision    recall  f1-score   support

          0       0.93      0.95      0.94       835
          1       0.92      0.89      0.91       546

avg / total       0.93      0.93      0.93      1381


GBM parameters: 
<bound method GradientBoostingClassifier.set_params of GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=20, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1401,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=True)>


## Step 3.3: Storing the model for later use after training ##

To store the model and use it later, we can use `joblib`

In [23]:
import errno
import os

cwd = os.getcwd()
print cwd
os.chdir(cwd)

# set your path here to store the pickled model
path = cwd + '/clf'

# create directory
clfm = os.makedirs(path)
os.chdir(path)

# now let's load our stored model and use it for prediction.
from sklearn.externals import joblib

# joblib dumps the model in the form of a pickle
joblib.dump( gbc,'clf_gbc.pkl')

# load the model
model_clone = joblib.load('clf_gbc.pkl')
# make predictions with the loaded model
zpred = model_clone.predict(X_test)

print zpred

/Users/adarshnair/Desktop/PyML_at_scale/Classification_Regression_at_scale
[1 1 1 ..., 1 0 0]


# Step 4: Extreme Gradient Boosting(XGBoost)#

Advantages of XGBoost:

* XGBoost can leverage multithreading on a single machine and parallel processing on clusters of multiple servers(sharding) while GBM(Gradient boosting machine) has no options for parallel processing.
* XGBoost can handle sparse data(GBM cannot) as input without storing zero values in memory.
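* Best node splits are found more efficiently than in GBM through a technique called the weighted quantile sketch: instead of evaluating every possible split value, XGBoost proposes a limited set of candidate split points from the (weighted) quantiles of each feature and evaluates only those, with the number of candidates controlled by a chosen approximation accuracy.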

## Step 4.1: Classification - XGBoost on spam dataset ##

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import numpy as np
from sklearn.metrics import classification_report
from sklearn import cross_validation

# Define XGBoost classifer
clf = xgb.XGBClassifier(n_estimators=100,
                        max_depth=8,
                        learning_rate=.1,
                        subsample=.5)

# Gradient boosting machine (GBM)
clf1 = GradientBoostingClassifier(n_estimators=100,
                                  max_depth=8,
                                  learning_rate=.1,
                                  subsample=.5)

# Fit XGBoost
xgm = clf.fit(X_train,y_train)

# Fit GBM
gbmf = clf1.fit(X_train,y_train)

# Get predictions
y_pred = xgm.predict(X_test)
y_pred2 = gbmf.predict(X_test)

print 'XGBoost results %r' % (classification_report(y_test, y_pred))
print 'gbm results %r' % (classification_report(y_test, y_pred2))

## Step 4.2 Regression - XGBoost on California housing dataset ##

### Step 4.2.a: XGBoost regressor without parameter grid ###

In [None]:
import numpy as np
import scipy.sparse
import xgboost as xgb
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
pd=fetch_california_housing()

# Log the target variable to even out skews
y = np.log(pd.target)
X_train, X_test, y_train, y_test = train_test_split(pd.data,
                                                    y,
                                                    test_size=0.15,
                                                    random_state=111)

# Define features
names = pd.feature_names
print names


from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV

# Define XGBoost Regressor
clf = xgb.XGBRegressor(gamma = 0,
                       objective = "reg:linear",
                       nthread = -1)


# Fit the regressor and make predictions
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)


print 'score before gridsearch %r' % mean_squared_error(y_test, y_pred)

### Step 4.2.b: XGBoost regressor with parameter grid ###

In [None]:
# Set up parameter grid
params = {'max_depth':[4,6,8],
          'n_estimators':[1000],
          'min_child_weight':range(1,3),
          'learning_rate':[.1,.01,.001],
          'colsample_bytree':[.8,.9,1],
          'gamma':[0,1]}

# With the parameter nthread we specify XGBoost for parallelisation 
cvx = xgb.XGBRegressor(objective= "reg:linear",
                       nthread=-1)
clf = GridSearchCV(estimator = cvx,
                   param_grid = params,
                   n_jobs = -1,
                   scoring = 'mean_absolute_error',
                   verbose = True)

# Fit the regressor and make predictions
clf.fit(X_train,y_train)
print clf.best_params_

y_pred = clf.predict(X_test)

print 'score after gridsearch %r' %mean_squared_error(y_test, y_pred)

## Step 4.3: Feature selection with XGBoost ##

In [None]:
import numpy as np
import os
from matplotlib import pylab as plt
%matplotlib inline   

# Our best parameter set:
# {'colsample_bytree': 1, 'learning_rate': 0.1, 'min_child_weight': 1,
#  'n_estimators': 500, 'max_depth': 8, 'gamma': 0}

# Booster parameters for the native XGBoost API.
# 'min_samples_leaf' has been dropped: it is a scikit-learn tree option, not
# an XGBoost booster parameter (XGBoost's counterpart is 'min_child_weight',
# left at its default here).
params = {'objective': "reg:linear",
          'eval_metric': 'rmse',
          'eta': 0.1,
          'max_depth': 8,
          'subsample': .5,
          'gamma': 0
          }

# Create the DMatrix (XGBoost's data container) with named features
dm = xgb.DMatrix(X_train,
                 label=y_train,
                 feature_names=names)

# Train the model
regbgb = xgb.train(params,
                   dm,
                   num_boost_round=100)

# NOTE(review): this seeds NumPy only after training has finished, so it
# cannot influence the model above; kept so the global RNG state seen by any
# later cell is unchanged — confirm whether it is still needed.
np.random.seed(1)

# Plot the features with their F score
xgb.plot_importance(regbgb,
                    color='magenta',
                    title='california-housing|variable importance')

In [None]:
# TODO
# https://github.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/blob/master/Chapter%2006/Chapter_6_code.ipynb