In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import os
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn import preprocessing
import time

In [2]:
# Import data (for Mac the "\\" might be replaced by "/")
cwd = os.getcwd()
data = pd.read_csv(cwd + "\\WA_Fn-UseC_-Sales-Win-Loss.csv", index_col = 0)

First do data preparation the Xiao way except standardization which is not required for Random Forest.

In [3]:
# Data Preparition
data_clean = data.copy()
# Drop rows with opportunity amount 0
data_clean = data_clean.drop(data_clean[data_clean['Opportunity Amount USD'] == 0].index, axis = 0)
# Drop features that should not be used in training and prediction
data_clean = data_clean.drop(['Sales Stage Change Count', 'Elapsed Days In Sales Stage', 
                              'Ratio Days Identified To Total Days',
                              'Ratio Days Qualified To Total Days', 'Ratio Days Validated To Total Days', 
                              'Total Days Identified Through Closing', 'Total Days Identified Through Qualified',
                              'Deal Size Category'], 
                             axis = 1)
# Change opportunity result(our target variable) to numeric
minority = data_clean['Opportunity Result'].value_counts().idxmin()
result = np.array((data_clean['Opportunity Result'] == minority).astype(int))
data_clean = data_clean.drop(['Opportunity Result'], axis = 1)
# For each categorical variables in the dataset, try to assign an unique indicator variable 
#      for each different category
# (This may not be a good step because it adds too many columns, but we'll see)
for col in data_clean.columns:
    if is_string_dtype(data_clean[col]):
        name_set = set(data_clean[col])
        for name in name_set:
            data_clean[' '.join([col, name, 'indicator'])] = (data_clean[col] == name).astype(int)
        data_clean = data_clean.drop([col], axis = 1)
# Check if there is any null value in our dataset
if data_clean.isnull().any().any():
    print('NO!!!!!')
# Gladly, we don't have any

data_clean['result'] = result

In [4]:
data_clean.describe()

Unnamed: 0,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Supplies Subgroup Car Electronics indicator,Supplies Subgroup Performance Parts indicator,Supplies Subgroup Interior Accessories indicator,Supplies Subgroup Exterior Accessories indicator,Supplies Subgroup Batteries & Accessories indicator,Supplies Subgroup Replacement Parts indicator,...,Region Southeast indicator,Route To Market Fields Sales indicator,Route To Market Other indicator,Route To Market Telecoverage indicator,Route To Market Telesales indicator,Route To Market Reseller indicator,Competitor Type Known indicator,Competitor Type None indicator,Competitor Type Unknown indicator,result
count,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,...,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0,75949.0
mean,94096.181727,1.617177,1.600851,0.302874,0.003252,0.032719,0.070626,0.178462,0.114564,0.099014,...,0.118198,0.475042,0.035524,0.007663,0.029217,0.452554,0.15463,0.118132,0.727238,0.227508
std,134083.441593,1.232121,1.207153,0.928062,0.056935,0.177902,0.256201,0.382904,0.318497,0.298683,...,0.322844,0.49938,0.185101,0.087203,0.168415,0.497747,0.361554,0.322766,0.445382,0.419226
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,110000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,1000000.0,5.0,5.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Split data into training and test (Xiao's version)
length = len(data_clean['result'])
train_index = []
test_index = []
for i in range(length):
    train_index.append(i) if np.random.rand() <= 0.8 else test_index.append(i)
data_train = data_clean.iloc[train_index, :]
data_test = data_clean.iloc[test_index, :]

In [10]:
# Use Grid Search for Gradient Boosting
min_samples_split = [100 * i for i in range(1, 5)]
min_samples_leaf = [10 * i for i in range(1, 7)]

gs_gbc = GBC()
param_dist = {'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'max_features': ['auto', 'sqrt', 'log2', None],
              'loss' : ['deviance', 'exponential'], 
              'n_estimators': [100, 500, 1500, 2000]}

cv_gbc = GridSearchCV(gs_gbc, cv = 3,
                     param_grid=param_dist,
                     n_jobs = 3)

In [12]:
start = time.time()
cv_gbc.fit(data_train.drop(['result'], axis = 1), data_train['result'])

print('Best Parameters using grid search: \n',
      cv_gbc.best_params_)

end = time.time()
print('Time taken in random grid: {0: .2f}'.format(end - start))

Best Parameters using grid search: 
 {'loss': 'exponential', 'max_features': 'auto', 'min_samples_leaf': 30, 'min_samples_split': 300, 'n_estimators': 2000}
Time taken in random grid:  39373.08


In [13]:
# Train a first model with these parameters
gb_1 = cv_gbc.best_estimator_
gb_1.fit(data_train.drop("result", 1), data_train['result'])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='exponential', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=30, min_samples_split=300,
              min_weight_fraction_leaf=0.0, n_estimators=2000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [14]:
# Make prediction
gb_1_pred = gb_1.predict_proba(data_test.drop("result", 1))

In [15]:
roc_auc_score(data_test["result"], gb_1_pred[:,1])

0.8553832665542003