# Deal Opportunity Scoring Predictive Model (Re-Training and PKL File Output)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; 
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [7]:
# open deal data and assign to data frame
deal_df = pd.read_excel("deal_retraining_q3_fy21.xlsx")

In [8]:
# sets global parameters to show all columns for .head function
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [9]:
deal_df.head()

Unnamed: 0,selected_count,create_to_selected,create_to_propose,create_to_shop,create_to_close,shop_to_close,days_shop_from_qtr_beg,days_shop_from_qtr_end,days_created_from_qtr_beg,cycle_per_stage,exp_outcome
0,0,0,0,12,631,619,84,6,72,315,0
1,0,0,0,0,47,47,72,18,72,0,0
2,0,0,0,0,110,110,72,18,72,0,0
3,0,0,0,0,203,203,72,18,72,0,0
4,0,0,0,0,384,384,72,18,72,192,0


### Check for outliers and missing values

In [10]:
deal_df.describe()

Unnamed: 0,selected_count,create_to_selected,create_to_propose,create_to_shop,create_to_close,shop_to_close,days_shop_from_qtr_beg,days_shop_from_qtr_end,days_created_from_qtr_beg,cycle_per_stage,exp_outcome
count,3468.0,3468.0,3468.0,3468.0,3468.0,3468.0,3468.0,3468.0,3468.0,3468.0,3468.0
mean,0.189158,55.314014,49.940311,84.97895,420.485006,83.962226,21.433968,16.449539,50.827278,196.511534,0.20271
std,0.391691,184.230354,175.351123,224.949493,390.404253,212.768969,30.176406,25.516935,25.287727,269.936407,0.402077
min,0.0,0.0,0.0,-764.0,-530.0,-365.0,0.0,0.0,0.0,-8.0,0.0
25%,0.0,0.0,0.0,0.0,155.0,0.0,0.0,0.0,29.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,312.0,0.0,0.0,0.0,58.0,91.0,0.0
75%,0.0,0.0,0.0,33.0,588.0,49.25,44.0,25.25,72.0,291.0,0.0
max,1.0,2220.0,2156.0,2156.0,3505.0,1848.0,91.0,91.0,91.0,2616.0,1.0


# Assign features and target varibles to X and y

In [11]:
# Get X and y
X = deal_df.iloc[:, :-1].values
y = deal_df.iloc[:, -1].values

In [12]:
X[:12]

array([[  0,   0,   0,  12, 631, 619,  84,   6,  72, 315],
       [  0,   0,   0,   0,  47,  47,  72,  18,  72,   0],
       [  0,   0,   0,   0, 110, 110,  72,  18,  72,   0],
       [  0,   0,   0,   0, 203, 203,  72,  18,  72,   0],
       [  0,   0,   0,   0, 384, 384,  72,  18,  72, 192],
       [  0,   0,   0,   0, 394, 394,  72,  18,  72, 197],
       [  0,   0,   0,   0, 203,   0,   0,   0,  72,   0],
       [  0,   0,   0,   0, 203,   0,   0,   0,  72, 203],
       [  0,   0,   0,   0, 447,   0,   0,   0,  72, 223],
       [  0,   0,   0,   0, 233, 233,  72,  18,  72,   0],
       [  0,   0,   0,   0, 114,   0,   0,   0,  72,   0],
       [  0,   0,   0,   0, 202, 202,  72,  18,  72, 202]])

In [13]:
#y.ravel() flattens if 2 dimensional
y

array([0, 0, 0, ..., 0, 0, 1])

In [14]:
X.shape

(3468, 10)

In [15]:
y.shape

(3468,)

### Standardize Features

In [16]:
# standardize features
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()

# train data calcualtes mean/std and then standarizes it using 'fit'
# test just calculates mean/std for test data. 
# DO NOT 'standarize' test data. Only need to standarize train to train model
X_train_std = stdsc.fit_transform(X)

In [17]:
X_train_std[:-1]

array([[-0.48299681, -0.30028705, -0.28484285, ..., -0.40957292,
         0.83739338,  0.43901286],
       [-0.48299681, -0.30028705, -0.28484285, ...,  0.06077082,
         0.83739338, -0.72809694],
       [-0.48299681, -0.30028705, -0.28484285, ...,  0.06077082,
         0.83739338, -0.72809694],
       ...,
       [ 2.07040705,  0.00372406, -0.1422512 , ...,  1.11904421,
        -1.17968606, -0.67622539],
       [-0.48299681, -0.30028705, -0.28484285, ..., -0.64474478,
        -1.17968606, -0.14268949],
       [-0.48299681, -0.30028705, -0.28484285, ..., -0.64474478,
        -1.17968606,  0.7724728 ]])

In [18]:
X_train_std.shape

(3468, 10)

In [19]:
y.shape

(3468,)

In [20]:
X_full_train = X
y_full_train = y

# Random Forest Model

In [21]:
# Random Forest (ensemble of Decision Trees)
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=2000, random_state=0)
final_rf = rf.fit(X_full_train, y_full_train)
full_train_score_rf = rf.score(X_full_train, y_full_train)
print("Random Forest model score: {0:.2f}%".format(100*full_train_score_rf))

Random Forest model score: 100.00%


# Create Final Model PKL file to be used for Redshift/Tableau

In [22]:
# Import Joblib Module from Scikit Learn

import pickle

In [23]:
# Save RF_Model to file in the current working directory

filename = "Opportunity_Score_RF_Model.pkl"  
pickle.dump(final_rf, open(filename,'wb'))

In [24]:
# Load from file

pickle_RF_model = pickle.load(open(filename,'rb'))


pickle_RF_model

RandomForestClassifier(n_estimators=2000, random_state=0)

In [25]:
# Use the Reloaded Joblib Model to 
# Calculate the accuracy score and predict target values

# Calculate the Score 
score = pickle_RF_model.score(X_full_train, y_full_train)

# Print the Score
print("Model training score: {0:.2f} %".format(100 * score))  


# Predict the Labels using the reloaded Model
Ypredict = pickle_RF_model.predict_proba(X_full_train)


Ypredict

Model training score: 100.00 %


array([[9.985e-01, 1.500e-03],
       [9.990e-01, 1.000e-03],
       [9.880e-01, 1.200e-02],
       ...,
       [9.995e-01, 5.000e-04],
       [1.000e+00, 0.000e+00],
       [5.500e-02, 9.450e-01]])