<h1>Data Scientist Opening Pre-employment Assessment: Second Exercise</h1><br>
<p>Ben Trey</p>


In [2]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
import datetime
import time
import scipy.ndimage
from tqdm import tqdm_notebook
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
from sklearn.metrics import roc_auc_score
%matplotlib inline




<h1>Step 0:</h1>
<p>A few words of caution:<br> 
1) Read all the way through the instructions.<br> 
2) Models must be deployed as an API using Python.<br>
3) No additional data may be added or used.<br> 
4) Not all data must be used to build an adequate model, but making use of complex variables will help us identify high-performance candidates.<br>
5) The predictions returned by the API should be the class probabilities for belonging to the positive class, not the class itself (i.e. a decimal value, not just 1 or 0).</p> 

<p><b>Given Model</b><br>
  Here is a brief summary of the operation of the given model. It will not     work on this dataset so I am going to modify it to analyze its     
  performance. I am going to try to keep as much orginal code as   
  possible.<br>
  <ul><li>Loads Data<br>
      Drops nan
      Combines train and test data
      Gets percentage of Null values for each feature
    </li>
    <li>Feature Selection<br>
        Drops features not used for modeling<br>
        Drops features requiring Natural Language Processing<br>
        Removes Redundant features
    </li>
    <li>Feature Engineering<br>
      Change dollar figures represented as strings to floats<br>
      Turns percentages represented as strings to floats<br>
      Creates new features to be used in modeling
      Converts date times from strings into float
      Creates a new feature
      Sets target variable
    </li>
    <li>Split input variables into numerical and categorical features
    </li>
    <li>Data Preperation for Tree Model<br>
      Replace null value with large number<br>
      Label encoding for categorical feature
    </li>
    <li>Split Into Training and Test Dataset<br>
    </li>
    <li>Build Random Forest Model
    </li>
    <li>Develop A Random Forest Model
    </li>
    <li>Save the Model in Local Disk
    </li>
    <li>Evaluate the AUC Performance of the Model
    </li>
    <li>Make Prediciton on Test Data
    </li>
  </ul>
</p>

<p><b>Replicating Given Model</b><br>
The code below is a faithful represenation on the given model. Some of the libraries were changed in order to run with the current version of sklearn. The data in the given model did not match the column names of the given data. The feature engineering in the given model was applied appropriately to the given data.
</p>

In [3]:
def load_data():
    raw_train=pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_train.csv')
    raw_test=pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_test.csv')
    raw_train_drop = raw_train.dropna(axis=0, subset=['y'])
    list_all=[raw_train_drop,raw_test]
    len_train = len(raw_train_drop)
    len_test =len(raw_test)
    raw=pd.concat(list_all, ignore_index=True, sort=False)
    del(raw_train,raw_test,raw_train_drop)
    return [raw, len_train, len_test]

def get_null(raw):
    Null_list = raw.isnull().sum().sort_values(ascending=False)/float(raw.shape[0])*100
    print('the pencentage of NUll value in each features are:\r\n', Null_list[:10])

def remove_features(features, raw):
    for feature in features:
        del raw[feature]
    return raw

def remove_dollar(x):
    try:
        x = str(x)
        return float(x.strip('$').replace(',',''))
    except:
        return np.nan

def convert_dollars(features, raw):
    for feature in features:
        raw[feature]=raw[feature].apply(remove_dollar)
    return raw

def per_float(x):
    try:
        x = str(x)
        return float(x.strip('%'))/100
    except:
        return np.nan

def convert_percents(features, raw):
    for feature in features:
        raw[feature]=raw[feature].apply(per_float)
    return raw

def set_target_varible(raw):
    return raw['y']

def set_input_variable_list(raw):
    del raw['y']
    return raw

def get_object_columns(raw):
    return raw.dtypes[raw.dtypes == 'object'].index

def get_numerical_columns(raw):
    return raw.dtypes[raw.dtypes == 'float64'].index

def fill_null(raw,value=10**20):
    labels=get_numerical_columns(raw)
    for label in labels:
        raw[label].fillna(value,inplace=True)
    return raw    

In [4]:
raw_data = load_data()
raw = raw_data[0]
len_train = raw_data[1]
len_test = raw_data[2]
get_null(raw)

the pencentage of NUll value in each features are:
 y      20.000
x13     0.034
x55     0.034
x42     0.034
x18     0.032
x62     0.030
x99     0.030
x24     0.030
x96     0.028
x63     0.028
dtype: float64


In [5]:
raw = convert_dollars(['x12'], raw)

In [6]:
raw = convert_percents(['x79'], raw)

In [7]:
raw_y = raw['y']
del raw['y']

In [8]:
cat_cols = get_object_columns(raw)
num_cols = get_numerical_columns(raw)

In [9]:
raw = fillNull(raw)

In [10]:
### Label encoding for categorical feature
from sklearn import preprocessing
LBL = preprocessing.LabelEncoder()
dict_list = []
for i in cat_cols:
    raw[i] = LBL.fit_transform(raw[i].fillna('0'))
    j = dict(zip(np.arange(len(LBL.classes_)),LBL.classes_))
    k = {i:j}
    dict_list.append(k)

In [11]:
x = raw[:len_train]
y = raw_y[:len_train]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
holdout_x = raw[len_train:]
holdout_y = raw_y[len_train:]

In [12]:
rfr = RandomForestRegressor(n_jobs=-1) 

In [13]:
param_grid = { 
    'n_estimators': [60,120,180]
}

print('start the hypermeter grid search for n_estimator, it may take a few minutes')
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5)
CV_rfr.fit(train_x, train_y)

best_estimators = list(CV_rfr.best_params_.values())[0]

print(' hypermeter grid search is over')
print('The best paramter for n_estimator is:', best_estimators)

start the hypermeter grid search for n_estimator, it may take a few minutes
 hypermeter grid search is over
The best paramter for n_estimator is: 180


In [14]:
### Develop a random forest model with 'n_estimators' = best_estimators

print('random forest model is developing, it may take 10 minutes')
best_estimators = 180
rfr_best = RandomForestRegressor(n_jobs=-1,n_estimators=best_estimators) 
rfr_best.fit(train_x, train_y)

random forest model is developing, it may take 10 minutes


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=180, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [15]:
### Save the model in local disk

from sklearn.externals import joblib
joblib.dump(rfr_best, 'rmodelforest.pkl')

['rmodelforest.pkl']

In [16]:
### Evaluate the training AUC performance of the model
from sklearn.metrics import roc_auc_score

rfr_load = joblib.load('rforest.pkl')
print('random forest model is running')
train_y_pred = rfr_load.predict(train_x)
auc_train = roc_auc_score(train_y, train_y_pred)
print('Training AUC: ', auc_train)

random forest model is running
Training AUC:  1.0


In [17]:
### Make prediction on test data
test_y_pred = rfr_load.predict(test_x)
auc_test = roc_auc_score(test_y, test_y_pred)
print('Testing AUC: ', auc_test)

Testing AUC:  0.9486901912291219


<h1>Step 1:</h1><p>
Optimize the model: There are areas for improvement for the base model presented. Modify the code to improve the accuracy of the model. Look for opportunities to improve  performance including data cleaning/preparation, model selection, train/test split, and hyper-parameter tuning. The model performance will be measured by AUC against the holdout test set.</p>

<p><b>Replacement Model</b><br>
The replacement model also uses the random forest model. The data underwent additional cleaning. First the categorical data was checked for alternate references to the same value. The conversion from dollars to floats was also improved.<br><br>
To check the amount of data and the and parameters where not excessive random values for both were taken and the AUC was measured. A parabaloid was then fit to the values to find the opitimum values for each parameter. As there where few values for this simulation a best fit parabaloid smoothed out any irregularities. This code was written with the ability to combine the results of several simulations.<br><br>
The given model also looks like wasn't run enough times with subsets of data to tune hyper parameters. Even replicating the model and running it gave a dramatic improvement in its original AUC score of 0.937. As seen in the data taken for the paraballoid an accuracy of 0.95 (no idea why I didn't keep those values) was acheived an improvement of accuracy of 0.013. As getting this data was parallelizable refining hyperparamaters would also be parallelizable. 
</p>

In [19]:
def load_data(samples=0):
    raw_train = pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_train.csv')
    raw_test = pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_test.csv')
    if samples > 0 and samples < len(raw_train):
        raw_train.sample(samples)
    raw_train_drop = raw_train.dropna(axis=0, subset=['y'])
    list_all = [raw_train_drop,raw_test]
    len_train = len(raw_train_drop)
    len_test = len(raw_test)
    raw = pd.concat(list_all, ignore_index=True, sort=False)
    del(raw_train,raw_test,raw_train_drop)
    return [raw, len_train, len_test]

def get_null(raw):
    Null_list = raw.isnull().sum().sort_values(ascending=False)/float(raw.shape[0])*100
    print('the pencentage of NUll value in each features are:\r\n', Null_list[:10])

def remove_features(features, raw):
    for feature in features:
        del raw[feature]
    return raw

def remove_dollar(x):
    try:
        x = str(x)
        return float(x.strip('$').replace(',','').replace('(','').replace(')',''))
    except:
        return np.nan

def convert_dollars(features, raw):
    for feature in features:
        raw[feature] = raw[feature].apply(remove_dollar)
    return raw

def per_float(x):
    try:
        x = str(x)
        return float(x.strip('%'))/100
    except:
        return np.nan

def convert_percents(features, raw):
    for feature in features:
        raw[feature] = raw[feature].apply(per_float)
    return raw

def set_target_varible(raw):
    return raw['y']

def set_input_variable_list(raw):
    del raw['y']
    return raw

def get_object_columns(raw):
    return raw.dtypes[raw.dtypes == 'object'].index

def get_numerical_columns(raw):
    return raw.dtypes[raw.dtypes == 'float64'].index

def fill_null(raw,value=10**20):
    labels=get_numerical_columns(raw)
    for label in labels:
        raw[label].fillna(value,inplace=True)
    return raw    

def column_replace(raw, column, given_values, replacement_values):
    raw[column].replace(given_values, replacement_values, inplace=True)
    return raw

In [20]:
old_weekday_names=['thur', 'wed', 'thurday', 'wednesday', 'fri', 'friday',\
                 'tuesday', 'monday']
weekday_names=['Thursday', 'Wednesday', 'Thursday', 'Wednesday', 'Friday',\
                 'Friday', 'Tuesday', 'Monday']

In [21]:
old_month_names=['Jun', 'Aug', 'July', 'May', 'sept.', 'Oct', 'Apr', 'Feb',\
               'Nov', 'Mar', 'Dev', 'January']
month_names=['June', 'August', 'July', 'May', 'September', 'October',\
               'April', 'February', 'November', 'March', 'December',\
               'January']

In [22]:
old_location_names = ['asia', 'euorpe', 'america']
location_names = ['Asia', 'Europe', 'America']

In [23]:
old_car_names = ['volkswagon', 'bmw', 'ford', 'chrystler', 'tesla',\
                     'mercades', 'chevrolet', 'nissan']
car_names = ['Volkswagen', 'BMW', 'Ford', 'Chrysler', 'Tesla',\
                   'Mercedes', 'Chevrolet', 'Nissan']

In [24]:
rfr = RandomForestRegressor(n_jobs=-1) 

In [25]:
def gets_estimators_number():
    sample_size = []
    estimators_number = []
    AUC=[]
    for n in tqdm_notebook(range(5)):
        #getting subset of the data
        samples = np.random.randint(10000,4040000)
        raw_data = loadData(samples)
        sub_raw = raw_rata[0]
        len_train = raw_data[1]
        len_test = raw_data[2]
        sub_raw = convert_dollars(['x12'], sub_raw)
        sub_raw = convert_percents(['x79'], sub_raw)
        sub_raw = column_replace(sub_raw, 'x1', old_weekday_names, weekday_names)
        sub_raw = column_replace(sub_raw, 'x54', old_month_names, month_names)
        sub_raw = column_replace(sub_raw, 'x82', old_location_names, location_names)
        sub_raw = column_replace(sub_raw, 'x84', old_location_names, location_names)
        sub_raw_y = sub_raw['y']
        del sub_raw['y']
        cat_cols = get_object_columns(sub_raw)
        num_cols = get_numerical_columns(sub_raw)
        sub_raw = fill_null(sub_raw)
        LBL = preprocessing.LabelEncoder()
        dict_list = []
        for i in cat_cols:
            sub_raw[i] = LBL.fit_transform(sub_raw[i].fillna('0'))
            j = dict(zip(np.arange(len(LBL.classes_)),LBL.classes_))
            k = {i:j}
            dict_list.append(k)
        x = sub_raw[:len_train]
        y = sub_raw_y[:len_train]
        train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
        holdout_x = sub_raw[len_train:]
        holdout_y = sub_raw_y[len_train:]
        testN = 180+np.random.randint(0,120)
        param_grid = {'n_estimators':[testN]}
        CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5)
        CV_rfr.fit(train_x, train_y)
        best_estimators = list(CV_rfr.best_params_.values())[0]
        rfr_best = RandomForestRegressor(n_jobs=-1,n_estimators=best_estimators) 
        rfr_best.fit(train_x, train_y)
        joblib.dump(rfr_best, 'rforest.pkl')
        rfr_load = joblib.load('rforest.pkl')
        train_y_pred = rfr_load.predict(train_x)
        auc_train = roc_auc_score(train_y, train_y_pred)
        test_y_pred = rfr_load.predict(test_x)
        auc_test = roc_auc_score(test_y, test_y_pred)
        sample_size.append(samples)
        estimators_umber.append(testN)
        AUC.append(auc_test)
    print(sample_size)
    print(estimators_number)
    print(AUC)

In [60]:
get_estimators_number()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[3861179, 3594780, 1934837, 680574, 142256]
[213, 218, 229, 232, 257]
[0.9467715328563835, 0.9473127409008374, 0.9489243613842255, 0.9487746486943772, 0.9489413140652975]


<p>There is a missing cell here that solved for the paraballoid parameters using matrices.</p>

In [219]:
print('Best n_estimators')
2*C[0,0]*C[4,0]/(C[5,0]**2-4*C[0,0]*C[3,0])

255.46690444618892

In [228]:
print('Best amount of data')
(2*C[3,0]*C[1,0]-C[4,0]*C[5,0])/(C[5,0]**2-4*C[0,0]*C[3,0])

3088578.549800607

In [44]:
raw_data = load_data(3088579)
raw = raw_data[0]
len_train = raw_data[1]
len_test = raw_data[2]
get_null(raw)

the pencentage of NUll value in each features are:
 y      20.000
x13     0.034
x55     0.034
x42     0.034
x18     0.032
x62     0.030
x99     0.030
x24     0.030
x96     0.028
x63     0.028
dtype: float64


In [45]:
raw = convert_dollars(['x12'], raw)
raw = convert_percents(['x79'], raw)

In [46]:
raw = column_replace(raw, 'x1', old_weekday_names, weekday_names)

In [47]:
raw = column_replace(raw, 'x54', old_month_names, month_names)

In [48]:
raw = column_replace(raw, 'x82', old_location_names, location_names)

In [49]:
raw = column_replace(raw, 'x84', old_car_names, car_names)

In [50]:
raw_y = raw['y']
del raw['y']

In [51]:
cat_cols = get_object_columns(raw)
num_cols = get_numerical_columns(raw)

In [52]:
raw = fill_null(raw)

In [53]:
### Label encoding for categorical feature
from sklearn import preprocessing
LBL = preprocessing.LabelEncoder()
dict_list = []
for i in cat_cols:
    raw[i] = LBL.fit_transform(raw[i].fillna('0'))
    j = dict(zip(np.arange(len(LBL.classes_)),LBL.classes_))
    k = {i:j}
    dict_list.append(k)

In [54]:
x = raw[:len_train]
y = raw_y[:len_train]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
holdout_x = raw[len_train:]
holdout_y = raw_y[len_train:]

In [55]:
param_grid = { 
    'n_estimators': [255]
}

print('start the hypermeter grid search for n_estimator, it may take a few minutes')
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5)
CV_rfr.fit(train_x, train_y)

best_estimators = list(CV_rfr.best_params_.values())[0]

print(' hypermeter grid search is over')
print('The best paramter for n_estimator is:', best_estimators)

start the hypermeter grid search for n_estimator, it may take a few minutes
 hypermeter grid search is over
The best paramter for n_estimator is: 255


In [56]:
### Develop a random forest model with 'n_estimators' = best_estimators

print('random forest model is developing, it may take 10 minutes')
rfr_best = RandomForestRegressor(n_jobs=-1,n_estimators=best_estimators) 
rfr_best.fit(train_x, train_y)

random forest model is developing, it may take 10 minutes


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=255, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [57]:
### Save the model in local disk

from sklearn.externals import joblib
joblib.dump(rfr_best, 'rforest.pkl')

['rforest.pkl']

In [58]:
### Evaluate the training AUC performance of the model
from sklearn.metrics import roc_auc_score

rfr_load = joblib.load('rforest.pkl')
print('random forest model is running')
train_y_pred = rfr_load.predict(train_x)
auc_train = roc_auc_score(train_y, train_y_pred)
print('Training AUC: ', auc_train)

random forest model is running
Training AUC:  1.0


In [59]:
### Make prediction on test data
test_y_pred = rfr_load.predict(test_x)
auc_test = roc_auc_score(test_y, test_y_pred)
print('Testing AUC: ', auc_test)

Testing AUC:  0.9494908004890436


<h1>Step 2:</h1><p>
Prepare model deployment for production: Update your code to meet common production coding standards and best practices. These include modularization, code quality, proper unit testing, and comments/documentation. This should be completed for all parts of Step 1. The code will be evaluated using tooling that evaluates code coverage and code quality.</p>

<p><b>Preparing the Model</b><br>
Getting ready to apply the model does not require much code. In this model it is assumed that data will be checked prior to getting to the prediciton function. To reduce uncessesary redundancy the data will not be checked again here. 
</p>

In [308]:
def prediction(row):
    if not ('clf' in vars() or 'clf' in globals()):
        filename = 'rforest.pkl'
        clf = joblib.load(filename)
    features = row.columns
    return clf.predict(row[features])[0]

<h1>Step 3:</h1><p>
Wrap the model code inside an API: The model must be made callable via API call. The call will pass 1 to N rows of data in JSON format, and expects a N responses each with a predicted class and probability belonging to the predicted class.<br><br>

Here is an example curl call to your API:<br><br>

curl --request POST --url http://localhost:8080/predict --header 'content-type: application/json' --data '{"x0": "9.521496806", "x1": "wed", "x2": "-5.087588682", "x3": "-17.21471427", ..., "x97": "2.216918955", "x98": "-18.64465705", "x99": "-1.926577376"}'<br><br>

or a batch curl call:<br><br>

curl --request POST --url http://localhost:8080/predict --header 'content-type: application/json' --data '[{"x0": "9.521496806", "x1": "wed", "x2": "-5.087588682", "x3": "-17.21471427", ..., "x97": "2.216918955", "x98": "-18.64465705", "x99": "-1.926577376"},{"x0": "8.415753628", "x1": "thur", "x2": "-4.934359322", "x3": "-6.21844247", ..., "x97": "6.2714321", "x98": "-38.057369", "x99": "-2.76817620"},...,{"x0": "0.96691828", "x1": "thursday", "x2": "-3.86881782", "x3": "-2.2981827", ..., "x97": "3.1854471", "x98": "-33.6058873", "x99": "-2.02788172"}]'<br><br>

Each of the 10,000 rows in the test dataset will be passed through an API call. The call could be a single batch call w/ all 10,000 rows, or 10,000 individual calls. API should be able to handle either case with minimal impact to performance. </p>


<p><b>API</b><br>
The API hides its parallel computation in the flask library. Flask is threaded by default. Given the ability Pandas to hold the data in dataframes I would not expect it to be unreasonable for Flask to hold the calls in a stack.<br><br>
I'm acutally disappointed I didn't select a model with similar performance but also with the ability to use the GPU for parallel computation. That would have been fun.
</p>

In [380]:
file = open('flaskTest.py','w')
file.write('import pandas as pd\r\n')
file.write('import json\r\n')
file.write('import joblib\r\n')
file.write('from flask import Flask, jsonify, request\r\n\r\n')
file.write('def prediction(row):\r\n')
file.write('    filename = "rforest.pkl"\r\n')
file.write('    clf = joblib.load(filename)\r\n')
file.write('    features=row.columns\r\n')
file.write('    return clf.predict(row[features]) [0]\r\n\r\n')
file.write('app = Flask(__name__)\r\n\r\n')
file.write('@app.route("/predict", methods=["GET", "POST"])\r\n')
file.write('def calculate():\r\n')
file.write('    data=request.get_json()[0]\r\n')
file.write('    df = pd.DataFrame([list(data.values())],columns=list(data.keys()))\r\n')
file.write('    return str(prediction(df))\r\n\r\n')
file.write('if __name__ == "__main__":\r\n')
file.write('    app.run(host="localhost", port=8080, debug=True)')
file.close()

<p>Test call</p>

In [381]:
!curl --request POST --url http://localhost:8080/predict --header 'content-type: application/json' --data '[{"x0":"-18.80182962", "x1":"3", "x2":"-6.552565502", "x3":"9.538783853", "x4":"1.312835053", "x5":"-30.91975648", "x6":"-1.74670099", "x7":"10.35321926", "x8":"7.5405277989999995", "x9":"2.367212697", "x10":"3.3487117539999995", "x11":"-54.06989036", "x12":"567.15", "x13":"0.065574594", "x14":"-11.77904987", "x15":"22.50833577", "x16":"-13.597729000000001", "x17":"-14.84269673", "x18":"-5.87168539", "x19":"5.247740211", "x20":"-2.998573817", "x21":"-1.422839837", "x22":"2.33840389", "x23":"-2.555334256", "x24":"-19.04533265", "x25":"-74.65301752", "x26":"60.00202245", "x27":"-13.48779451", "x28":"1.9058288469999998", "x29":"-1.8450353030000002", "x30":"-18.00345303", "x31":"-0.335476461", "x32":"4.359108128", "x33":"-11.03844639", "x34":"3.4179060139999997", "x35":"3.911574551", "x36":"6.626978633999999", "x37":"157.34343859999998", "x38":"-2.545060753", "x39":"0.101563534", "x40":"39.40455103", "x41":"13.55925085", "x42":"-10.72240312", "x43":"70.4481205", "x44":"-10.37225358", "x45":"9.693022091", "x46":"-23.60131422", "x47":"-1.880379031", "x48":"-5.050058431", "x49":"-16.3366905", "x50":"-148.93878130000002", "x51":"-2.302510723", "x52":"42.95773684", "x53":"7.024470695", "x54":"7", "x55":"1.1978967409999999", "x56":"-1.020319325", "x57":"40.85278578", "x58":"-1.07447457", "x59":"-4.671086305", "x60":"0.917255133", "x61":"42.22603471", "x62":"69.41205636", "x63":"-0.472638781", "x64":"-0.19482051", "x65":"38.17640107", "x66":"-1.7928915909999998", "x67":"1.886820608", "x68":"-4.182808393999999", "x69":"-1.6729802569999999", "x70":"-0.25559694", "x71":"11.94328284", "x72":"-2.3974483280000003", "x73":"12.63558802", "x74":"2.2493899219999998", "x75":"-0.315405289", "x76":"0.0912948", "x77":"-8.269190527000001", "x78":"1.882057053", "x79":"0.0001", "x80":"-39.23067599", "x81":"9.004164193", "x82":"2", "x83":"0.920773905", "x84":"9", "x85":"-0.41756686299999995", "x86":"-2.178081012", "x87":"1.5203101069999998", "x88":"8.832329334", "x89":"2.541372381", "x90":"2.346654626", "x91":"1.123001753", "x92":"0.100648931", "x93":"-2.289469986", "x94":"-31.6471515", "x95":"33.02701707", "x96":"0.18542210399999998", "x97":"6.273963103", "x98":"13.97970419", "x99":"-6.463641615"}]'

0.5450980392156862

<h1>Step 4:</h1><p>
Wrap your API in a Docker image: Create a Dockerfile that builds your API into an image. Write a shell script titled run_api.sh that either runs your image using traditional docker run commands or orchestrates your deployment using Compose, Swarm or Kubernetes (include relevant *.yml config files).</p>

<p># Dockfile<br>
FROM python:latest<br>
WORKDIR /home/ubuntu/docker<br>
COPY flaskTest.py .<br>
CMD ["hello.py", "-flag"]

#!/bin/bash<br>
docker run -ti --name flaskTest pulkit/flaskTest:1.0

<h1>Step 5:</h1><p>
Optimize your deployment for enterprise production and scalability: Identify opportunities to optimize your deployment for scalability. Consider how your API might handle a large number of calls (thousands per minute). What additional steps/tech could you add to your deployment in order to make it scalable for enterprise level production. You can incorporate any relevant code (optional), or you can describe your steps in the write-up as part of Step 6.</p>

<h1>Step 6:</h1><p>
Submit your work: Please submit all of your code, including relevant python files for the API, data prep, model build, Dockerfile (if relevant, orchestration config files), startup shell script, and a brief write-up documenting justification for your end-to-end process in PDF format. Recommend to tar or zip all files into a single archive for submission.<br><br> 
Please do not submit the original data back to us. Your work will be scored on model performance - measured by AUC - on the data hold out, API performance and scalability, code quality and coverage, and creativity points based on a review of Step 5.<p>

<h1>Appendix A: Data Exploration</h1>

<p>Looking at raw data</p>

In [None]:
raw_train=pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_train.csv')
raw_test=pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_test.csv')

<p>Finding the columns that are not numerical</p>

In [None]:
raw_train.loc[:, raw_train.dtypes=='object'].head()

In [None]:
raw_train=raw_train.dropna()

<p>Finding the columns that are ints</p>

In [None]:
raw_train.loc[:, raw_train.dtypes=='int'].head()

<p>Finding the columns that are float64</p>

In [None]:
raw_train.loc[:, raw_train.dtypes=='float64'].head()

<p>94 + 1 + 6 adds up to all of the columns<br><br>
Inspecting the object columns</p>

In [None]:
raw_train['x1'].unique()

In [None]:
#replace week names with integers
oldWeekdayNames=['thur', 'wed', 'thurday', 'wednesday', 'fri', 'friday',\
                 'tuesday', 'monday']
weekdayIntegers=[4, 3, 4, 3, 5, 5, 2, 1]
raw_train['x1'].replace(oldWeekdayNames, weekdayIntegers, inplace=True)

In [None]:
raw_train['x1'].unique()

In [None]:
raw_train['x54'].unique()

In [None]:
#replace month names with integers
oldMonthsNames=['Jun', 'Aug', 'July', 'May', 'sept.', 'Oct', 'Apr', 'Feb',\
               'Nov', 'Mar', 'Dev', 'January']
monthIntegers=[5, 7, 6, 4, 8, 9, 3, 1, 10, 2, 11, 0]
raw_train['x54'].replace(oldMonthsNames, monthIntegers, inplace=True)

In [None]:
raw_train['x54'].unique()

In [None]:
raw_train['x12'].unique()

In [None]:
raw_train['x12']=raw_train['x12'].str.replace('$','')
raw_train['x12']=raw_train['x12'].str.replace('(','')
raw_train['x12']=raw_train['x12'].str.replace(')','')
raw_train['x12']=raw_train['x12'].str.replace(',','')
raw_train['x12']=pd.to_numeric(raw_train['x12'])

In [None]:
raw_train['x12'].unique()

In [None]:
raw_train['x79'].unique()

In [None]:
raw_train['x79']=raw_train['x79'].str.replace('%','')
raw_train['x79']=pd.to_numeric(raw_train['x79'])

In [None]:
raw_train['x79'].unique()

<p>Check for duplicate columns</p>

In [None]:
for n in range(1,100):
    for m in range(n+1,100):
        if list(raw_train['x'+str(n)])==list(raw_train['x'+str(m)]):
            print('Duplicate columns: x'+str(n)+' '+'x'+str(m))

In [None]:
raw_train['x84'].unique()

In [None]:
#replacing car names
mispelledCarNames = ['volkswagon', 'bmw', 'ford', 'chrystler', 'tesla', \
'mercades', 'chevrolet', 'nissan']
correctCarNames = ['Volkswagen', 'BMW', 'Ford', 'Chrysler', 'Tesla', \
'Mercedes', 'Chevrolet', 'Nissan']
raw_train['x84'].replace(mispelledCarNames, correctCarNames,inplace=True)

In [None]:
raw_train['x84'].unique()

In [None]:
raw_train['x82'].unique()

In [None]:
#replacing location names
mispelledLocationNames = ['asia', 'euorpe', 'america']
correctLocationNames = ['Asia', 'Europe', 'America']
raw_train['x82'].replace(mispelledLocationNames, correctLocationNames,inplace=True)

In [None]:
raw_train['x82'].unique()

In [None]:
raw_train.loc[:, raw_train.dtypes=='object'].head()

<h1>Appendix B: Given Model</h1>

### Loading data

In [None]:
raw_train=pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_train.csv')
raw_test=pd.read_csv('/mnt/c/users/我的电脑/desktop/JobApplications/StateFarmDataScience/Exercise2/exercise_06_test.csv')

In [None]:
raw_train.head()

In [None]:
### Drop NULL values in column x1
raw_train_drop = raw_train.dropna(axis=0,subset=['y'])

In [None]:
### Combine train and test data
list_all=[raw_train_drop,raw_test]
raw = pd.concat(list_all,ignore_index=True)

In [None]:
len_train = len(raw_train_drop)
len_test =len(raw_test)
print('The size of effective training and test dataset is', len_train, len_test)

In [None]:
del(raw_train,raw_test,raw_train_drop)

In [None]:
### Get percentage of NULL values for each feature
Null_list = raw.isnull().sum().sort_values(ascending=False)/float(raw.shape[0])*100
print('the pencentage of NUll value in each features are:', Null_list[:10])

### Feature Selection

In [None]:
#remove features not used for modeling
del raw['x2']
del raw['x3']
del raw['x19']

#These feature need Nature Language Processing before using, thus increasing the complexity of current model
del raw['x10']
del raw['x16']
del raw['x18']

#Remove redundant feature
del raw['x8']

### Feature Engineering

In [None]:
# Remove $ from dollar amount features
def remove_dollar(x):
    try:
        x = str(x)
        return float(x.strip('$').replace(',',''))
    except:
        return np.nan

In [None]:
raw['x4'] = raw['x4'].apply(remove_dollar)
raw['x5'] = raw['x5'].apply(remove_dollar)
raw['x6'] = raw['x6'].apply(remove_dollar)
raw['x12'] = raw['x12'].apply(remove_dollar)

In [None]:
# Convert the variable format from percentage to float
def per_float(x):
    try:
        x = str(x)
        return float(x.strip('%'))/100
    except:
        return np.nan

In [None]:
raw['x30'] = raw['x30'].apply(per_float)

In [None]:
# Create new features to be used in modeling
raw['x33'] = raw['x5']/raw['x4']
raw['x34'] = raw['x6']/raw['x5']

In [None]:
# Convert time from string format to float (Number of years since 1900-01-01)
def toYears(x):
    try:
        x = datetime.datetime.strptime(x, "%b-%Y")
        x = x-datetime.datetime(1900,1,1)
        return x.days/365.0
    except:
        try:
            x = datetime.datetime.strptime(x, "%b-%y")
            if (x - datetime.datetime(2017,12,31)).days> 0:
                x = x-datetime.datetime(2000,1,2)
                return x.days/365.0
            x = x-datetime.datetime(1900,1,1)
            return x.days/365.0        
        except:
            return np.nan

In [None]:
raw['x15'] = raw['x15'].apply(toYears)
raw['x23'] = raw['x23'].apply(toYears)

In [None]:
# Time difference between issue date and the date opened
raw['x35'] = raw['x15']  - raw['x23']

In [None]:
### Set target variable and remove it from input variable list
raw_y = raw['y']
del raw['y']

### Split input variables into numerical features and categorical features

In [None]:
cat_cols = raw.dtypes[raw.dtypes == 'object'].index
num_cols = raw.dtypes[raw.dtypes == 'float64'].index

### Make statistics analysis on target variables, numerical features and categorical features
### This cell is very slow to run analysis. Don't run unless you're interested in viewing individual features'
fig= plt.figure(figsize=(5,5))
ax1 = fig.add_subplot(1,1,1)
y.plot(kind= 'hist',axes =ax1)
plt.title('Histgram for interest rate')
plt.show()

for i in cat_cols:
    fig= plt.figure(figsize=(15,5))
    ax1 = fig.add_subplot(1,2,1)
    raw[i].value_counts().plot(kind= 'bar',axes =ax1)
    plt.title('Histgram for feature: %s' %(i))
    ax2 = fig.add_subplot(1,2,2)
    raw[i].value_counts(normalize = 'True').plot(kind= 'bar', axes =ax2)
    plt.title('Histgram for feature: %s (in percentate)' %(i))
    plt.show()

for i in num_cols:
    a = raw[i]
    b = a[abs(a - a.mean()) <=3*a.std()]
    fig= plt.figure(figsize=(8,5))
    b.plot(kind= 'hist',bins = 10)
    plt.title('Histgram of %s' %(i))
    plt.show()

In [None]:
## Make a copy of raw input, will be used later as input variables in the linear regression model 
raw_bp_linear = copy.deepcopy(raw)

### Data preparation for tree model: 

In [None]:
### Replace the Null value with very large number (10**20), let tree model to interpret by itself
for i in num_cols:
    raw[i].fillna(10**20,inplace=True)

In [None]:
### Label encoding for categorical feature
from sklearn import preprocessing
LBL = preprocessing.LabelEncoder()
dict_list = []
for i in cat_cols:
    raw[i] = LBL.fit_transform(raw[i].fillna('0'))
    j = dict(zip(np.arange(len(LBL.classes_)),LBL.classes_))
    k = {i:j}
    dict_list.append(k)

# print(dict_list)

In [None]:
### Split into training and test dataset
from sklearn.model_selection import train_test_split
x = raw[:len_train]
y = raw_y[:len_train]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
holdout_x = raw[len_train:]
holdout_y = raw_y[len_train:]

In [None]:
### Build random forest model

from sklearn import metrics
#from sklearn import cross_validation
from sklearn.model_selection import cross_validate
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
rfr = RandomForestRegressor(n_jobs=-1) 

#### hyper-parameter search: n_estimators
start to build random forest model"
To reduce the code running time, the process of hypermeter grid search for (n_estimator)
which may take one hour or so. Here we only use the final search result
If interested in checking the search process, run the following cell

In [None]:
### Develop a random forest model with 'n_estimators' = best_estimators

print('random forest model is developing, it may take 10 minutes')
best_estimators = 180
rfr_best = RandomForestRegressor(n_jobs=-1,n_estimators=best_estimators) 
rfr_best.fit(train_x, train_y)

In [None]:
### Save the model in local disk

from sklearn.externals import joblib
joblib.dump(rfr_best, 'rforest.pkl')

In [None]:
### Evaluate the training AUC performance of the model
from sklearn.metrics import roc_auc_score

rfr_load = joblib.load('rforest.pkl')
print('random forest model is running')
train_y_pred = rfr_load.predict(train_x)
auc_train = roc_auc_score(train_y, train_y_pred)
print('Training AUC: ', auc_train)

In [None]:
### Make prediction on test data
test_y_pred = rfr_load.predict(test_x)
auc_test = roc_auc_score(test_y, test_y_pred)
print('Testing AUC: ', auc_test)