### Load libraries

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
import joblib

### Load data

In [2]:
clean_df = pd.read_csv('data/clean.csv')

# Idea for the model
To create a model, which predicts the sending time TS so that opening time TO will be as early as possible, the direct way is focusing on the data with low TS TO difference. 
Now we can abstract the problem into a standard machine learning problem with filtered dataset.
* Feature: X1, X2, X3
* Target: TS
* Dataset: df.TS_TO_diff_minutes<TS_TO_diff_threshold

### Define the cutting threshold

In [3]:
TS_TO_diff_threshold=5

### Extract features and targets

In [4]:
filtered_df=clean_df.loc[clean_df['TS_TO_diff_minutes']<TS_TO_diff_threshold,['X1','X2','X3_0','X3_1','X3_2','X3_3','TS']]
features = filtered_df[['X1','X2','X3_0','X3_1','X3_2','X3_3']].values
targets = filtered_df.TS.values

### Prepare for cross-validation and grid search

In [5]:
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
reg = RandomForestRegressor(random_state=42)
grid = {'n_estimators':[10,100,200,500], 'max_depth': [None, 100,200,500,1000]}
est = GridSearchCV(reg, grid, cv=cv, scoring='neg_mean_absolute_error')

### Run cross-validation and grid search

In [6]:
est.fit(features,targets)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.3,
            train_size=None),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
     

In [7]:
est.best_params_

{'max_depth': None, 'n_estimators': 500}

### Save randomforest model

In [10]:
joblib.dump(est.best_estimator_, 'model/rf_model.pkl') 

['model/rf_model.pkl']

### Train a SVM regression model

In [14]:
#cross-validaton of svm model
from sklearn import svm
reg = svm.SVR()
grid = {'kernel':['poly','rbf','sigmoid'], 'gamma': ['scale','auto'], 'degree':[1,2,3]}
est = GridSearchCV(reg, grid, cv=cv, scoring='neg_mean_absolute_error')
est.fit(features,targets)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.3,
            train_size=None),
             error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'degree': [1, 2, 3], 'gamma': ['scale', 'auto'],
                         'kernel': ['poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=0)

In [21]:
est.best_params_

{'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}

### Save SVM model

In [16]:
joblib.dump(est.best_estimator_, 'model/svm_model.pkl') 

['model/svm_model.pkl']