# Model Fine Tuning

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score 
from sklearn import preprocessing
import pickle

In [2]:
df_train = pd.read_pickle(r'C:\DataScience\PROJECT_AFIMILK\Data\Flat_File_for_cleansing\df_train_80.p')
df_val = pd.read_pickle(r'C:\DataScience\PROJECT_AFIMILK\Data\Flat_File_for_cleansing\df_validation_80.p')

In [3]:
df_train = df_train.drop(columns=['FarmCode_32','split'])
df_val = df_val.drop(columns=['FarmCode_32','split'])

In [4]:
X_train = df_train.drop(columns= 'CurMet_t7_t30')
y_train = df_train['CurMet_t7_t30'].astype('category')

print(X_train.shape)
print(y_train.shape)

(2140, 78)
(2140,)


In [5]:
X_val = df_val.drop(columns= 'CurMet_t7_t30')
y_val = df_val['CurMet_t7_t30'].astype('category')

print(X_val.shape)
print(y_val.shape)

(536, 78)
(536,)


In [6]:
## Normalized the variables
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
##
scaler_val = preprocessing.StandardScaler().fit(X_val)
X_val_scaled = scaler.transform(X_val)

In [14]:
def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    prf1 = precision_recall_fscore_support(test_labels,y_pred)
    recall = prf1[1][1]
    auc = roc_auc_score(test_labels, y_pred)
    ##
    print('Model Performance')
    print('Recall: {:0.4f}'.format(recall))
    print('AUC: {:0.4f}'.format(auc))
    return recall, auc

### Random Forest

In [8]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 3, 4, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [9]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [2, 3, 4, 5], 'bootstrap': [True, False]}


In [10]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator= rf, param_distributions= random_grid, n_iter= 1000, cv =10, 
                               verbose= 2, random_state= 42, n_jobs= -1, scoring= 'roc_auc' )

# Fit the random search model
rf_random.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 36.2min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 49.3min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 65.6min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 84.6min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 104.8min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 126.5min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 151.4min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 178.3min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 208.0min
[Parallel(n_jobs=-1)]: Done 9097 tasks    

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_iter=1000,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 3, 4, 5],
                                        'min_samples_split': [2, 3, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   random_state=42, scoring='roc_auc', verbose=2)

In [11]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 3,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': False}

#### Base Estimator

In [15]:
base_model_rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
base_model_rf.fit(X_train_scaled, y_train)
base_accuracy_rf = evaluate(base_model_rf, X_val_scaled, y_val)

Model Performance
Recall: 0.7312
AUC: 0.8616


####  fine-tunned model

In [16]:
best_random_rf = rf_random.best_estimator_
random_accuracy_rf = evaluate(best_random_rf, X_val_scaled, y_val)

Model Performance
Recall: 0.7250
AUC: 0.8585


In [30]:
## Save model
filename = r'C:\DataScience\PROJECT_AFIMILK\Data\Flat_File_for_cleansing\rf_final_model.p'
pickle.dump(base_model_rf, open(filename, 'wb'))

### ADABoost

In [17]:
criterion = ["gini", "entropy"]
#splitter = ["best", "random"]
max_depth = [int(x) for x in np.linspace(start = 2, stop = 10, num = 9)]
min_samples_split = [2,3,4,5]
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 10, num = 9)]
max_features = ['auto', 'sqrt', 'log2']
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 9)]

ada_random_grid = {"base_estimator__criterion" : criterion,
                  #"base_estimator__splitter" : splitter  ,
                  "base_estimator__max_depth" : max_depth,
                   "base_estimator__min_samples_split" : min_samples_split,
                   "base_estimator__min_samples_leaf" : min_samples_leaf,
                   #"base_estimator__max_features" : max_features,
                  "n_estimators": n_estimators
                 }


In [18]:
DTC = DecisionTreeClassifier(random_state = 42)

ABC = AdaBoostClassifier(base_estimator = DTC)

random_search_ABC = RandomizedSearchCV(estimator= ABC, param_distributions= ada_random_grid, n_iter= 1000, cv =10, 
                               verbose= 2, random_state= 42, n_jobs= -1)


In [19]:
random_search_ABC.fit(X_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 40.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 140.7min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 151.2min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 165.5min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 179.3min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 196.0min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 211.9min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 229.2min
[Parallel(n_jobs=-1)]: Done 9097 tasks  

RandomizedSearchCV(cv=10,
                   estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=42)),
                   n_iter=1000, n_jobs=-1,
                   param_distributions={'base_estimator__criterion': ['gini',
                                                                      'entropy'],
                                        'base_estimator__max_depth': [2, 3, 4,
                                                                      5, 6, 7,
                                                                      8, 9,
                                                                      10],
                                        'base_estimator__min_samples_leaf': [2,
                                                                             3,
                                                                             4,
                                                                             5,
                                     

In [20]:
random_search_ABC.best_params_

{'n_estimators': 43,
 'base_estimator__min_samples_split': 4,
 'base_estimator__min_samples_leaf': 3,
 'base_estimator__max_depth': 8,
 'base_estimator__criterion': 'entropy'}

#### Base Estimator

In [21]:
base_model_ada = AdaBoostClassifier(random_state=1)
base_model_ada.fit(X_train_scaled, y_train)
base_accuracy_ada = evaluate(base_model_ada, X_val_scaled, y_val)

Model Performance
Recall: 0.7750
AUC: 0.8769


####  Fine-tunned model

In [22]:
best_random_ada = random_search_ABC.best_estimator_
random_accuracy_ada = evaluate(best_random_ada, X_val_scaled, y_val)

Model Performance
Recall: 0.7000
AUC: 0.8487


In [31]:
## Save model
filename = r'C:\DataScience\PROJECT_AFIMILK\Data\Flat_File_for_cleansing\adaboost_final_model.p'
pickle.dump(base_model_ada, open(filename, 'wb'))

### Logistic Regression

In [23]:
# Tolerance for stopping criteria. float, default=1e-4
tol = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3]
# Algorithm to use in the optimization problem.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# Maximum number of iterations taken for the solvers to converge.  int, default=100
max_iter = [int(x) for x in np.linspace(start = 100, stop = 10000, num = 100)]


In [24]:
# Create the random grid
lr_random_grid = {'tol': tol,
               'solver': solver,
               'max_iter': max_iter
               }

print(lr_random_grid)

{'tol': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000]}


In [25]:
lr = LogisticRegression()

random_search_LR = RandomizedSearchCV(estimator= lr, param_distributions= lr_random_grid, n_iter= 1000, cv =10, 
                               verbose= 2, random_state= 42, n_jobs= -1)

random_search_LR.fit(X_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 45.3min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 62.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 80.3min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 96.2min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 116.8min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 140.8min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 164.1min
[Parallel(n_jobs=-1)]: Done 9097 tasks      

RandomizedSearchCV(cv=10, estimator=LogisticRegression(), n_iter=1000,
                   n_jobs=-1,
                   param_distributions={'max_iter': [100, 200, 300, 400, 500,
                                                     600, 700, 800, 900, 1000,
                                                     1100, 1200, 1300, 1400,
                                                     1500, 1600, 1700, 1800,
                                                     1900, 2000, 2100, 2200,
                                                     2300, 2400, 2500, 2600,
                                                     2700, 2800, 2900, 3000, ...],
                                        'solver': ['newton-cg', 'lbfgs',
                                                   'liblinear', 'sag', 'saga'],
                                        'tol': [1e-05, 5e-05, 0.0001, 0.0005,
                                                0.001, 0.005]},
                   random_state=42, verbose=2)

#### Base Estimator

In [26]:
base_model_lr = LogisticRegression(max_iter= 1000, random_state= 1)
base_model_lr.fit(X_train_scaled, y_train)
base_accuracy_lr = evaluate(base_model_lr, X_val_scaled, y_val)

Model Performance
Recall: 0.7688
AUC: 0.8684


####  Fine-tunned model

In [27]:
best_random_lr = random_search_LR.best_estimator_
random_accuracy_lr = evaluate(best_random_lr, X_val_scaled, y_val)

Model Performance
Recall: 0.7188
AUC: 0.8554


In [32]:
## Save model
filename = r'C:\DataScience\PROJECT_AFIMILK\Data\Flat_File_for_cleansing\lr_final_model.p'
pickle.dump(base_model_lr, open(filename, 'wb'))