# Model Selection and Tuning
1. Split data in to train, validation, test sets
2. Train different models (optional train on full dataset and clusters)
2. Copy algorithm name and parameters to Pandas dataframe with metrics (accuracy, recall, precision, roc auc, f1-score if classication; mae, mse, r2 score if regression)
3. Pick best algorithm + parameters

In [15]:
import numpy as np
import pandas as pd
import psycopg2 as pg
import datetime as dt
import pickle
import ast
import os
import time
import itertools
import yaml

# modeling
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

%matplotlib inline
import matplotlib.pyplot as plt

### Inputs and connection string

In [2]:
# model configurations

file_args = yaml.load(open('./conf/pipeline_args.yml','r'))

SyntaxError: invalid syntax (<unknown>, line 20)

In [3]:
# create db connection objects

conn_str_file = './conf/db_conn_str.txt'

pg_conn_str = open(conn_str_file, 'r').read()
conn = pg.connect(pg_conn_str)

### Load data

In [5]:
# clustered_training_data = pd.read_sql('SELECT * FROM clustered_training_data', con=conn)
clustered_training_data = pickle.load(open('./data/train_test/clustered_training_data.pkl','rb'))

FileNotFoundError: [Errno 2] No such file or directory: './data/train_test/clustered_training_data.pkl'

### Modify features for training

In [4]:
drop_columns_df = clustered_training_data.drop(columns=['index','date','time','segment_id',
                                                        'level_min','level_mean','level_count'])

NameError: name 'clustered_training_data' is not defined

In [6]:
train_data, test_data = train_test_split(drop_columns_df, test_size=0.2)

### Save Training and Test Data

In [None]:
pickle.dump(train_data, open('./data/train_test/final_training_data.pkl','wb'), protocol=4)

In [None]:
pickle.dump(test_data, open('./data/train_test/final_test_data.pkl','wb'), protocol=4)

# Training

## Create metrics df for storing performance of model+parameter

In [7]:
metrics_df = pd.DataFrame(columns=['model','parameters','step','cluster','accuracy',
                                   'precision','recall','f1score','roc_auc','time_taken',
                                   'train_size','pos_neg_ratio'])

# Training Binary Levels

## Model and Parameters

In [1]:
model_param = {'penalty':'l1','C':5**i}
clf_binary = LogisticRegression(**model_param)

NameError: name 'LogisticRegression' is not defined

## Full data set

In [None]:
if param['cluster'] == False:
    
    time_start = time.time()
    if param['verbose'] == True:
            print('Starting training for full data set: {}'.format(str(time_start)))
                  
    metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1score': [], 'roc_auc': []}

    skf = StratifiedKFold(n_splits=5, shuffle=False)
    X = train_data.drop(columns=['level_binary','level_max','cluster'])
    y = train_data['level_binary']
    skf.get_n_splits(X, y)

    i = 1

    for train_index, val_index in skf.split(X, y):

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        clf_binary.fit(X_train, y_train)
        predictions = clf_binary.predict(X_val)

        metrics['accuracy'].append(accuracy_score(y_val,predictions))
        metrics['precision'].append(precision_score(y_val,predictions,average='binary'))
        metrics['recall'].append(recall_score(y_val,predictions,average='binary'))
        metrics['f1score'].append(f1_score(y_val,predictions,average='binary'))
        metrics['roc_auc'].append(roc_auc_score(y_val,predictions))

        if param['verbose'] == True:
            print('Cross validation set {} complete'.format(str(i)))

        i+=1

    time_end = time.time()
    time_taken = time_end-time_start
    if param['verbose'] == True:
        print('Finishing training for full data set: {}'.format(str(time_end)))

    metrics_aggregated = {
                'version': version,
                'experiment': experiment,
                'model': model,
                'pos_neg_ratio': train_data.loc[train_data['level_binary'] == 1].shape[0]/train_data.shape[0],
                'parameters': str(parameters),
                'step': 'Binary',
                'cluster': 'All',
                'accuracy': np.mean(metrics['accuracy']),
                'precision': np.mean(metrics['precision']),
                'recall': np.mean(metrics['recall']),
                'f1score': np.mean(metrics['f1score']),
                'roc_auc': np.mean(metrics['roc_auc']),
                'time_taken': time_taken
                }
    
    metrics_df = metrics_df.append(pd.DataFrame(metrics_aggregated, index=[0]))
              

## Clustered data

In [None]:
if param['cluster'] == True:
    
    if param['verbose'] == True:
        print('Starting training: {}'.format(str(time.time())))
    
    for cluster in train_data['cluster'].unique():
        
        time_start = time.time()
        if param['verbose'] == True:
            print('Starting training for cluster {}: {}'.format(str(cluster),str(time_start)))
        
        metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1score': [], 'roc_auc': []}

        cluster_train = train_data.loc[train_data['cluster'] == cluster]
        skf = StratifiedKFold(n_splits=5, shuffle=False)
        X = cluster_train.drop(columns=['level_binary','level_max','cluster'])
        y = cluster_train['level_binary']
        skf.get_n_splits(X, y)

        i = 1

        for train_index, val_index in skf.split(X, y):

            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            clf = LogisticRegression(**parameters)
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_val)

            metrics['accuracy'].append(accuracy_score(y_val,predictions))
            metrics['precision'].append(precision_score(y_val,predictions,average='binary'))
            metrics['recall'].append(recall_score(y_val,predictions,average='binary'))
            metrics['f1score'].append(f1_score(y_val,predictions,average='binary'))
            metrics['roc_auc'].append(roc_auc_score(y_val,predictions))

            if param['verbose'] == True:
                print('Cross validation set {} complete'.format(str(i)))

            i+=1

        time_end = time.time()
        time_taken = time_end-time_start
        if param['verbose'] == True:
            print('Finishing training for cluster {}: {}'.format(str(cluster),str(time_start)))

        metrics_aggregated = {
                    'version': version,
                    'experiment': experiment,
                    'model': model,
                    'pos_neg_ratio': train_data.loc[(train_data['cluster'] == cluster) & (train_data['level_binary'] == 1)].shape[0] \
                                      /train_data.loc[(train_data['cluster'] == cluster)].shape[0],
                    'parameters': str(parameters),
                    'step': 'Binary',
                    'cluster': cluster,
                    'accuracy': np.mean(metrics['accuracy']),
                    'precision': np.mean(metrics['precision']),
                    'recall': np.mean(metrics['recall']),
                    'f1score': np.mean(metrics['f1score']),
                    'roc_auc': np.mean(metrics['roc_auc']),
                    'time_taken': time_taken
                    }
        
        metrics_df = metrics_df.append(pd.DataFrame(metrics_aggregated, index=[0]))

        if verbose == True:
            print('Cluster {} complete'.format(str(cluster)))
    
    if param['verbose'] == True:
        print('Training complete for all clusters: {}'.format(str(time.time())))

# Training Multi Levels

In [None]:
model_param = {'penalty':'l1','C':5**i}
clf_binary = LogisticRegression(**model_param)

## Full data set

In [None]:
if param['cluster'] == 'False':
    
    time_start = time.time()
    if param['verbose'] == True:
            print('Starting training for full data set: {}'.format(str(time_start)))
    
    metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1score': [], 'roc_auc': []}

    traffic_only_train = train_data.loc[train_data['level_binary'] == 1]
    skf = StratifiedKFold(n_splits=5, shuffle=False)
    X = traffic_only_train.drop(columns=['level_binary','level_max','cluster'])
    y = traffic_only_train['level_max']
    skf.get_n_splits(X, y)

    i = 1

    for train_index, val_index in skf.split(X, y):

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        clf = LogisticRegression(**parameters)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_val)

        metrics['accuracy'].append(accuracy_score(y_val,predictions))
        metrics['precision'].append(precision_score(y_val,predictions,average='weighted'))
        metrics['recall'].append(recall_score(y_val,predictions,average='weighted'))
        metrics['f1score'].append(f1_score(y_val,predictions,average='weighted'))
        metrics['roc_auc'].append(roc_auc_score(y_val,predictions))

        if verbose == True:
            print('Cross validation set {} complete'.format(str(i)))

        i+=1

    time_end = time.time()
    time_taken = time_end-time_start
    if param['verbose'] == True:
        print('Finishing training for full data set: {}'.format(str(time_end)))

    metrics_aggregated = {
                'version': version,
                'experiment': experiment,
                'model': model,
                'pos_neg_ratio': train_data.loc[train_data['level_binary'] == 1].shape[0]/train_data.shape[0],
                'train_size': train_data.loc[(train_data['cluster'] == cluster)].shape[0],
                'parameters': str(parameters),
                'step': 'Multi',
                'cluster': cluster,
                'accuracy': np.mean(metrics['accuracy']),
                'precision': np.mean(metrics['precision']),
                'recall': np.mean(metrics['recall']),
                'f1score': np.mean(metrics['f1score']),
                'roc_auc': np.mean(metrics['roc_auc']),
                'time_taken': time_taken
                }
    
    metrics_df = metrics_df.append(pd.DataFrame(metrics_aggregated, index=[0]))

## Clustered data

In [2]:
if param['cluster'] == 'True':
    
    if param['verbose'] == True:
        print('Starting training: {}'.format(str(time.time())))
        
    for cluster in train_data['cluster'].unique():

        time_start = time.time()
        if param['verbose'] == True:
            print('Starting training for cluster {}: {}'.format(str(cluster),str(time_start))
                  
        metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1score': [], 'roc_auc': []}

        traffic_only_train = train_data.loc[train_data['level_binary'] == 1]
        skf = StratifiedKFold(n_splits=5, shuffle=False)
        X = traffic_only_train.drop(columns=['level_binary','level_max','cluster'])
        y = traffic_only_train['level_max']
        skf.get_n_splits(X, y)

        i = 1

        for train_index, val_index in skf.split(X, y):

            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            clf = LogisticRegression(**parameters)
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_val)

            metrics['accuracy'].append(accuracy_score(y_val,predictions))
            metrics['precision'].append(precision_score(y_val,predictions,average='weighted'))
            metrics['recall'].append(recall_score(y_val,predictions,average='weighted'))
            metrics['f1score'].append(f1_score(y_val,predictions,average='weighted'))
            metrics['roc_auc'].append(roc_auc_score(y_val,predictions))

            if verbose == True:
                print('Cross validation set {} complete'.format(str(i)))

            i+=1

        time_end = time.time()
        time_taken = time_end-time_start
        if param['verbose'] == True:
            print('Finishing training for cluster {}: {}'.format(str(cluster),str(time_start)))
              
        metrics_aggregated = {
                    'version': version,
                    'experiment': experiment,
                    'model': model,
                    'pos_neg_ratio': train_data.loc[train_data['level_binary'] == 1].shape[0]/train_data.shape[0],
                    'train_size': train_data.loc[(train_data['cluster'] == cluster)].shape[0],
                    'parameters': str(parameters),
                    'step': 'Multi',
                    'cluster': cluster,
                    'accuracy': np.mean(metrics['accuracy']),
                    'precision': np.mean(metrics['precision']),
                    'recall': np.mean(metrics['recall']),
                    'f1score': np.mean(metrics['f1score']),
                    'roc_auc': np.mean(metrics['roc_auc']),
                    'time_taken': time_taken
                    }
        
        metrics_df = metrics_df.append(pd.DataFrame(metrics_aggregated, index=[0]))
    
    if param['verbose'] == True:
        print('Training complete for all clusters: {}'.format(str(time.time())))

SyntaxError: invalid syntax (<ipython-input-2-1879a7b4a30d>, line 8)

# Evaluation

In [10]:
metrics_df = metrics_df[['version','experiment','model','parameters','step','cluster','accuracy','precision',
                         'recall','f1score','roc_auc','time_taken','train_size','pos_neg_ratio']]

In [11]:
metrics_df

Unnamed: 0,model,parameters,step,cluster,accuracy,precision,recall,f1score,roc_auc,time_taken,pos_neg_ratio
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,2,0.904119,0.780561,0.705529,0.741142,0.828808,14.652998,0.194552
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,4,0.868758,0.819867,0.857426,0.838213,0.866814,3.146999,0.396509
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,6,0.919781,0.84588,0.815654,0.830486,0.884242,34.213467,0.240924
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,1,0.89017,0.786751,0.779826,0.78327,0.853833,22.282028,0.2545
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,0,0.871656,0.812069,0.815681,0.813864,0.858345,3.515044,0.344001
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,7,0.919446,0.762359,0.663767,0.709636,0.813865,8.898,0.148295
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,5,0.948514,0.870717,0.918224,0.893817,0.938047,5.496938,0.23601
0,XGBoost,"{'max_depth': 5, 'objective': 'binary:logistic...",Binary,3,0.880551,0.821161,0.834834,0.827928,0.869692,12.044033,0.344227
0,XGBoost,"{'max_depth': 6, 'objective': 'binary:logistic...",Binary,2,0.90698,0.789202,0.712085,0.748658,0.83307,16.35305,0.194552
0,XGBoost,"{'max_depth': 6, 'objective': 'binary:logistic...",Binary,4,0.872291,0.825248,0.860072,0.842293,0.870196,3.522,0.396509


## Save to Database

In [None]:
metrics_df.to_sql(name='metrics', con=conn, if_exists='append', index=False))