In [1]:
import itertools
import pickle
import warnings

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


# Tune the Classifier
In this step, hyperparameters of the feature extractors and several classifiers will be tuned using 5-fold cross validation. 

**Note:** See the scripts in the "4.tune_classifier" folder for the script implementation. The full tuning was run as a Python script on the CHPC clusters because this step is computationally expensive and time-consuming.

Feature extractors include:
<ul>
    <li>Count vectorizer</li>
    <li>TF-IDF vectorizer</li>
</ul>

Classifiers include:
<ul>
    <li>support vector machine</li>
    <li>random forest</li>
    <li>neural net</li>
</ul>


## Feature Extractors
### Count vectorizer
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html<br/>
Hyperparameters to tune:
<ul>
    <li>ngram_range</li>
    <li>max_df</li>
    <li>min_df</li>
</ul>

### TF-IDF vectorizer
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html<br/>
Hyperparameters to tune:
<ul>
    <li>ngram_range</li>
    <li>max_df</li>
    <li>min_df</li>
</ul>

Note: `max_features` was also meant to be tuned, but it was never passed to the vectorizer call, so it currently has no effect. This may be revisited later.

## Classifiers
### Support Vector Machine
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html<br/>
Hyperparameters to tune:
<ul>
    <li>C</li>
    <li>kernel</li>
    <li>degree</li>
    <li>gamma</li>
</ul>

### Random Forest
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html<br/>
Hyperparameters to tune:
<ul>
    <li>n_estimators</li>
    <li>criterion</li>
    <li>max_depth</li>
</ul>

### Neural Net
This classifier uses Keras with a TensorFlow backend rather than scikit-learn. The features are still extracted with the scikit-learn vectorizers, and the resulting vectors are then fed to Keras.<br/>
Hyperparameters to tune:
<ul>
    <li>number of nodes in hidden layer</li>
    <li>optimizer</li>
</ul>


In [2]:
# Stop-word list handed to the vectorizers below.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('../data/stopwords.pkl', 'rb') as stopwords_file:
    stopwords = pickle.load(stopwords_file)

# Training data, cut down to its first 500 entries so the demo cells run quickly.
train_data = pd.read_pickle('../data/train_data.pkl')[:500]

In [3]:
class TuneSVM(object):
    """Grid-search vectorizer settings and the SVC penalty ``C`` for one kernel.

    Every parameter combination is evaluated with shuffled k-fold cross
    validation. Train and validation scores of each fold are appended to
    ``self.cv_scores`` and checkpointed to
    ``../results/tuning/temp_<title>_tuning.csv`` after every fold.
    """

    def __init__(self, train_data, kernel, cv_num, stopwords, title, random_state=None):
        """Store the tuning configuration and build the fold splitter.

        Parameters
        ----------
        train_data : object indexable with ``.iloc`` that provides ``'text'``
            and ``'label'`` columns (e.g. a pandas DataFrame).
        kernel : value forwarded to ``SVC(kernel=...)``.
        cv_num : number of cross-validation folds.
        stopwords : value forwarded to the vectorizer's ``stop_words``.
        title : name used for the per-fold checkpoint CSV.
        random_state : optional seed forwarded to ``KFold`` and ``SVC``.
            The default ``None`` keeps the original unseeded behaviour.
        """
        self.data = train_data
        self.kernel = kernel
        self.stopwords = stopwords
        self.title = title
        self.random_state = random_state
        self.k_folds = KFold(n_splits=cv_num, shuffle=True, random_state=random_state)
        self.cv_scores = pd.DataFrame()

    def tune_parameters(self, params, vector):
        """Run cross validation for every combination in ``params``.

        ``params`` must hold iterables under the keys ``'ngram_range'``,
        ``'max_df'``, ``'min_df'`` and ``'C'``. ``vector`` selects the feature
        extractor: ``'count'`` for CountVectorizer, anything else for
        TfidfVectorizer.
        """
        grid = itertools.product(params['ngram_range'], params['max_df'],
                                 params['min_df'], params['C'])
        # product() varies the right-most factor fastest, i.e. the same order
        # as nesting the loops ngram_range > max_df > min_df > C.
        for ngram_range, max_df, min_df, C in grid:
            self.run_cv(ngram_range, max_df, min_df, C, vector)
        return None

    def save_scores_csv(self, title):
        """Write all scores collected so far to ``../results/tuning/<title>_tuning.csv``."""
        self.cv_scores.to_csv('../results/tuning/%s_tuning.csv' % title)
        return None

    def run_cv(self, ngram_range, max_df, min_df, C, vector):
        """Evaluate one parameter combination on every fold.

        If the vectorizer rejects the combination with a ``ValueError`` the
        combination is skipped with a warning and the remaining folds are not
        run. Any other exception propagates to the caller.
        """
        vectorizer_cls = CountVectorizer if vector == 'count' else TfidfVectorizer

        for fold, (train_index, val_index) in enumerate(self.k_folds.split(self.data), start=1):
            train_rows = self.data.iloc[train_index]
            val_rows = self.data.iloc[val_index]
            X_train = train_rows['text'].values
            y_train = train_rows['label'].values
            X_val = val_rows['text'].values
            y_val = val_rows['label'].values

            vectorizer = vectorizer_cls(ngram_range=ngram_range,
                                        max_df=max_df,
                                        min_df=min_df,
                                        stop_words=self.stopwords)

            try:
                # Fit the vocabulary on the training part only.
                X_train_vec = vectorizer.fit_transform(X_train)
                X_val_vec = vectorizer.transform(X_val)
            except ValueError as err:
                # Best-effort grid search: skip this combination, but say so
                # instead of hiding the failure.
                warnings.warn('Skipping ngram_range=%s, max_df=%s, min_df=%s (%s vectorizer): %s'
                              % (ngram_range, max_df, min_df, vector, err))
                return None

            clf = SVC(C=C, kernel=self.kernel, probability=True, gamma='scale',
                      random_state=self.random_state)
            clf.fit(X_train_vec, y_train)

            # Column 1 of predict_proba is used as the positive-class score
            # (assumes two classes -- TODO confirm against the label column).
            y_train_pred = clf.predict(X_train_vec)
            y_train_prob = clf.predict_proba(X_train_vec)[:, 1]
            train_scores = self.evaluate_cv_results(y_train, y_train_pred, y_train_prob,
                                                    ngram_range, max_df, min_df, C)

            y_val_pred = clf.predict(X_val_vec)
            y_val_prob = clf.predict_proba(X_val_vec)[:, 1]
            val_scores = self.evaluate_cv_results(y_val, y_val_pred, y_val_prob,
                                                  ngram_range, max_df, min_df, C)

            eval_df = self.create_scores_dataframe(train_scores, val_scores, fold, vector)
            # Record the kernel so results from several kernels can be told
            # apart once their CSVs are combined.
            eval_df['kernel'] = self.kernel
            self.cv_scores = pd.concat([self.cv_scores, eval_df])
            # Checkpoint after every fold so partial results survive a crash.
            self.save_scores_csv('temp_%s' % self.title)
        return None

    def evaluate_cv_results(self, y_true, y_pred, y_prob, ngram_range, max_df, min_df, C):
        """Return a one-row dict of lists with the parameters and the four scores.

        Scores are accuracy (``'Acc'``), recall, precision (``'PPV'``) and
        ROC AUC, the last one computed from ``y_prob``.
        """
        return {
            'ngram_range': [ngram_range],
            'max_df': [max_df],
            'min_df': [min_df],
            'C': [C],
            'Acc': [accuracy_score(y_true, y_pred)],
            'recall': [recall_score(y_true, y_pred)],
            'PPV': [precision_score(y_true, y_pred)],
            'AUC': [roc_auc_score(y_true, y_prob)],
        }

    def create_scores_dataframe(self, train_dict, val_dict, fold, vector):
        """Stack the train and validation score dicts into one labelled DataFrame.

        Adds the columns ``'dataset'`` (``'train'``/``'val'``), ``'fold'`` and
        ``'vector'``; train rows come first and the index is reset.
        """
        labelled = []
        for dataset, score_dict in (('train', train_dict), ('val', val_dict)):
            part = pd.DataFrame(score_dict)
            part['dataset'] = dataset
            part['fold'] = fold
            part['vector'] = vector
            labelled.append(part)
        return pd.concat(labelled).reset_index(drop=True)

In [4]:
# Full grid, kept for reference (presumably the one used for the real run).
# It needs numpy as `np`, which the visible import cell does not import.
# svm_params = {
#     'ngram_range':[(1,1),(1,2),(2,2)],
#     'max_df':np.linspace(0, 1, 5),
#     'min_df':np.linspace(0, 1, 5),
#     'C':np.linspace(0.01, 5, 5)
# }

# Single-point grid so the demo finishes quickly.
svm_params = dict(ngram_range=[(1, 1)], max_df=[0.5], min_df=[0], C=[1.0])

# Polynomial kernel
tune_psvm = TuneSVM(train_data, 'poly', 3, stopwords, 'psvm_demo')
for vector_kind in ('count', 'tfidf'):
    tune_psvm.tune_parameters(svm_params, vector_kind)
tune_psvm.save_scores_csv('psvm_demo')

# Linear kernel
tune_lsvm = TuneSVM(train_data, 'linear', 3, stopwords, 'lsvm_demo')
for vector_kind in ('count', 'tfidf'):
    tune_lsvm.tune_parameters(svm_params, vector_kind)
tune_lsvm.save_scores_csv('lsvm_demo')

# RBF kernel
tune_rsvm = TuneSVM(train_data, 'rbf', 3, stopwords, 'rsvm_demo')
for vector_kind in ('count', 'tfidf'):
    tune_rsvm.tune_parameters(svm_params, vector_kind)
tune_rsvm.save_scores_csv('rsvm_demo')

In [5]:
class TuneRandomForest(object):
    """Grid-search vectorizer settings and random-forest hyperparameters.

    Every parameter combination is evaluated with shuffled k-fold cross
    validation. Train and validation scores of each fold are appended to
    ``self.cv_scores`` and checkpointed to
    ``../results/tuning/temp_<title>_tuning.csv`` after every fold.
    """

    def __init__(self, train_data, cv_num, stopwords, title, random_state=None):
        """Store the tuning configuration and build the fold splitter.

        Parameters
        ----------
        train_data : object indexable with ``.iloc`` that provides ``'text'``
            and ``'label'`` columns (e.g. a pandas DataFrame).
        cv_num : number of cross-validation folds.
        stopwords : value forwarded to the vectorizer's ``stop_words``.
        title : name used for the per-fold checkpoint CSV.
        random_state : optional seed forwarded to ``KFold`` and
            ``RandomForestClassifier``. The default ``None`` keeps the
            original unseeded behaviour.
        """
        self.data = train_data
        self.stopwords = stopwords
        self.title = title
        self.random_state = random_state
        self.k_folds = KFold(n_splits=cv_num, shuffle=True, random_state=random_state)
        self.cv_scores = pd.DataFrame()

    def tune_parameters(self, params, vector):
        """Run cross validation for every combination in ``params``.

        ``params`` must hold iterables under the keys ``'ngram_range'``,
        ``'max_df'``, ``'min_df'``, ``'n_estimators'``, ``'criterion'`` and
        ``'max_depth'``; other keys are ignored. ``vector`` selects the
        feature extractor: ``'count'`` for CountVectorizer, anything else for
        TfidfVectorizer.
        """
        grid = itertools.product(params['ngram_range'], params['max_df'], params['min_df'],
                                 params['n_estimators'], params['criterion'],
                                 params['max_depth'])
        # product() varies the right-most factor fastest, matching the order
        # of the equivalent nested loops.
        for ngram_range, max_df, min_df, n_estimators, criterion, max_depth in grid:
            self.run_cv(ngram_range, max_df, min_df, n_estimators, criterion, max_depth, vector)
        return None

    def save_scores_csv(self, title):
        """Write all scores collected so far to ``../results/tuning/<title>_tuning.csv``."""
        self.cv_scores.to_csv('../results/tuning/%s_tuning.csv' % title)
        return None

    def run_cv(self, ngram_range, max_df, min_df, n_estimators, criterion, max_depth, vector):
        """Evaluate one parameter combination on every fold.

        If the vectorizer rejects the combination with a ``ValueError`` the
        combination is skipped with a warning and the remaining folds are not
        run. Any other exception propagates to the caller.
        """
        vectorizer_cls = CountVectorizer if vector == 'count' else TfidfVectorizer

        for fold, (train_index, val_index) in enumerate(self.k_folds.split(self.data), start=1):
            train_rows = self.data.iloc[train_index]
            val_rows = self.data.iloc[val_index]
            X_train = train_rows['text'].values
            y_train = train_rows['label'].values
            X_val = val_rows['text'].values
            y_val = val_rows['label'].values

            vectorizer = vectorizer_cls(ngram_range=ngram_range,
                                        max_df=max_df,
                                        min_df=min_df,
                                        stop_words=self.stopwords)

            try:
                # Fit the vocabulary on the training part only.
                X_train_vec = vectorizer.fit_transform(X_train)
                X_val_vec = vectorizer.transform(X_val)
            except ValueError as err:
                # Best-effort grid search: skip this combination, but say so
                # instead of hiding the failure.
                warnings.warn('Skipping ngram_range=%s, max_df=%s, min_df=%s (%s vectorizer): %s'
                              % (ngram_range, max_df, min_df, vector, err))
                return None

            clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                         max_depth=max_depth, random_state=self.random_state)
            clf.fit(X_train_vec, y_train)

            # Column 1 of predict_proba is used as the positive-class score
            # (assumes two classes -- TODO confirm against the label column).
            y_train_pred = clf.predict(X_train_vec)
            y_train_prob = clf.predict_proba(X_train_vec)[:, 1]
            train_scores = self.evaluate_cv_results(y_train, y_train_pred, y_train_prob,
                                                    ngram_range, max_df, min_df,
                                                    n_estimators, criterion, max_depth)

            y_val_pred = clf.predict(X_val_vec)
            y_val_prob = clf.predict_proba(X_val_vec)[:, 1]
            val_scores = self.evaluate_cv_results(y_val, y_val_pred, y_val_prob,
                                                  ngram_range, max_df, min_df,
                                                  n_estimators, criterion, max_depth)

            eval_df = self.create_scores_dataframe(train_scores, val_scores, fold, vector)
            self.cv_scores = pd.concat([self.cv_scores, eval_df])
            # Checkpoint after every fold so partial results survive a crash.
            self.save_scores_csv('temp_%s' % self.title)
        return None

    def evaluate_cv_results(self, y_true, y_pred, y_prob, ngram_range, max_df, min_df,
                            n_estimators, criterion, max_depth):
        """Return a one-row dict of lists with the parameters and the four scores.

        Scores are accuracy (``'Acc'``), recall, precision (``'PPV'``) and
        ROC AUC, the last one computed from ``y_prob``.
        """
        return {
            'ngram_range': [ngram_range],
            'max_df': [max_df],
            'min_df': [min_df],
            'n_estimators': [n_estimators],
            'criterion': [criterion],
            'max_depth': [max_depth],
            'Acc': [accuracy_score(y_true, y_pred)],
            'recall': [recall_score(y_true, y_pred)],
            'PPV': [precision_score(y_true, y_pred)],
            'AUC': [roc_auc_score(y_true, y_prob)],
        }

    def create_scores_dataframe(self, train_dict, val_dict, fold, vector):
        """Stack the train and validation score dicts into one labelled DataFrame.

        Adds the columns ``'dataset'`` (``'train'``/``'val'``), ``'fold'`` and
        ``'vector'``; train rows come first and the index is reset.
        """
        labelled = []
        for dataset, score_dict in (('train', train_dict), ('val', val_dict)):
            part = pd.DataFrame(score_dict)
            part['dataset'] = dataset
            part['fold'] = fold
            part['vector'] = vector
            labelled.append(part)
        return pd.concat(labelled).reset_index(drop=True)
    

In [6]:
# Full grid, kept for reference (presumably the one used for the real run).
# It needs numpy as `np`, which the visible import cell does not import.
# rf_params = {
#     'ngram_range':[(1,1),(1,2),(2,2)],
#     'max_df':np.linspace(0, 1, 5),
#     'min_df':np.linspace(0, 1, 5),
#     'n_estimators':[10, 25, 50, 100, 300],
#     'criterion':['gini','entropy'],
#     'max_depth':[2, 10, 20, 50, 100]
# }

# Single-point grid so the demo finishes quickly.
# 'max_features' is carried along but TuneRandomForest.tune_parameters never reads it.
rf_params = dict(
    ngram_range=[(1, 1)],
    max_df=[0.5],
    min_df=[0],
    max_features=[None],
    n_estimators=[10],
    criterion=['gini'],
    max_depth=[2],
)

tune_rf = TuneRandomForest(train_data, 3, stopwords, 'rf_demo')
for vector_kind in ('count', 'tfidf'):
    tune_rf.tune_parameters(rf_params, vector_kind)
tune_rf.save_scores_csv('rf_demo')

In [7]:
class TuneNeuralNet(object):
    """Grid-search vectorizer settings and a one-hidden-layer Keras network.

    Every parameter combination is evaluated with shuffled k-fold cross
    validation. Train and validation scores of each fold are appended to
    ``self.cv_scores`` and checkpointed to
    ``../results/tuning/temp_<title>_tuning.csv`` after every fold.
    """

    def __init__(self, train_data, cv_num, stopwords, title, random_state=None):
        """Store the tuning configuration and build the fold splitter.

        Parameters
        ----------
        train_data : object indexable with ``.iloc`` that provides ``'text'``
            and ``'label'`` columns (e.g. a pandas DataFrame).
        cv_num : number of cross-validation folds.
        stopwords : value forwarded to the vectorizer's ``stop_words``.
        title : name used for the per-fold checkpoint CSV.
        random_state : optional seed forwarded to ``KFold`` only; it does not
            seed the network's weight initialisation or training. The default
            ``None`` keeps the original unseeded behaviour.
        """
        self.data = train_data
        self.stopwords = stopwords
        self.title = title
        self.random_state = random_state
        self.k_folds = KFold(n_splits=cv_num, shuffle=True, random_state=random_state)
        self.cv_scores = pd.DataFrame()

    def tune_parameters(self, params, vector):
        """Run cross validation for every combination in ``params``.

        ``params`` must hold iterables under the keys ``'ngram_range'``,
        ``'max_df'``, ``'min_df'``, ``'h1_nodes'`` and ``'optimizer'``.
        ``vector`` selects the feature extractor: ``'count'`` for
        CountVectorizer, anything else for TfidfVectorizer.
        """
        grid = itertools.product(params['ngram_range'], params['max_df'], params['min_df'],
                                 params['h1_nodes'], params['optimizer'])
        # product() varies the right-most factor fastest, matching the order
        # of the equivalent nested loops.
        for ngram_range, max_df, min_df, h1_nodes, optimizer in grid:
            self.run_cv(ngram_range, max_df, min_df, h1_nodes, optimizer, vector)
        return None

    def save_scores_csv(self, title):
        """Write all scores collected so far to ``../results/tuning/<title>_tuning.csv``."""
        self.cv_scores.to_csv('../results/tuning/%s_tuning.csv' % title)
        return None

    def run_cv(self, ngram_range, max_df, min_df, h1_nodes, optimizer, vector):
        """Evaluate one parameter combination on every fold.

        If the vectorizer rejects the combination with a ``ValueError`` the
        combination is skipped with a warning and the remaining folds are not
        run. Any other exception propagates to the caller.
        """
        vectorizer_cls = CountVectorizer if vector == 'count' else TfidfVectorizer

        for fold, (train_index, val_index) in enumerate(self.k_folds.split(self.data), start=1):
            train_rows = self.data.iloc[train_index]
            val_rows = self.data.iloc[val_index]
            X_train = train_rows['text'].values
            y_train = train_rows['label'].values
            X_val = val_rows['text'].values
            y_val = val_rows['label'].values

            vectorizer = vectorizer_cls(ngram_range=ngram_range,
                                        max_df=max_df,
                                        min_df=min_df,
                                        stop_words=self.stopwords)

            try:
                # Fit the vocabulary on the training part only.
                X_train_vec = vectorizer.fit_transform(X_train)
                X_val_vec = vectorizer.transform(X_val)
            except ValueError as err:
                # Best-effort grid search: skip this combination, but say so
                # instead of hiding the failure.
                warnings.warn('Skipping ngram_range=%s, max_df=%s, min_df=%s (%s vectorizer): %s'
                              % (ngram_range, max_df, min_df, vector, err))
                return None

            n_dim = X_train_vec.shape[1]
            # Stop once the loss on the internal 20% validation split has not
            # improved for 3 epochs; epochs=3000 is only an upper bound.
            early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=3)

            model = Sequential()
            model.add(Dense(h1_nodes, activation='relu', input_dim=n_dim))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
            model.fit(X_train_vec, y_train, epochs=3000, validation_split=0.2, batch_size=100,
                      callbacks=[early_stopping_monitor], verbose=0)

            # Class labels are derived by thresholding the sigmoid output at
            # 0.5. This replaces Sequential.predict_classes, which is not
            # available in recent Keras releases, and avoids a second forward
            # pass over the data.
            y_train_prob = model.predict(X_train_vec).flatten()
            y_train_pred = (y_train_prob > 0.5).astype('int32')
            train_scores = self.evaluate_cv_results(y_train, y_train_pred, y_train_prob,
                                                    ngram_range, max_df, min_df,
                                                    h1_nodes, optimizer)

            y_val_prob = model.predict(X_val_vec).flatten()
            y_val_pred = (y_val_prob > 0.5).astype('int32')
            val_scores = self.evaluate_cv_results(y_val, y_val_pred, y_val_prob,
                                                  ngram_range, max_df, min_df,
                                                  h1_nodes, optimizer)

            eval_df = self.create_scores_dataframe(train_scores, val_scores, fold, vector)
            self.cv_scores = pd.concat([self.cv_scores, eval_df])
            # Checkpoint after every fold so partial results survive a crash.
            self.save_scores_csv('temp_%s' % self.title)
        return None

    def evaluate_cv_results(self, y_true, y_pred, y_prob, ngram_range, max_df, min_df,
                            h1_nodes, optimizer):
        """Return a one-row dict of lists with the parameters and the four scores.

        Scores are accuracy (``'Acc'``), recall, precision (``'PPV'``) and
        ROC AUC, the last one computed from ``y_prob``.
        """
        return {
            'ngram_range': [ngram_range],
            'max_df': [max_df],
            'min_df': [min_df],
            'h1_nodes': [h1_nodes],
            'optimizer': [optimizer],
            'Acc': [accuracy_score(y_true, y_pred)],
            'recall': [recall_score(y_true, y_pred)],
            'PPV': [precision_score(y_true, y_pred)],
            'AUC': [roc_auc_score(y_true, y_prob)],
        }

    def create_scores_dataframe(self, train_dict, val_dict, fold, vector):
        """Stack the train and validation score dicts into one labelled DataFrame.

        Adds the columns ``'dataset'`` (``'train'``/``'val'``), ``'fold'`` and
        ``'vector'``; train rows come first and the index is reset.
        """
        labelled = []
        for dataset, score_dict in (('train', train_dict), ('val', val_dict)):
            part = pd.DataFrame(score_dict)
            part['dataset'] = dataset
            part['fold'] = fold
            part['vector'] = vector
            labelled.append(part)
        return pd.concat(labelled).reset_index(drop=True)
    

In [8]:
# Full grid, kept for reference (presumably the one used for the real run).
# It needs numpy as `np`, which the visible import cell does not import.
# nn_params = {
#     'ngram_range':[(1,1),(1,2),(2,2)],
#     'max_df':np.linspace(0, 1, 5),
#     'min_df':np.linspace(0, 1, 5),
#     'h1_nodes':[128, 512, 1024, 2048, 3200],
#     'optimizer':['Adam','RMSprop','Adadelta']
# }

# Single-point grid so the demo finishes quickly.
nn_params = dict(
    ngram_range=[(1, 1)],
    max_df=[0.5],
    min_df=[0],
    h1_nodes=[128],
    optimizer=['Adam'],
)

tune_nn = TuneNeuralNet(train_data, 3, stopwords, 'nn_demo')
for vector_kind in ('count', 'tfidf'):
    tune_nn.tune_parameters(nn_params, vector_kind)
tune_nn.save_scores_csv('nn_demo')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



## Choosing Parameters

In [9]:
rf = pd.read_csv('../results/tuning/rf_tuning.csv')
rf = rf.drop(columns='Unnamed: 0')

# Columns that identify one grid point; scores are averaged over the folds within each.
rf_param_cols = ['ngram_range','max_df','min_df','n_estimators','criterion','max_depth']


def _print_best_rf(header, best):
    """Print the hyperparameters and mean validation scores held in row 0 of `best`."""
    print(header)
    print('ngram_range=%s' %str(best['ngram_range'].values[0]))
    print('max_df=%f' %best['max_df'].values[0])
    print('min_df=%f' %best['min_df'].values[0])
    print('n_estimators=%d' %best['n_estimators'].values[0])
    print('criterion=%s' %best['criterion'].values[0])
    print('max_depth=%d' %best['max_depth'].values[0])
    print('Scores: Acc=%.04f, AUC=%.04f, Recall=%.04f, PPV=%.04f' %(best['Acc'].values[0],
                                                                    best['AUC'].values[0],
                                                                    best['recall'].values[0],
                                                                    best['PPV'].values[0]))


# Validation rows only. 'fold' is dropped so it is not averaged as if it were a score.
rf_count = rf.loc[(rf['vector'] == 'count') & (rf['dataset'] == 'val')].drop(columns='fold')
# numeric_only=True: the remaining string columns ('vector', 'dataset') cannot be
# averaged, and recent pandas versions raise on them instead of dropping them.
rf_count = rf_count.groupby(rf_param_cols).mean(numeric_only=True)
rf_count_acc = rf_count.sort_values('Acc', ascending=False).head(1).reset_index()
rf_count_auc = rf_count.sort_values('AUC', ascending=False).head(1).reset_index()
_print_best_rf('Random Forest - Count Vectorizer, Best parameters for accuracy:', rf_count_acc)
print('')

_print_best_rf('Random Forest - Count Vectorizer, Best parameters for AUC:', rf_count_auc)
print('')
print('')


rf_tfidf = rf.loc[(rf['vector'] == 'tfidf') & (rf['dataset'] == 'val')].drop(columns='fold')
rf_tfidf = rf_tfidf.groupby(rf_param_cols).mean(numeric_only=True)
rf_tfidf_acc = rf_tfidf.sort_values('Acc', ascending=False).head(1).reset_index()
rf_tfidf_auc = rf_tfidf.sort_values('AUC', ascending=False).head(1).reset_index()
_print_best_rf('Random Forest - TF-IDF Vectorizer, Best parameters for accuracy:', rf_tfidf_acc)
print('')

_print_best_rf('Random Forest - TF-IDF Vectorizer, Best parameters for AUC:', rf_tfidf_auc)

Random Forest - Count Vectorizer, Best parameters for accuracy:
ngram_range=(1, 1)
max_df=1.000000
min_df=0.000000
n_estimators=300
criterion=gini
max_depth=100
Scores: Acc=0.8601, AUC=0.9389, Recall=0.9313, PPV=0.8209

Random Forest - Count Vectorizer, Best parameters for AUC:
ngram_range=(1, 1)
max_df=0.750000
min_df=0.000000
n_estimators=300
criterion=entropy
max_depth=100
Scores: Acc=0.8597, AUC=0.9407, Recall=0.9353, PPV=0.8182


Random Forest - TF-IDF Vectorizer, Best parameters for accuracy:
ngram_range=(1, 1)
max_df=0.500000
min_df=0.000000
n_estimators=300
criterion=entropy
max_depth=100
Scores: Acc=0.8601, AUC=0.9403, Recall=0.9325, PPV=0.8203

Random Forest - TF-IDF Vectorizer, Best parameters for AUC:
ngram_range=(1, 1)
max_df=1.000000
min_df=0.000000
n_estimators=300
criterion=entropy
max_depth=100
Scores: Acc=0.8596, AUC=0.9405, Recall=0.9317, PPV=0.8201


### Random Forest:
For Count Vectorizer:
<ul>
    <li>ngram_range: (1, 1)</li>
    <li>max_df: 1.0</li>
    <li>min_df: 0</li>
    <li>n_estimators: 300</li>
    <li>criterion: gini</li>
    <li>max_depth: 100</li>
</ul>
<br/>
<br/>
For TF-IDF Vectorizer:
<ul>
    <li>ngram_range: (1, 1)</li>
    <li>max_df: 0.5</li>
    <li>min_df: 0</li>
    <li>n_estimators: 300</li>
    <li>criterion: entropy</li>
    <li>max_depth: 100</li>
</ul>

In [11]:
svm = pd.read_csv('../results/tuning/svm_tuning.csv')
svm = svm.drop(columns='Unnamed: 0')

# Columns that identify one grid point; scores are averaged over the folds within each.
svm_param_cols = ['ngram_range','max_df','min_df','C','kernel']


def _print_best_svm(header, best):
    """Print the hyperparameters and mean validation scores held in row 0 of `best`."""
    print(header)
    print('ngram_range=%s' %str(best['ngram_range'].values[0]))
    print('max_df=%f' %best['max_df'].values[0])
    print('min_df=%f' %best['min_df'].values[0])
    print('C=%f' %best['C'].values[0])
    print('kernel=%s' %best['kernel'].values[0])
    print('Scores: Acc=%.04f, AUC=%.04f, Recall=%.04f, PPV=%.04f' %(best['Acc'].values[0],
                                                                    best['AUC'].values[0],
                                                                    best['recall'].values[0],
                                                                    best['PPV'].values[0]))


# Validation rows only. 'fold' is dropped so it is not averaged as if it were a score.
svm_count = svm.loc[(svm['vector'] == 'count') & (svm['dataset'] == 'val')].drop(columns='fold')
# numeric_only=True: the remaining string columns ('vector', 'dataset') cannot be
# averaged, and recent pandas versions raise on them instead of dropping them.
svm_count = svm_count.groupby(svm_param_cols).mean(numeric_only=True)
svm_count_acc = svm_count.sort_values('Acc', ascending=False).head(1).reset_index()
svm_count_auc = svm_count.sort_values('AUC', ascending=False).head(1).reset_index()
_print_best_svm('SVM - Count Vectorizer, Best parameters for accuracy:', svm_count_acc)
print('')

_print_best_svm('SVM - Count Vectorizer, Best parameters for AUC:', svm_count_auc)
print('')
print('')


svm_tfidf = svm.loc[(svm['vector'] == 'tfidf') & (svm['dataset'] == 'val')].drop(columns='fold')
svm_tfidf = svm_tfidf.groupby(svm_param_cols).mean(numeric_only=True)
svm_tfidf_acc = svm_tfidf.sort_values('Acc', ascending=False).head(1).reset_index()
svm_tfidf_auc = svm_tfidf.sort_values('AUC', ascending=False).head(1).reset_index()
# These two headers previously said "Random Forest" although the rows are SVM results.
_print_best_svm('SVM - TF-IDF Vectorizer, Best parameters for accuracy:', svm_tfidf_acc)
print('')

_print_best_svm('SVM - TF-IDF Vectorizer, Best parameters for AUC:', svm_tfidf_auc)

SVM - Count Vectorizer, Best parameters for accuracy:
ngram_range=(1, 2)
max_df=0.500000
min_df=0.000000
C=0.010000
kernel=linear
Scores: Acc=0.8476, AUC=0.9392, Recall=0.9410, PPV=0.7988

SVM - Count Vectorizer, Best parameters for AUC:
ngram_range=(1, 2)
max_df=0.500000
min_df=0.000000
C=0.010000
kernel=linear
Scores: Acc=0.8476, AUC=0.9392, Recall=0.9410, PPV=0.7988


SVM - TF-IDF Vectorizer, Best parameters for accuracy:
ngram_range=(1, 2)
max_df=0.250000
min_df=0.000000
C=1.257500
kernel=linear
Scores: Acc=0.8720, AUC=0.9454, Recall=0.8962, PPV=0.8609

SVM - TF-IDF Vectorizer, Best parameters for AUC:
ngram_range=(1, 2)
max_df=0.250000
min_df=0.000000
C=1.257500
kernel=linear
Scores: Acc=0.8720, AUC=0.9454, Recall=0.8962, PPV=0.8609


### SVM:
For Count Vectorizer:
<ul>
    <li>ngram_range: (1, 2)</li>
    <li>max_df: 0.5</li>
    <li>min_df: 0</li>
    <li>kernel: linear</li>
    <li>C: 0.01</li>
</ul>
<br/>
<br/>
For TF-IDF Vectorizer:
<ul>
    <li>ngram_range: (1, 2)</li>
    <li>max_df: 0.25</li>
    <li>min_df: 0</li>
    <li>kernel: linear</li>
    <li>C: 1.2575</li>
</ul>

In [12]:
nn = pd.read_csv('../results/tuning/nn_tuning.csv')
nn = nn.drop(columns='Unnamed: 0')

# Columns that identify one grid point; scores are averaged over the folds within each.
nn_param_cols = ['ngram_range','max_df','min_df','h1_nodes','optimizer']


def _print_best_nn(header, best):
    """Print the hyperparameters and mean validation scores held in row 0 of `best`."""
    print(header)
    print('ngram_range=%s' %str(best['ngram_range'].values[0]))
    print('max_df=%f' %best['max_df'].values[0])
    print('min_df=%f' %best['min_df'].values[0])
    print('h1_nodes=%f' %best['h1_nodes'].values[0])
    print('optimizer=%s' %best['optimizer'].values[0])
    print('Scores: Acc=%.04f, AUC=%.04f, Recall=%.04f, PPV=%.04f' %(best['Acc'].values[0],
                                                                    best['AUC'].values[0],
                                                                    best['recall'].values[0],
                                                                    best['PPV'].values[0]))


# Validation rows only. 'fold' is dropped so it is not averaged as if it were a score.
nn_count = nn.loc[(nn['vector'] == 'count') & (nn['dataset'] == 'val')].drop(columns='fold')
# numeric_only=True: the remaining string columns ('vector', 'dataset') cannot be
# averaged, and recent pandas versions raise on them instead of dropping them.
nn_count = nn_count.groupby(nn_param_cols).mean(numeric_only=True)
nn_count_acc = nn_count.sort_values('Acc', ascending=False).head(1).reset_index()
nn_count_auc = nn_count.sort_values('AUC', ascending=False).head(1).reset_index()
_print_best_nn('NN - Count Vectorizer, Best parameters for accuracy:', nn_count_acc)
print('')

_print_best_nn('NN - Count Vectorizer, Best parameters for AUC:', nn_count_auc)
print('')
print('')


nn_tfidf = nn.loc[(nn['vector'] == 'tfidf') & (nn['dataset'] == 'val')].drop(columns='fold')
nn_tfidf = nn_tfidf.groupby(nn_param_cols).mean(numeric_only=True)
nn_tfidf_acc = nn_tfidf.sort_values('Acc', ascending=False).head(1).reset_index()
nn_tfidf_auc = nn_tfidf.sort_values('AUC', ascending=False).head(1).reset_index()
_print_best_nn('NN - TF-IDF Vectorizer, Best parameters for accuracy:', nn_tfidf_acc)
print('')

_print_best_nn('NN - TF-IDF Vectorizer, Best parameters for AUC:', nn_tfidf_auc)

NN - Count Vectorizer, Best parameters for accuracy:
ngram_range=(1, 2)
max_df=1.000000
min_df=0.000000
h1_nodes=128.000000
optimizer=Adadelta
Scores: Acc=0.8678, AUC=0.9440, Recall=0.8920, PPV=0.8570

NN - Count Vectorizer, Best parameters for AUC:
ngram_range=(1, 1)
max_df=0.500000
min_df=0.000000
h1_nodes=3200.000000
optimizer=Adadelta
Scores: Acc=0.8651, AUC=0.9448, Recall=0.8965, PPV=0.8502


NN - TF-IDF Vectorizer, Best parameters for accuracy:
ngram_range=(1, 2)
max_df=0.750000
min_df=0.000000
h1_nodes=128.000000
optimizer=Adadelta
Scores: Acc=0.8706, AUC=0.9475, Recall=0.8889, PPV=0.8643

NN - TF-IDF Vectorizer, Best parameters for AUC:
ngram_range=(1, 2)
max_df=1.000000
min_df=0.000000
h1_nodes=128.000000
optimizer=Adadelta
Scores: Acc=0.8671, AUC=0.9481, Recall=0.8602, PPV=0.8793


### NN:
For Count Vectorizer:
<ul>
    <li>ngram_range: (1, 2)</li>
    <li>max_df: 1.0</li>
    <li>min_df: 0</li>
    <li>h1_nodes: 128</li>
    <li>optimizer: Adadelta</li>
</ul>
<br/>
<br/>
For TF-IDF Vectorizer:
<ul>
    <li>ngram_range: (1, 2)</li>
    <li>max_df: 0.75</li>
    <li>min_df: 0</li>
    <li>h1_nodes: 128</li>
    <li>optimizer: Adadelta</li>
</ul>