In [78]:
import itertools
import pickle

import pandas as pd
from sklearn.model_selection import KFold
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Tune and Train the NN
Similar to the previous steps, but using slightly different tools from those used for SVM and RF tuning.

Feature extractors include:
<ul>
    <li>count vectorizer</li>
    <li>TF-IDF vectorizer</li>
</ul>

Classifiers include:
<ul>
    <li>support vector machine</li>
    <li>random forest</li>
    <li>neural net</li>
</ul>


## Feature Extractors
### Count vectorizer
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html<br/>
Hyperparameters to tune:
<ul>
    <li>ngram_range</li>
    <li>max_df</li>
    <li>min_df</li>
</ul>

### TF-IDF vectorizer
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html<br/>
Hyperparameters to tune:
<ul>
    <li>ngram_range</li>
    <li>max_df</li>
    <li>min_df</li>
</ul>

Note: I intended to tune max_features as well but forgot to pass it to the vectorizer call. This may be revisited later.

## Classifiers
### Neural Net
Reference: This classifier uses TensorFlow (via Keras) rather than scikit-learn. Features are still extracted with scikit-learn's vectorizers, and the resulting vectors are then fed to the network.<br/>
Hyperparameters to tune:
<ul>
    <li>number of nodes in hidden layer</li>
    <li>optimizer</li>
</ul>


In [2]:
# Read the stopword collection and the pickled training frame from ../data.
with open('../data/stopwords.pkl', 'rb') as stopwords_file:
    stopwords = pickle.load(stopwords_file)

# Keep only the first 500 rows so that tuning runs quickly while testing.
train_data = pd.read_pickle('../data/train_data.pkl')[:500]

In [92]:
class TuneNeuralNet(object):
    """Cross-validated grid search for a one-hidden-layer Keras text classifier.

    Every hyperparameter combination is evaluated with K-fold cross-validation.
    For each fold the text is vectorized (count or TF-IDF, fitted on the
    training fold only), a small dense network is trained with early stopping,
    and accuracy, recall, precision (PPV) and ROC AUC are recorded for both the
    training and the validation rows. Results accumulate in ``self.cv_scores``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide a ``'text'`` column and a ``'label'`` column. Labels are
        presumably binary 0/1 (the network ends in a single sigmoid unit) --
        TODO confirm against the data preparation step.
    cv_num : int
        Number of cross-validation folds.
    stopwords
        Forwarded unchanged as ``stop_words`` to the vectorizer.
    title : str
        Tag used in the name of the intermediate CSV written after every fold.
    random_state : int or None, optional
        Seed forwarded to ``KFold`` so the shuffled folds can be reproduced.
        The default ``None`` keeps the previous, unseeded behaviour.
    """

    def __init__(self, train_data, cv_num, stopwords, title, random_state=None):
        self.data = train_data
        self.stopwords = stopwords
        self.title = title
        self.k_folds = KFold(n_splits=cv_num, shuffle=True, random_state=random_state)
        self.cv_scores = pd.DataFrame()

    def tune_parameters(self, params, vector):
        """Run cross-validation for every combination in ``params``.

        Parameters
        ----------
        params : dict
            Must contain the keys ``'ngram_range'``, ``'max_df'``, ``'min_df'``,
            ``'h1_nodes'`` and ``'optimizer'``, each mapping to an iterable of
            candidate values.
        vector : str
            ``'count'`` selects ``CountVectorizer``; any other value selects
            ``TfidfVectorizer``.

        Raises
        ------
        KeyError
            If one of the required keys is missing from ``params``.
        """
        # product() varies the last argument fastest, i.e. the same order as
        # nesting the loops ngram_range > max_df > min_df > h1_nodes > optimizer.
        grid = itertools.product(params['ngram_range'],
                                 params['max_df'],
                                 params['min_df'],
                                 params['h1_nodes'],
                                 params['optimizer'])
        for ngram, mx, mn, h1, opt in grid:
            self.run_cv(ngram, mx, mn, h1, opt, vector)
        return None

    def save_scores_csv(self, title):
        """Write the accumulated scores to ``../results/tuning/<title>_tuning.csv``."""
        self.cv_scores.to_csv('../results/tuning/%s_tuning.csv' %title)
        return None

    def run_cv(self, ngram_range, max_df, min_df, h1_nodes, optimizer, vector):
        """Train and score one hyperparameter combination on every fold.

        After each fold the train/validation scores are appended to
        ``self.cv_scores`` and an intermediate CSV named ``temp_<title>`` is
        written, so partial results survive an interrupted run.

        If the vectorizer rejects the settings with a ``ValueError`` the
        combination is reported and abandoned (remaining folds are not run).
        Any other exception propagates to the caller.

        Returns
        -------
        None
        """
        vectorizer_cls = CountVectorizer if vector == 'count' else TfidfVectorizer

        for fold, (train_index, val_index) in enumerate(self.k_folds.split(self.data), start=1):
            print(fold)
            train_rows = self.data.iloc[train_index]
            val_rows = self.data.iloc[val_index]
            X_train, y_train = train_rows['text'].values, train_rows['label'].values
            X_val, y_val = val_rows['text'].values, val_rows['label'].values

            vectorizer = vectorizer_cls(ngram_range=ngram_range,
                                        max_df=max_df,
                                        min_df=min_df,
                                        stop_words=self.stopwords)
            try:
                # Fit on the training fold only; the validation fold is merely
                # transformed so no vocabulary information leaks from it.
                X_train_vec = vectorizer.fit_transform(X_train)
                X_val_vec = vectorizer.transform(X_val)
            except ValueError as err:
                # Best effort: an unusable vectorizer setting skips this
                # combination instead of stopping the whole search, but it is
                # reported rather than silently ignored.
                print('Skipping ngram_range=%s, max_df=%s, min_df=%s for %s vectorizer: %s'
                      % (ngram_range, max_df, min_df, vector, err))
                return None

            n_dim = X_train_vec.shape[1]
            early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=3)

            model = Sequential()
            model.add(Dense(h1_nodes, activation='relu', input_dim=n_dim))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
            # epochs is only an upper bound; early stopping on the internal 20%
            # validation split ends training once val_loss stops improving.
            # NOTE(review): the vectorizer output is passed to Keras as-is --
            # confirm the installed Keras accepts that matrix type here.
            model.fit(X_train_vec, y_train, epochs=3000, validation_split=0.2, batch_size=100,
                      callbacks=[early_stopping_monitor], verbose=0)

            # Class labels are derived by thresholding the sigmoid output at
            # 0.5. This replaces Sequential.predict_classes, which is not
            # available in current Keras releases, and avoids a second forward
            # pass per data split.
            y_train_prob = model.predict(X_train_vec).flatten()
            y_train_pred = (y_train_prob > 0.5).astype('int32')
            train_scores = self.evaluate_cv_results(y_train, y_train_pred, y_train_prob,
                                                    ngram_range, max_df, min_df,
                                                    h1_nodes, optimizer)

            y_val_prob = model.predict(X_val_vec).flatten()
            y_val_pred = (y_val_prob > 0.5).astype('int32')
            val_scores = self.evaluate_cv_results(y_val, y_val_pred, y_val_prob,
                                                  ngram_range, max_df, min_df,
                                                  h1_nodes, optimizer)

            eval_df = self.create_scores_dataframe(train_scores, val_scores, fold, vector)
            # Avoid concatenating onto the initial empty frame.
            if self.cv_scores.empty:
                self.cv_scores = eval_df
            else:
                self.cv_scores = pd.concat([self.cv_scores, eval_df])
            self.save_scores_csv('temp_%s' %self.title)
        return None

    def evaluate_cv_results(self, y_true, y_pred, y_prob, ngram_range, max_df, min_df,
                            h1_nodes, optimizer):
        """Score one set of predictions and tag it with its hyperparameters.

        Returns
        -------
        dict
            Maps each column name to a one-element list (one row once turned
            into a DataFrame): the five hyperparameters plus ``'Acc'``,
            ``'recall'``, ``'PPV'`` (precision) and ``'AUC'``. AUC is computed
            from ``y_prob``; the other metrics from ``y_pred``.
        """
        return {
            'ngram_range': [ngram_range],
            'max_df': [max_df],
            'min_df': [min_df],
            'h1_nodes': [h1_nodes],
            'optimizer': [optimizer],
            'Acc': [accuracy_score(y_true, y_pred)],
            'recall': [recall_score(y_true, y_pred)],
            'PPV': [precision_score(y_true, y_pred)],
            'AUC': [roc_auc_score(y_true, y_prob)],
        }

    def create_scores_dataframe(self, train_dict, val_dict, fold, vector):
        """Stack train and validation scores into one labelled DataFrame.

        Each score dict becomes a frame with added ``'dataset'`` (``'train'`` or
        ``'val'``), ``'fold'`` and ``'vector'`` columns; the train rows come
        first and the index is reset.
        """
        frames = []
        for dataset, scores in (('train', train_dict), ('val', val_dict)):
            frame = pd.DataFrame(scores)
            frame['dataset'] = dataset
            frame['fold'] = fold
            frame['vector'] = vector
            frames.append(frame)
        return pd.concat(frames).reset_index(drop=True)
    

In [93]:
# Full search grid, kept for reference. It needs numpy available as `np`,
# which the import cell shown in this notebook does not provide.
# nn_params = {
#     'ngram_range':[(1,1),(1,2),(2,2)],
#     'max_df':np.linspace(0, 1, 5),
#     'min_df':np.linspace(0, 1, 5),
#     'h1_nodes':[128, 512, 1024, 2048, 3200],
#     'optimizer':['Adam','RMSprop','Adadelta']
# }

# Single-combination grid for a quick end-to-end check of the tuning code.
nn_params = dict(
    ngram_range=[(1, 1)],
    max_df=[0.5],
    min_df=[0],
    h1_nodes=[128],
    optimizer=['Adam'],
)

# Three-fold CV over the grid, once per vectorizer type, then save all scores.
tune_nn = TuneNeuralNet(train_data, 3, stopwords, 'nn')
for vector_kind in ('count', 'tfidf'):
    tune_nn.tune_parameters(nn_params, vector_kind)
tune_nn.save_scores_csv('nn')

1
2
3
1
2
3
