# Training set - 17226 samples

In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV


from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords

In [None]:
#setup imports and paths
import os
import sys
from os.path import expanduser


# Resolve the current user's home directory and append <home>/packages to the
# module search path, so modules stored there can be imported by later cells.
HOME_DIR = expanduser("~")
sys.path.append(HOME_DIR+'/packages')

In [None]:
# Load the custom Midas tools and create the helper instance used by later cells.
# NOTE(review): `Midas` is a project-local package, presumably resolved through
# the <home>/packages entry appended to sys.path — confirm.
from Midas import Midas_helper
helper = Midas_helper()

In [None]:
import pandas as pd
# NOTE(review): cd_main_data() is defined in Midas_helper (not shown here);
# presumably it switches the working directory to the main data folder so the
# relative CSV path below resolves — confirm.
helper.cd_main_data()
# Load the main Midas labelled data table into a DataFrame.
df = pd.read_csv('midas_labeled_data_Q12018.csv')

In [None]:
def clean_note(text):
    """Return ``text`` with HTML tags blanked out and double quotes removed.

    Each ``<...>`` tag is replaced by a single space (so words on either side
    of a tag stay separated). Backslash-escaped quotes (``\\"``) and plain
    double quotes are then deleted outright.

    Args:
        text: The raw note as a string.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub('<[^<]+?>', ' ', text)

    # Order matters: drop the escaped form first so its backslash goes away
    # together with the quote, then drop any remaining bare quotes.
    for unwanted in ('\\"', '"'):
        cleaned = cleaned.replace(unwanted, '')

    return cleaned
 
# df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=3)
# Replace missing values across the whole table with the 'No Score' placeholder;
# this runs first so clean_note (which calls string methods) never receives NaN.
df.fillna('No Score', inplace=True)

# Strip markup and quotes from the free-text notes, element by element.
df['cleaned_note_unstructured'] = df['cleaned_note_unstructured'].map(clean_note)

# Encode each distinct label as an integer id, numbered in order of first appearance.
df['category_id'] = pd.factorize(df['midas_final_unstructured'])[0]

In [None]:
# Display the (rows, columns) dimensions of the table as a quick sanity check.
df.shape


In [None]:
# Build the lookup from numeric category_id to its original label:
# one row per distinct (label, id) pair, ordered and indexed by category_id.
midas_final_unstructured_id_df = (
    df[['midas_final_unstructured', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
    .set_index('category_id')
)

# {category_id: label}
class_dict = midas_final_unstructured_id_df['midas_final_unstructured'].to_dict()
class_dict

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# Features are the cleaned note texts, targets are the integer category ids.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_note_unstructured'],
    df['category_id'],
    test_size=0.2,
    random_state=2019,
)

In [None]:
# Number of whitespace-separated tokens in the longest training note.
MAX_SEQ_LENGHT = max(len(note.split()) for note in X_train.values)

# Bag-of-words vectorizer: lowercases text, drops NLTK English stop words,
# ignores terms seen in fewer than 3 notes or in more than 90% of notes.
# NOTE(review): max_features caps the vocabulary size, yet it is fed the longest
# note's token count — confirm this is the intended cap.
cv = CountVectorizer(
    stop_words=stopwords.words('english'),
    lowercase=True,
    min_df=3,
    max_df=0.9,
    max_features=MAX_SEQ_LENGHT,
)

In [None]:
# Chain the count vectorizer ('cv') and a Multinomial Naive Bayes classifier
# ('model') into a single estimator that takes raw note text as input.
model_mult_nb = MultinomialNB()
pipe = Pipeline(steps=[
    ('cv', cv),
    ('model', model_mult_nb),
])

In [None]:
# Grid-search the pipeline with 5-fold cross-validation.
# GridSearchCV cannot be built without a parameter grid; here the Naive Bayes
# smoothing strength is tuned. 'model__alpha' addresses the `alpha` parameter
# of the pipeline step named 'model'.
gs = GridSearchCV(
    pipe,
    param_grid={'model__alpha': [0.1, 0.5, 1.0, 1.5, 2.0]},
    cv=5,
)
print(gs.fit(X_train, y_train))
print(gs.best_params_)
# best_score_ is the mean cross-validated score of the best candidate,
# not a score on the full training set.
print("Cross Val Score: ", round(gs.best_score_, 4))
# Score of the refitted best estimator on the held-out test split.
print("Test Score: ", round(gs.score(X_test, y_test), 4))

# Multinomial NB

In [None]:
# for alpha in np.linspace(0,2,20)[1:]:
# Learn the vocabulary on the training notes only, then reuse that same
# vocabulary for the test notes. Refitting on the test set would build a
# different vocabulary, so the test columns would no longer line up with the
# columns the models were trained on.
X_train_onehot = cv.fit_transform(X_train)
X_test_onehot = cv.transform(X_test)

# Multinomial Naive Bayes on the term-count matrix.
model_mult_nb = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model_mult_nb.fit(X_train_onehot, y_train)
print("Train score", model_mult_nb.score(X_train_onehot, y_train))
print("Test score", model_mult_nb.score(X_test_onehot, y_test))

# Random Forest

In [None]:
# Baseline random forest (library-default hyperparameters, fixed seed).
rf = RandomForestClassifier(random_state=2019)
rf.fit(X_train_onehot, y_train)

# Mean 5-fold cross-validation score on the training matrix.
print(
    "Cross Val Score: ",
    cross_val_score(estimator=rf, X=X_train_onehot, y=y_train, cv=5).mean(),
)
# Scores of the fitted forest on the data it was trained on and on the held-out split.
print("Train Score: ", round(rf.score(X_train_onehot, y_train), 4))
print("Test Score: ", round(rf.score(X_test_onehot, y_test), 4))

# Extra Trees Classifier

In [None]:
# Extra-trees classifier (library-default hyperparameters, fixed seed).
et = ExtraTreesClassifier(random_state=42)
et.fit(X_train_onehot, y_train)

# Cross-validate the extra-trees model itself; scoring `rf` here would just
# repeat the random forest's number under this section's heading.
print("Cross Val Score: ", cross_val_score(et, X_train_onehot, y_train, cv=5).mean())
print("Train Score: ", round(et.score(X_train_onehot, y_train), 4))
print("Test Score: ", round(et.score(X_test_onehot, y_test), 4))

# Bagging Classifier

In [None]:
# Bagging ensemble of decision trees; each member is trained on half of the
# samples and half of the features.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version before upgrading.
bagged = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=42),
    max_samples=0.5,
    max_features=0.5,
)
bagged.fit(X_train_onehot, y_train)

print(
    "Cross Val Score: ",
    cross_val_score(estimator=bagged, X=X_train_onehot, y=y_train, cv=5).mean(),
)
print("Train Score: ", bagged.score(X_train_onehot, y_train))
print("Test Score: ", bagged.score(X_test_onehot, y_test))

# Tuning random forest with GridSearch

In [None]:

# Manual grid search over random-forest hyperparameters.
# The results of this search are also available as a saved pickle file.

# Candidate values for each hyperparameter.
# NOTE(review): max_features='auto' is deprecated in newer scikit-learn
# releases — confirm against the installed version.
model__max_features = ['auto', 0.75, 0.8, 0.4]
model__max_depth = [65, 66, 67, 68, 69, 70]
model__criterion = ['gini', 'entropy']
# Defined but not used by the loop below (min_samples_leaf is fixed at 1).
sample_leaf_options = [1, 5, 10, 50, 100, 200, 500]

# One row per combination:
# [criterion, max_features, max_depth, cv_score, train_score, test_score]
results = []

# Walk every (criterion, max_features, max_depth) combination, criterion
# varying slowest and depth fastest.
for crit, feat, depth in (
    (c, f, d)
    for c in model__criterion
    for f in model__max_features
    for d in model__max_depth
):
    rf = RandomForestClassifier(
        n_estimators=10,
        random_state=42,
        criterion=crit,
        max_depth=depth,
        max_features=feat,
        min_samples_leaf=1,
    )
    rf.fit(X_train_onehot, y_train)

    # Mean 5-fold CV score, then the fitted forest's train/test scores,
    # each rounded to 5 decimals.
    cv_score = round(cross_val_score(rf, X_train_onehot, y_train, cv=5).mean(), 5)
    train_score = round(rf.score(X_train_onehot, y_train), 5)
    test_score = round(rf.score(X_test_onehot, y_test), 5)

    results.append([crit, feat, depth, cv_score, train_score, test_score])

In [None]:
# Display the collected grid-search rows.
results

In [None]:
# Persist the grid-search results under the name 'random_forest_gridSearchCv.pkl'.
# NOTE(review): cd_main_data() and save_as_pickle() are Midas_helper methods not
# shown here; presumably the file is written into the main data folder — confirm.
helper.cd_main_data()
helper.save_as_pickle(results, 'random_forest_gridSearchCv.pkl')

In [None]:
# List the files in the current working directory.
# NOTE(review): bare `ls` presumably relies on IPython's shell-style magics;
# it is not valid in a plain Python script.
ls

In [None]:
# Retrain random forest with optimum hyperparameters
# 'entropy', 'auto', None, 1.0, 0.99856, 0.73593
# NOTE(review): the row recorded above ('entropy', depth None) does not match
# the settings used below (criterion='gini', max_depth=30) — confirm which
# configuration is the intended one.
rf = RandomForestClassifier(
    random_state=2019,
    criterion='gini',
    max_features='auto',
    max_depth=30,
)
rf.fit(X_train_onehot, y_train)

print(
    "Cross Val Score: ",
    cross_val_score(estimator=rf, X=X_train_onehot, y=y_train, cv=5).mean(),
)
print("Train Score: ", round(rf.score(X_train_onehot, y_train), 4))
print("Test Score: ", round(rf.score(X_test_onehot, y_test), 4))

In [None]:
# Print the random forest's top features.
importances = rf.feature_importances_
# Spread of each feature's importance across the individual trees of the forest.
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
# Feature column indices, most important first.
indices = np.argsort(importances)[::-1]
print(indices)

# Feature indices are columns of the count matrix, i.e. vocabulary terms, so
# they are decoded through the vectorizer's vocabulary. class_dict maps
# category ids to class labels and is the wrong lookup for a feature index.
# NOTE(review): the terms are only correct while `cv` still holds the
# vocabulary that produced X_train_onehot; if `cv` has been refitted on other
# data since, the column-to-term mapping no longer matches the forest.
index_to_term = {col: term for term, col in cv.vocabulary_.items()}

# Print the feature ranking
print("Feature ranking:")

# Slice instead of range(0, 20) so fewer than 20 features does not raise.
for rank, col in enumerate(indices[:20], start=1):
    print("%d. feature %d (%f)" % (rank, col, importances[col]),
          index_to_term.get(col, "<not in cv vocabulary>"))