In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


df_original = pd.read_csv('sample_data/WikiLarge_Train.csv')
df_original['re_tokened'] = pickle.load(open('sample_data/re_tokenized_lemma.pkl', 'rb'))
df_original['re_tokened2'] = df_original['re_tokened'].apply(lambda x: ' '.join(x))

df = df_original.sample(10000, random_state=42)
Train_X, Test_X, Train_Y, Test_Y = train_test_split(df['re_tokened2'], df['label'], 
                                                    test_size=0.2, random_state=42)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

Tfidf_vect = TfidfVectorizer(max_features=10000, lowercase=True, stop_words='english')
Tfidf_vect.fit(df['re_tokened2'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [3]:
len(Tfidf_vect.vocabulary_)

10000

In [4]:
Train_X_Tfidf.shape, Test_X_Tfidf.shape

((8000, 10000), (2000, 10000))

# SVM

In [5]:
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score

In [6]:
SVM = svm.SVC(C=0.5, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

predictions_SVM = SVM.predict(Test_X_Tfidf)
svm_acc = accuracy_score(Test_Y, predictions_SVM)*100
svm_f1 = f1_score(Test_Y, predictions_SVM)*100
svm_acc, svm_f1

(57.8, 58.546168958742626)

In [32]:
SVM.score(Train_X_Tfidf,Train_Y)

0.8265

In [9]:
pickle.dump(SVM, open('trained_models/svm_tfidf_1w.pkl', 'wb'))

### rbf kernel does not increase model performance.

In [8]:
SVM2 = svm.SVC(C=0.5, kernel='rbf', degree=3, gamma='auto')
SVM2.fit(Train_X_Tfidf,Train_Y)

predictions_SVM2 = SVM2.predict(Test_X_Tfidf)
svm_acc2 = accuracy_score(Test_Y, predictions_SVM2)*100
svm_f12 = f1_score(Test_Y, predictions_SVM2)*100
svm_acc2, svm_f12

(51.5, 67.98679867986799)

# MLP

In [13]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(150,))
mlp.fit(Train_X_Tfidf, Train_Y)

MLPClassifier(hidden_layer_sizes=(150,))

In [33]:
mlp.score(Train_X_Tfidf, Train_Y)

0.994625

In [14]:
mlp_preds = mlp.predict(Test_X_Tfidf)
mlp_acc = accuracy_score(Test_Y, mlp_preds)
mlp_f1 = f1_score(Test_Y, mlp_preds)
mlp_acc, mlp_f1

(0.5725, 0.585956416464891)

In [15]:
pickle.dump(mlp, open('trained_models/mlp_tfidf_1w.pkl', 'wb'))

# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(bootstrap= True,
 max_depth=70,
 max_features='auto',
 min_samples_leaf=4,
 min_samples_split=10,
 n_estimators=800)

rf.fit(Train_X_Tfidf, Train_Y)
rf_preds = rf.predict(Test_X_Tfidf)
rf_acc = accuracy_score(Test_Y, rf_preds)
rf_f1 = f1_score(Test_Y, rf_preds)
rf_acc, rf_f1

(0.607, 0.5814696485623003)

In [34]:
rf.score(Train_X_Tfidf, Train_Y)

0.74825

In [18]:
import pickle

pickle.dump(rf, open('trained_models/rf_tfidf_1w.pkl', 'wb'))

# Summary

Models with the same parameters for other features all behave like overfitting with the Tf-Idf feature. With more time one should try to do **PCA or any dimension reduction** on the input data and then feed into the model.

In [22]:
mlp_acc= mlp_acc*100
mlp_f1= mlp_f1*100
rf_acc=rf_acc*100
rf_f1=rf_f1*100

In [31]:
print('    accuracy\t f1 score')
print(' mlp:', mlp_acc,'\t', mlp_f1, 
      '\n SVM:', svm_acc,'\t', svm_f1, 
      '\n rf: ',round(rf_acc,3),'\t', rf_f1)

    accuracy	 f1 score
 mlp: 57.25 	 58.595641646489106 
 SVM: 57.8 	 58.546168958742626 
 rf:  60.7 	 58.146964856230035


In [28]:
score_df = pd.DataFrame([[mlp_acc, svm_acc, rf_acc],[mlp_f1, svm_f1, rf_f1]]).T
score_df.columns=['accuracy', 'f1 score']
score_df.index = ['MLP','SVM','RandomForest']

In [29]:
score_df.to_csv('scores_1w_TFIDF.csv')