In [1]:
import os
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm

In [2]:
GLOVE_DIR = 'glove.6B.100d.txt'
glove_dict = {}
with open(GLOVE_DIR) as f:
    for line in f:
        word, *vector = line.split()
        glove_dict[word]=np.array(vector).astype(float)

re_tokenized = pickle.load(open('re_tokenized_lemma.pkl','rb'))

In [4]:
def get_average_glove_vec(list_of_tokens):
    avg_vec=[]
    for token in list_of_tokens:
        if token in glove_dict:
            avg_vec.append(glove_dict[token])

    if len(avg_vec)>0:
        avg_vec = np.mean(avg_vec, axis=0)
    else:
        avg_vec = np.zeros(100)

    return avg_vec

In [5]:
import pandas as pd
from tqdm import tqdm
from pre_processing import load_df_and_features

df_original = load_df_and_features("")
df_vecs = []

for ls in tqdm(re_tokenized):
    avg_vec = get_average_glove_vec(ls)
    df_vecs.append(avg_vec)

df_original['glove_avg'] = df_vecs

100%|██████████| 416768/416768 [00:09<00:00, 41690.20it/s]


In [7]:
num_features = ['aoa', 'concrete_score', 'verb2',
                'conc_unknown', 'aoa_perc_known_lem', 'conc_total', 'syllable_per_word',
                'conc_mean_score', 'dale_chall_score', 'conc_subtlex_score']

In [8]:
X1 = df_original[num_features].to_numpy()
X2 = np.vstack(df_vecs)
X = np.hstack([X1,X2])

In [9]:
df = df_original.sample(10000, random_state=42)

In [10]:
X_sampled = X[df.index]
y = df['label']

In [11]:
X_sampled.shape, y.shape

((10000, 110), (10000,))

In [12]:
from sklearn.model_selection import train_test_split

Train_X, Test_X, Train_Y, Test_Y = train_test_split(X_sampled, y, test_size=0.2, random_state=42)

In [12]:
# Train_X.shape

# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

rf = RandomForestClassifier(bootstrap=True,
 max_depth=70,
 max_features='auto',
 min_samples_leaf=4,
 min_samples_split=10,
 n_estimators=800)

rf.fit(Train_X, Train_Y)
rf_preds = rf.predict(Test_X)
rf_acc = accuracy_score(Test_Y, rf_preds)
rf_f1 = f1_score(Test_Y, rf_preds)
rf_acc, rf_f1

(0.684, 0.6892822025565388)

In [93]:
rf_training_score = rf.score(Train_X, Train_Y)
rf_training_score

0.986

In [14]:
pickle.dump(rf, open('trained_models/rf_glove_and_simplefeats_1w.pkl', 'wb'))

In [15]:
reloaded_rf = pickle.load(open('trained_models/rf_glove_and_simplefeats_1w.pkl', 'rb'))
# y_score = reloaded_rf.predict_proba(Test_X)

# SVM

In [19]:
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score

SVM = svm.SVC(C=0.5, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X, Train_Y)

predictions_SVM = SVM.predict(Test_X)
svm_acc = accuracy_score(Test_Y, predictions_SVM)*100
svm_f1 = f1_score(Test_Y, predictions_SVM)*100
svm_acc, svm_f1

(62.150000000000006, 57.01306076093129)

In [92]:
svm_training_score = SVM.score(Train_X, Train_Y)
svm_training_score

0.6385

In [20]:
SVM2 = svm.SVC(C=0.5, kernel='rbf', degree=3, gamma='auto')
SVM2.fit(Train_X, Train_Y)

predictions_SVM2 = SVM2.predict(Test_X)
svm_acc2 = accuracy_score(Test_Y, predictions_SVM2)*100
svm_f12 = f1_score(Test_Y, predictions_SVM2)*100
svm_acc2, svm_f12

(56.45, 57.574281539210915)

### RBF kernel does not seem to improve results in this case.

In [21]:
pickle.dump(SVM, open('trained_models/svm_glove_and_simplefeats_1w.pkl', 'wb'))

In [15]:
reloaded_svm = pickle.load(open('trained_models/svm_glove_and_simplefeats_1w.pkl', 'rb'))

In [None]:
predictions_SVM = reloaded_svm.predict(Test_X[:100])
svm_acc = accuracy_score(Test_Y, predictions_SVM)*100
svm_f1 = f1_score(Test_Y, predictions_SVM)*100
svm_acc, svm_f1

# MLP

In [49]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# for mlp & svm
sc = StandardScaler()
Train_X_scaled = sc.fit_transform(Train_X)
Test_X_scaled = sc.transform(Test_X)

In [52]:
from sklearn.neural_network import MLPClassifier
# with scaling


mlp = MLPClassifier(hidden_layer_sizes=(150,), random_state=1, max_iter=300).fit(Train_X_scaled, Train_Y)
mlp_predict = mlp.predict(Test_X_scaled)
mlp_acc = accuracy_score(Test_Y, mlp_predict)
mlp_f1 = f1_score(Test_Y, mlp_predict)
mlp_acc, mlp_f1

(0.6065, 0.6196230062832285)

In [90]:
mlp_train_score = mlp.score(Train_X_scaled, Train_Y)
mlp_train_score

0.9855

In [54]:
pickle.dump(mlp, open('trained_models/mlp_glove_and_simplefeats_1w.pkl', 'wb'))

### Without feature scaling, the model perform less ideal.

In [53]:
# without scaling
mlp2 = MLPClassifier(hidden_layer_sizes=(150,), random_state=1, max_iter=300).fit(Train_X, Train_Y)
mlp_predict2 = mlp2.predict(Test_X)

mlp_acc2 = accuracy_score(Test_Y, mlp_predict2)
mlp_f12 = f1_score(Test_Y, mlp_predict2)
mlp_acc2, mlp_f12

(0.5245, 0.680550890157877)

# Summary

In [56]:
mlp_acc=mlp_acc*100
mlp_f1=mlp_f1*100
rf_acc=rf_acc*100
rf_f1=rf_f1*100

In [87]:
print('    accuracy\t f1 score')
print(' mlp:', round(mlp_acc,3),'\t', mlp_f1, 
      '\n SVM:', round(svm_acc,3),'\t', svm_f1, 
      '\n rf: ',rf_acc,'\t', rf_f1)

    accuracy	 f1 score
 mlp: 60.65 	 61.96230062832285 
 SVM: 62.15 	 57.01306076093129 
 rf:  68.4 	 68.92822025565388


In [70]:
score_df = pd.DataFrame([[mlp_acc, svm_acc, rf_acc],[mlp_f1, svm_f1, rf_f1]]).T
score_df.columns=['accuracy', 'f1 score']
score_df.index = ['MLP','SVM','RandomForest']
score_df

Unnamed: 0,accuracy,f1 score
MLP,60.65,61.962301
SVM,62.15,57.013061
RandomForest,68.4,68.92822


In [71]:
score_df.to_csv('scores_1w_glove_simplefeats.csv')

# Extra: Try a more complicated model under neural network, but this time using simple Count Vectorizer

In [None]:
sentences = df_original['original_text'].values
y = df_original['label'].values

sentences_train = np.array(sentences)[train_indexes]
sentences_test = np.array(sentences)[test_indexes]
y_train = np.array(y)[train_indexes]
y_test = np.array(y)[test_indexes]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# for Neural Network
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of  reserved 0 index
vocab_size = len(tokenizer.word_index) + 1 

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
X_train.shape

(80000, 100)

In [None]:
from keras.models import Sequential
from keras import layers
import tensorflow as tf

embedding_dim=100
nlp_input = layers.Input(shape=(maxlen,)) 
# meta_input = layers.Input(shape=(6,))
emb = layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)(nlp_input) 
nlp_out = layers.Bidirectional(layers.LSTM(128))(emb) 
# concat = tf.concat([nlp_out, meta_input], axis=1) 
classifier = layers.Dense(32, activation='relu')(nlp_out) 
output = layers.Dense(1, activation='sigmoid')(classifier) 
model = tf.keras.Model(inputs=[nlp_input], outputs=[output])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          7716400   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 7,959,153
Trainable params: 7,959,153
Non-trainable params: 0
_________________________________________________________________


In [None]:
# model.fit({'input_1': X_train, 'input_2': meta_X_train}, y_train, epochs=10, batch_size=128)
model.fit(X_train, y_train, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f27c9837fd0>

In [None]:
model.save('drive/MyDrive/Milestone2/trained_models/NN_tf_countvec')



INFO:tensorflow:Assets written to: drive/MyDrive/Milestone2/trained_models/NN_tf_countvec/assets


INFO:tensorflow:Assets written to: drive/MyDrive/Milestone2/trained_models/NN_tf_countvec/assets


In [None]:
predicted2 = model.predict(X_test)
predicted = 1*(predicted2>0.5).flatten()

In [None]:
from sklearn.metrics import accuracy_score, f1_score
nn2_acc = accuracy_score(y_test, predicted)
nn2_f1 = f1_score(y_test, predicted)

In [None]:
nn2_acc, nn2_f1

(0.64675, 0.6450997136685589)