In [1]:
import json
import os
import numpy as np

from functions.classical_ML_functions import *

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB


# Project layout, resolved relative to the parent of the current working
# directory. os.path.join is used instead of hard-coded "\\" separators so the
# paths are built with the host OS separator; on Windows the resulting
# locations are the same as before.
ORIGINAL_DIR = os.path.dirname(os.getcwd())
# Root folder of the project data files.
DATA_DIR = os.path.join(ORIGINAL_DIR, "project_2")
# Folder containing the pre-trained GloVe text files.
DIR = os.path.join(DATA_DIR, "embeddings", "glove", "stanford_glove")
# Folder the cross-validation scores are written to.
SAVING_DIR = os.path.join(DATA_DIR, "Classical_ML_results", "glove_twitter_200d")

# GloVe Twitter 200d

In [2]:
# Build the token -> vector lookup from the GloVe Twitter 200d text file.
# Each line is expected to look like "<token> <v1> ... <v200>", with fields
# separated by single spaces.
EMBEDDING_DIM = 200  # same dimension the later cells pass to dict_embed
embeddings_index = {}
with open(DIR + '\\glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as txtfile:
    # Iterate over the file lazily rather than calling readlines(), so the
    # whole file is never held in memory next to the dictionary being built.
    # (The progress bar therefore has no known total.)
    for line in tqdm(txtfile):
        # Split from the right so the last EMBEDDING_DIM fields are always the
        # coefficients. A bare split() would also split on Unicode whitespace,
        # which breaks lines whose token is (or contains) such a character.
        values = line.rstrip('\n').rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or truncated line: there is no full vector to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

HBox(children=(FloatProgress(value=0.0, max=1193514.0), HTML(value='')))




# Self-made data

In [4]:
# Load the self-made corpus: positive, negative and test samples are stored
# as JSON in .txt files. Both classes are capped at 90233 samples.

with open(DATA_DIR + '\\self_made_data\\self_made_positive.txt', 'r', encoding='utf-8') as filehandle:
    pos = json.load(filehandle)
pos = pos[:90233]  # original note: 90233

with open(DATA_DIR + '\\self_made_data\\self_made_negative.txt', 'r', encoding='utf-8') as filehandle:
    neg = json.load(filehandle)
neg = neg[:90233]  # original note: 91088

with open(DATA_DIR + '\\self_made_data\\self_made_test.txt', 'r', encoding='utf-8') as filehandle:
    test = json.load(filehandle)

# Positives first, then negatives; labels follow the same order (1 = pos, 0 = neg)
data = pos + neg
labels = np.concatenate([np.ones(len(pos)), np.zeros(len(neg))])

# Tokenize the training samples, then the test samples
data, test = tokenize(data), tokenize(test)

# Feature matrices: per the original note, the average of the word vectors
# of each sample (computed by dict_embed with 200-d embeddings)
data_features, test_features = (
    dict_embed(data, embeddings_index, 200),
    dict_embed(test, embeddings_index, 200),
)

HBox(children=(FloatProgress(value=0.0, max=180466.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=180466.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




###### Logistic regression

In [5]:
# 5-fold stratified cross-validation of logistic regression on the
# self-made data features. Per-fold accuracies are collected in
# self_made_LR_results.
self_made_LR_results = list()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(data_features, labels):

    X_train, X_test = data_features[train_index], data_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    clf = LogisticRegression(verbose=0, max_iter=1000).fit(X_train, y_train)
    # Score the held-out fold once and reuse the value for both the printout
    # and the results list (previously the fold was scored twice).
    fold_accuracy = clf.score(X_test, y_test)
    print(fold_accuracy)
    self_made_LR_results.append(fold_accuracy)

print("---------------------")
# NOTE: the mean is printed as a percentage, the std as a plain fraction.
print(f"Mean accuracy = {round(np.mean(self_made_LR_results)*100, 4)}%, Std accuracy = {round(np.std(self_made_LR_results), 4)}")

0.7424225633069208
0.7450198099354446
0.7467930069542571
0.7445488044773225
0.7390906824037902
---------------------
Mean accuracy = 74.3575%, Std accuracy = 0.0026


###### Linear SVM

In [None]:
# DISABLED CELL: 5-fold stratified cross-validation of a linear-kernel SVC on
# the self-made data. Everything below is commented out, so
# self_made_LSVM_results is not defined unless this cell is re-enabled.
# self_made_LSVM_results = list()
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for train_index, test_index in skf.split(data_features, labels):
#     
#     X_train, X_test = data_features[train_index], data_features[test_index]
#     y_train, y_test = labels[train_index], labels[test_index]
#     clf = svm.SVC(kernel='linear', verbose=1).fit(X_train, y_train)
#     print(clf.score(X_test, y_test))
#     self_made_LSVM_results.append(clf.score(X_test, y_test))
#     
# print("---------------------")
# print(f"Mean accuracy = {round(np.mean(self_made_LSVM_results)*100, 4)}%, Std accuracy = {round(np.std(self_made_LSVM_results), 4)}")

###### Radial basis function (RBF) SVM

In [None]:
# DISABLED CELL: 5-fold stratified cross-validation of an RBF-kernel SVC on
# the self-made data. Everything below is commented out.
# NOTE(review): the "Saving results" section refers to
# self_made_RBFSVM_results, which only exists if this cell is re-enabled.
# self_made_RBFSVM_results = list()
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for train_index, test_index in skf.split(data_features, labels):
#     
#     X_train, X_test = data_features[train_index], data_features[test_index]
#     y_train, y_test = labels[train_index], labels[test_index]
#     clf = svm.SVC(kernel='rbf', verbose=1).fit(X_train, y_train)
#     print(clf.score(X_test, y_test))
#     self_made_RBFSVM_results.append(clf.score(X_test, y_test))
#     
# print("---------------------")
# print(f"Mean accuracy = {round(np.mean(self_made_RBFSVM_results)*100, 4)}%, Std accuracy = {round(np.std(self_made_RBFSVM_results), 4)}")

###### Multi-layer perceptron

In [None]:
from sklearn.neural_network import MLPClassifier


# 5-fold stratified cross-validation of an MLP with two hidden layers of 100
# units on the self-made data features. Unlike the other model cells, the
# per-fold score is only stored, not printed (training progress is printed
# by the classifier itself because verbose=True).
self_made_MLP_results = list()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(data_features, labels):
    # Slice out the training part and the held-out part of this fold
    X_train, y_train = data_features[train_index], labels[train_index]
    X_test, y_test = data_features[test_index], labels[test_index]

    clf = MLPClassifier(
        hidden_layer_sizes=(100, 100),
        random_state=1,
        max_iter=100,
        verbose=True,
        early_stopping=True,
    )
    clf.fit(X_train, y_train)
    self_made_MLP_results.append(clf.score(X_test, y_test))

print("---------------------")
print(f"Mean accuracy = {round(np.mean(self_made_MLP_results)*100, 4)}%, Std accuracy = {round(np.std(self_made_MLP_results), 4)}")

###### Naive Bayes

In [6]:
# 5-fold stratified cross-validation of Gaussian naive Bayes on the
# self-made data features. Per-fold accuracies are collected in
# self_made_NB_results.
self_made_NB_results = list()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(data_features, labels):

    X_train, X_test = data_features[train_index], data_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    clf = GaussianNB().fit(X_train, y_train)
    # Score the held-out fold once and reuse the value for both the printout
    # and the results list (previously the fold was scored twice).
    fold_accuracy = clf.score(X_test, y_test)
    print(fold_accuracy)
    self_made_NB_results.append(fold_accuracy)

print("---------------------")
# NOTE: the mean is printed as a percentage, the std as a plain fraction.
print(f"Mean accuracy = {round(np.mean(self_made_NB_results)*100, 4)}%, Std accuracy = {round(np.std(self_made_NB_results), 4)}")

0.658752147171275
0.6656415371401657
0.6595461723879976
0.6609591887623639
0.6597678220153492
---------------------
Mean accuracy = 66.0933%, Std accuracy = 0.0025


# Trivial Data

In [7]:
# Load the trivial corpus: positive, negative and test samples are stored
# as JSON in .txt files. Both classes are capped at 90233 samples.

with open(DATA_DIR + '\\trivial_data\\trivial_positive.txt', 'r', encoding='utf-8') as filehandle:
    pos = json.load(filehandle)
pos = pos[:90233]  # original note: 90233

with open(DATA_DIR + '\\trivial_data\\trivial_negative.txt', 'r', encoding='utf-8') as filehandle:
    neg = json.load(filehandle)
neg = neg[:90233]  # original note: 91088

with open(DATA_DIR + '\\trivial_data\\trivial_test.txt', 'r', encoding='utf-8') as filehandle:
    test = json.load(filehandle)

# Positives first, then negatives; labels follow the same order (1 = pos, 0 = neg)
data = pos + neg
labels = np.concatenate([np.ones(len(pos)), np.zeros(len(neg))])

# Tokenize the training samples, then the test samples
data, test = tokenize(data), tokenize(test)

# Feature matrices: per the original note, the average of the word vectors
# of each sample (computed by dict_embed with 200-d embeddings)
data_features, test_features = (
    dict_embed(data, embeddings_index, 200),
    dict_embed(test, embeddings_index, 200),
)

HBox(children=(FloatProgress(value=0.0, max=180466.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=180466.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




###### Logistic regression

In [8]:
# 5-fold stratified cross-validation of logistic regression on the
# trivial data features. Per-fold accuracies are collected in
# trivial_LR_results.
trivial_LR_results = list()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(data_features, labels):

    X_train, X_test = data_features[train_index], data_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    clf = LogisticRegression(verbose=0, max_iter=1000).fit(X_train, y_train)
    # Score the held-out fold once and reuse the value for both the printout
    # and the results list (previously the fold was scored twice).
    fold_accuracy = clf.score(X_test, y_test)
    print(fold_accuracy)
    trivial_LR_results.append(fold_accuracy)

print("---------------------")
# NOTE: the mean is printed as a percentage, the std as a plain fraction.
print(f"Mean accuracy = {round(np.mean(trivial_LR_results)*100, 4)}%, Std accuracy = {round(np.std(trivial_LR_results), 4)}")

0.7659998891782568
0.7670185354500872
0.7683761394176156
0.7677666029423988
0.7645249771423822
---------------------
Mean accuracy = 76.6737%, Std accuracy = 0.0014


###### Linear SVM

In [None]:
# DISABLED CELL: 5-fold stratified cross-validation of a linear-kernel SVC on
# the trivial data. Everything below is commented out, so
# trivial_LSVM_results is not defined unless this cell is re-enabled.
# trivial_LSVM_results = list()
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for train_index, test_index in skf.split(data_features, labels):
#     
#     X_train, X_test = data_features[train_index], data_features[test_index]
#     y_train, y_test = labels[train_index], labels[test_index]
#     clf = svm.SVC(kernel='linear', verbose=1).fit(X_train, y_train)
#     print(clf.score(X_test, y_test))
#     trivial_LSVM_results.append(clf.score(X_test, y_test))
#     
# print("---------------------")
# print(f"Mean accuracy = {round(np.mean(trivial_LSVM_results)*100, 4)}%, Std accuracy = {round(np.std(trivial_LSVM_results), 4)}")

###### Radial basis function (RBF) SVM

In [None]:
# DISABLED CELL: 5-fold stratified cross-validation of an RBF-kernel SVC on
# the trivial data. Everything below is commented out.
# NOTE(review): the "Saving results" section refers to
# trivial_RBFSVM_results, which only exists if this cell is re-enabled.
# trivial_RBFSVM_results = list()
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for train_index, test_index in skf.split(data_features, labels):
#     
#     X_train, X_test = data_features[train_index], data_features[test_index]
#     y_train, y_test = labels[train_index], labels[test_index]
#     clf = svm.SVC(kernel='rbf', verbose=1).fit(X_train, y_train)
#     print(clf.score(X_test, y_test))
#     trivial_RBFSVM_results.append(clf.score(X_test, y_test))
#     
# print("---------------------")
# print(f"Mean accuracy = {round(np.mean(trivial_RBFSVM_results)*100, 4)}%, Std accuracy = {round(np.std(trivial_RBFSVM_results), 4)}")

###### Naive Bayes

In [9]:
# 5-fold stratified cross-validation of Gaussian naive Bayes on the
# trivial data features. Per-fold accuracies are collected in
# trivial_NB_results.
trivial_NB_results = list()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(data_features, labels):

    X_train, X_test = data_features[train_index], data_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    clf = GaussianNB().fit(X_train, y_train)
    # Score the held-out fold once and reuse the value for both the printout
    # and the results list (previously the fold was scored twice).
    fold_accuracy = clf.score(X_test, y_test)
    print(fold_accuracy)
    trivial_NB_results.append(fold_accuracy)

print("---------------------")
# NOTE: the mean is printed as a percentage, the std as a plain fraction.
print(f"Mean accuracy = {round(np.mean(trivial_NB_results)*100, 4)}%, Std accuracy = {round(np.std(trivial_NB_results), 4)}")

0.6475314456696404
0.64713379325631
0.6463580195605796
0.6492948771229878
0.6510680741418003
---------------------
Mean accuracy = 64.8277%, Std accuracy = 0.0017


# Saving results

###### Self made data

In [None]:
# Write the per-fold accuracies obtained on the self-made data to SAVING_DIR,
# one JSON file per model.
#
# The RBF-SVM cell is commented out in this notebook, so its result list may
# not exist; looking the lists up by name lets the remaining results be saved
# instead of the whole cell failing with a NameError.
os.makedirs(SAVING_DIR, exist_ok=True)  # the output folder may not exist yet

for model_tag in ("LR", "RBFSVM", "NB"):
    results_name = f"self_made_{model_tag}_results"
    fold_scores = globals().get(results_name)
    if fold_scores is None:
        print(f"Skipping {results_name}: not computed in this session")
        continue
    # os.path.join yields the same location as SAVING_DIR + '\\' + name on Windows
    with open(os.path.join(SAVING_DIR, f"stratified_5fold_{model_tag}_self_made_data.txt"), 'w') as filehandle:
        json.dump(fold_scores, filehandle)

###### Trivial data

In [None]:
# Write the per-fold accuracies obtained on the trivial data to SAVING_DIR,
# one JSON file per model.
#
# The RBF-SVM cell is commented out in this notebook, so its result list may
# not exist; looking the lists up by name lets the remaining results be saved
# instead of the whole cell failing with a NameError.
os.makedirs(SAVING_DIR, exist_ok=True)  # the output folder may not exist yet

for model_tag in ("LR", "RBFSVM", "NB"):
    results_name = f"trivial_{model_tag}_results"
    fold_scores = globals().get(results_name)
    if fold_scores is None:
        print(f"Skipping {results_name}: not computed in this session")
        continue
    # os.path.join yields the same location as SAVING_DIR + '\\' + name on Windows
    with open(os.path.join(SAVING_DIR, f"stratified_5fold_{model_tag}_trivial_data.txt"), 'w') as filehandle:
        json.dump(fold_scores, filehandle)