### IMPORTS

In [1]:
import nltk
import spacy
from modules.utils import build_dataset, tune_logistic_regression, tune_svm, evaluate
import sklearn
from modules.preprocess import spacy_tokenizer, text_edit
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### LOAD DATASET

In [2]:
dataset = build_dataset('archive/truth_seeker.xlsx', num_class_samples=10000, rnd_state=10)

### PREPROCESS DATA

In [3]:
dataset = text_edit(dataset,
                    grp_num=True,
                    rm_newline=True,
                    rm_punctuation=False,
                    rm_stop_words=True,
                    lowercase=True,
                    lemmatize=True,
                    expand=True,
                    html_=True,
                    symb_to_text=True)

### CREATE SAMPLE AND TARGET LISTS

In [4]:
X = [x['tweet'] for x in dataset.values()]
Y = [x['BinaryNumTarget'] for x in dataset.values()]

In [5]:
X

['mention morganjttalley mention arkypatriot joe bidens first day office shut keystone pipeline . okay nordstream num . that s russia sell gas germany . I m sure do not understand that . import oil russia we . fucking stupid you question',
 'mention washingtonpost extension dollar num unemployment benefit republicans want cut exclamation exclamation exclamation citizen make dollar num weekly job + codiv - num + no health insurance exclamation exclamation exclamation congress republicans make dollar num , num month : obviously republicans care citizen exclamation exclamation exclamation exclamation',
 'mention xzamilloh mention benshapiro one thing agree about : tax loophole exclamation say , determine fair interest rate charge earning category permit many loophole exclamation remember billionaire warren buffet bragging pay less taxis secretary exclamation unacceptable exclamation',
 'mention jackkninum mention vassijohri mention sairasameerarao post - secondary education practically ma

### TRAIN/TEST SPLIT

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 42)

### GENERATE BAG OF WORDS

In [7]:
vector_count = TfidfVectorizer(tokenizer=spacy_tokenizer, min_df=0.05, max_df=0.95)
bow_train = vector_count.fit_transform(X_train)
bow_test =  vector_count.transform(X_test)



### HYPERPARAMETER TUNING

In [8]:
svm_model = tune_svm(bow_train, Y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits
Best Hyperparameters: {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}


In [9]:
lr_model = tune_logistic_regression(bow_train, Y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Hyperparameters: {'C': 1, 'max_iter': 5000, 'penalty': 'l2', 'solver': 'newton-cg'}


In [10]:
mlp_model = MLPClassifier(hidden_layer_sizes=(25, 100, 100, 25), activation='relu', max_iter=5000, random_state=42)

### TRAIN CLASSIFIER

In [11]:
lr_model.fit(bow_train, Y_train)
lr_y_pred = lr_model.predict(bow_test)
evaluate(Y_test, lr_y_pred)

Precision:  0.652196929592377
Recall:  0.8121292023731048
F1_score:  0.7234292425132121
accuracy:  0.686


In [12]:
svm_model.fit(bow_train, Y_train)
svm_y_pred = svm_model.predict(bow_test)
evaluate(Y_test, svm_y_pred)

Precision:  0.6622411046202867
Recall:  0.8220171390903098
F1_score:  0.7335294117647058
accuracy:  0.698


In [13]:
mlp_model.fit(bow_train, Y_train)
mlp_y_pred = mlp_model.predict(bow_test)
evaluate(Y_test, mlp_y_pred)

Precision:  0.6678779069767442
Recall:  0.6058009228740936
F1_score:  0.6353266505357761
accuracy:  0.6483333333333333


### TRAIN ENSEMBLE CLASSIFIER

In [14]:
svm_model = SVC(probability=True, C=100, degree=2, gamma='scale', kernel='poly', random_state=42) 
lr_model = LogisticRegression(C=10, penalty='l2', solver='liblinear', max_iter=5000, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(25, 100, 100, 25), activation='relu', max_iter=5000, random_state=42)

In [15]:
svm_model.fit(bow_train, Y_train)
lr_model.fit(bow_train, Y_train)
mlp_model.fit(bow_train, Y_train)

In [16]:
ensemble_model = VotingClassifier(estimators=[('svm', svm_model), ('logistic', lr_model), ('mlp', mlp_model)], voting='soft')

In [17]:
ensemble_model.fit(bow_train, Y_train)
ensemble_y_pred = ensemble_model.predict(bow_test)
evaluate(Y_test, ensemble_y_pred)

Precision:  0.6745124411566913
Recall:  0.6611733684904416
F1_score:  0.6677762982689747
accuracy:  0.6673333333333333


In [18]:
#Precision:  0.8362573099415205
#Recall:  0.89375
#F1_score:  0.8640483383685801
#accuracy:  0.85