### IMPORTS

In [1]:
import nltk
import spacy
from modules.utils import build_dataset
import sklearn
from modules.preprocess import spacy_tokenizer, text_edit
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import pickle

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### LOAD DATASET

In [2]:
# Build the working dataset from the spreadsheet via the project helper.
# NOTE(review): num_class_samples / rnd_state presumably control per-class
# sampling and its seed — confirm against modules.utils.build_dataset.
dataset = build_dataset(
    'archive/truth_seeker.xlsx',
    rnd_state=5,
    num_class_samples=1500,
)

### PREPROCESS DATA

In [3]:
# Run the project's text clean-up over the dataset with every switch below enabled.
# NOTE(review): the flag names suggest numbers, @mentions, newlines, punctuation
# and stop words are handled — confirm exact behaviour in modules.preprocess.
# The result is bound back to `dataset`, so re-running this cell re-processes
# already-processed text.
dataset = text_edit(
    dataset,
    stop_words_=True,
    punctuation=True,
    newline=True,
    mention=True,
    num=True,
)

### CREATE SAMPLE AND TARGET LISTS

In [4]:
# Samples: the 'tweet' field of every dataset entry, in dict iteration order.
X = [entry['tweet'] for entry in dataset.values()]
# Targets: the matching 'BinaryNumTarget' field, in the same order as X.
Y = [entry['BinaryNumTarget'] for entry in dataset.values()]

### GENERATE BAG OF WORDS

In [None]:
vector_count = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=True)
bow_train = vector_count.fit_transform(X_train)
bow_test =  vector_count.transform(X_test)
#vector_count.vocabulary_

### TRAIN/TEST SPLIT

In [5]:
#Do train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 42)

### TRAIN CLASSIFIER

In [10]:
# TODO: Hyperparameter tuning - find the best combination of solver, penalty, etc.

# Configure the logistic-regression classifier and fit it on the training
# bag-of-words matrix in one expression (fit() returns the estimator itself).
lr_model = LogisticRegression(
    solver='lbfgs',
    max_iter=2000,
    random_state=42,
).fit(bow_train, Y_train)

# Score the held-out matrix: per-class probabilities and hard label predictions.
lr_y_proba = lr_model.predict_proba(bow_test)
lr_y_pred = lr_model.predict(bow_test)

### EVALUATE

In [11]:
# Report each test-set metric on its own line, in the order precision, recall, F1.
# Every scorer is called as scorer(Y_test, lr_y_pred), i.e. true labels first.
for label, scorer in (
    ('Model Precision: ', precision_score),
    ('Model Recall: ', recall_score),
    ('Model F1_score: ', f1_score),
):
    print(label, scorer(Y_test, lr_y_pred))

Model Precision:  0.8654708520179372
Model Recall:  0.8976744186046511
Model F1_score:  0.8812785388127854
