# Portfolio DAY 4 - Natural Language Processing
## Fake Trump Twitter

In [149]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import floor

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import normalize
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

# Load the data
df = pd.read_json('./data/trump_vs_GPT2.gz')
df.rename(columns = { 0: 'tweet', 1: 'real_tweet' }, inplace = True)

In [150]:
# Tokenize words

stop_words = set(stopwords.words('english'))

tokenized = []

for index, tweet in enumerate(df['tweet'].values):
    tokens = [word.lower() for word in word_tokenize(tweet) if word not in stop_words and word.isalnum()]
    tokenized.append(' '.join(tokens))

df['tweet_token'] = tokenized

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\caspe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\caspe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [151]:
# Vectorise the text
vectorizer_bow = CountVectorizer()
vectorizer_tfid = TfidfVectorizer()

In [152]:
# Create mask to ensure same training sample
indices_all = np.arange(0, len(df), 1, dtype=int) # Create a range of all the indices
np.random.shuffle(indices_all) # Shuffle the indices

test_size = floor(len(indices_all) * 0.33)
test_mask = indices_all[0:test_size]

train_size = len(indices_all) - test_size
train_mask = indices_all[test_size:]

In [153]:
# Vectorise the text
vectorizer_bow = CountVectorizer()
vectorizer_tfid = TfidfVectorizer()

y_test = df.iloc[test_mask]['real_tweet'].values
y_train = df.iloc[train_mask]['real_tweet'].values

X_test = df.iloc[test_mask]['tweet_token'].values
X_train = df.iloc[train_mask]['tweet_token'].values

X_train_bow = vectorizer_bow.fit_transform(X_train)
X_train_tfid = vectorizer_tfid.fit_transform(X_train)

X_test_bow = vectorizer_bow.transform(X_test)
X_test_tfid = vectorizer_tfid.transform(X_test)

In [154]:
# Fit first attempt to assess accuracy
model_bow = LogisticRegression(max_iter=2000)
model_bow.fit(X_train_bow, y_train)
bow_score = model_bow.score(X_test_bow, y_test)

model_tfid = LogisticRegression(max_iter=2000)
model_tfid.fit(X_train_tfid, y_train)
tfid_score = model_tfid.score(X_test_tfid, y_test)

print(f'bow: {round(bow_score, 2)}, tfid: {round(tfid_score, 2)}')


bow: 0.81, tfid: 0.81


In [177]:
# Model tuning parameters
max_iter = [100, 500, 1000, 2000]
solver = ['newton-cg', 'lbfgs']
C = [0.05, 0.1, 0.15, 1.0, 1.5, 2, 10]
tol = [0.001, 0.005, 0.010]

# Create the random grid
random_grid = { 'max_iter': max_iter, 'solver': solver, 'C': C, 'tol': tol }

classifier = LogisticRegression()
classifier_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs = -1)

classifier_random.fit(X_train_bow, y_train)
print(classifier_random.score(X_test_bow, y_test))
print(classifier_random.best_params_)

classifier_random.fit(X_train_tfid, y_train)
print(classifier_random.score(X_test_tfid, y_test))
print(classifier_random.best_params_)

Fitting 3 folds for each of 126 candidates, totalling 378 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:   10.0s
0.8272315919374743
{'tol': 0.001, 'solver': 'newton-cg', 'max_iter': 100, 'C': 0.1}
[Parallel(n_jobs=-1)]: Done 378 out of 378 | elapsed:   12.4s finished


In [179]:
# bow {'tol': 0.001, 'solver': 'newton-cg', 'max_iter': 100, 'C': 0.1}
# tfid {'tol': 0.001, 'solver': 'newton-cg', 'max_iter': 100, 'C': 1.0}

model_bow = LogisticRegression(tol=0.001, solver='newton-cg', max_iter=100, C=0.1)
model_bow.fit(X_train_bow, y_train)
bow_score = model_bow.score(X_test_bow, y_test)

model_tfid = LogisticRegression(tol=0.001, solver='newton-cg', max_iter=100, C=1.0)
model_tfid.fit(X_train_tfid, y_train)
tfid_score = model_tfid.score(X_test_tfid, y_test)

print(f'bow: {round(bow_score, 2)}, tfid: {round(tfid_score, 2)}')


bow: 0.83, tfid: 0.81


## Conclusion
Slight improvement in bag of words approach.
0.83% accuracy