# Final Model All Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# For NLP vectorizing

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle

from wordcloud import WordCloud, STOPWORDS
from src.helpers import *
%load_ext autoreload
%autoreload 2

In [None]:
# Load the complete article dataset (the original note puts it at 70,000 articles)
# and keep only the two columns used below: the article text and its label.

df_all = (
    pd.read_csv('data/all_data.csv')
    .reset_index(drop=True)
    .loc[:, ['text', 'label']]
)

In [None]:
# Separate the article text (features, X) from the label column (target, y).
# No train/test split or stratification happens here: every row is used for fitting.

X, y = df_all['text'], df_all['label']

In [None]:
# TF-IDF vectorizer over unigrams, bigrams and trigrams (ngram_range=(1, 3)).
# max_features=20000 caps the vocabulary at the 20,000 terms that are most
# frequent across the corpus.
# NOTE(review): an earlier comment here said 100,000 tokens; the code uses 20,000 - confirm which is intended.

tfidf = TfidfVectorizer(
    preprocessor=preprocessor,
    stop_words=stopwords_list() + ['reuters'],
    ngram_range=(1, 3),
    max_features=20000,
)
tfidf_model = tfidf.fit(X)

In [None]:
# # save fitted model

# with open('tfidf_model_final.pkl', 'wb') as f:
#     pickle.dump(tfidf_model, f)

In [None]:
# Vectorize every article with the fitted TF-IDF model; the resulting matrix
# is the feature input for the classifier below.

tfidf_X = tfidf_model.transform(raw_documents=X)

In [None]:
# # reload model

# with open('tfidf_model_final.pkl', 'rb') as f:
#     tfidf_model = pickle.load(f)

In [None]:
# Random Forest Classifier
#
# Fit on the full TF-IDF matrix. Because no hold-out split is made in this notebook,
# oob_score=True is used so the out-of-bag accuracy can be printed as a
# generalization estimate.

rfc = RandomForestClassifier(
    criterion='gini',
    max_depth=200,
    n_estimators=50,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers
    # it meant sqrt(n_features), so 'sqrt' is the equivalent, version-safe spelling.
    max_features='sqrt',
    oob_score=True,
    random_state=42,
).fit(tfidf_X, y)
print(rfc.oob_score_)

In [None]:
# Feature importances
#
# Each importance is labelled with the n-gram it belongs to. The names are read
# from the fitted vectorizer so this cell works on a fresh kernel (it previously
# relied on a `columns` name that no visible cell defines).

feature_names = (
    tfidf_model.get_feature_names_out()
    if hasattr(tfidf_model, 'get_feature_names_out')
    else tfidf_model.get_feature_names()  # scikit-learn < 1.0
)

feat_scores = pd.DataFrame(
    {'Top 10 Important Features': rfc.feature_importances_},
    index=feature_names,
)

# Plot only the ten largest importances; the trailing ';' suppresses the Axes repr.
feat_scores.sort_values(by='Top 10 Important Features', ascending=False)[:10].plot(kind='bar');

In [None]:
# Build a cleaned copy of df_all: the 'text' column is replaced by the output of
# preprocessor() for each article, and df_all itself is left unmodified.

df_all_clean = df_all.assign(text=df_all['text'].apply(preprocessor))

In [None]:
# Partition the cleaned articles by label value: rows labelled 1 go to `real`,
# rows labelled 0 go to `fake`.
real = df_all_clean.loc[df_all_clean['label'] == 1]
fake = df_all_clean.loc[df_all_clean['label'] == 0]

In [None]:
# Concatenate every label-1 article into one space-separated string,
# the input format WordCloud.generate() is given below.

real_text = ' '.join(real['text'])

In [None]:
# Concatenate every label-0 article into one space-separated string,
# the input format WordCloud.generate() is given below.

fake_text = ' '.join(fake['text'])

In [None]:
# # Real wordcloud

# wordcloud = WordCloud(width = 800, height = 600, 
#                 background_color ='white', 
#                 stopwords = sw + ['reuters'], 
#                 min_font_size = 10).generate(real_text) 
  
# # plot the WordCloud image                        
# plt.figure(figsize = (8, 8), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis("off") 
# plt.tight_layout(pad = 0) 
  
# plt.show()

In [None]:
# # Fake wordcloud

# wordcloud = WordCloud(width = 800, height = 600, 
#                 background_color ='white', 
#                 stopwords = sw + ['reuters'], 
#                 min_font_size = 10).generate(fake_text) 
  
# # plot the WordCloud image                        
# plt.figure(figsize = (8, 8), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis("off") 
# plt.tight_layout(pad = 0) 
  
# plt.show()