In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk

# Fetch only the NLTK resources this notebook uses, by name, instead of opening
# the interactive downloader window (which stalls a "Restart & Run All"):
#   - punkt / punkt_tab: tokenizer models loaded by word_tokenize
#     (newer NLTK releases look for punkt_tab rather than punkt)
#   - stopwords: the word lists behind stopwords.words('english')
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [2]:
# Seed NumPy's global random number generator so that anything drawing from
# np.random produces the same values on every run.
SEED = 500
np.random.seed(SEED)

In [3]:
# Read the training CSV into a DataFrame and preview its first ten rows.
train_csv_path = './fake-news/train.csv'
news_dataset = pd.read_csv(train_csv_path)
news_dataset.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [4]:
# Drop the 'author' and 'text' columns in a single call.
news_dataset = news_dataset.drop(columns=['author', 'text'])

In [5]:
# Replace every missing value (NaN) with an empty string so that the string
# operations applied afterwards only ever receive str values.
news_dataset = news_dataset.fillna(value='')

In [6]:
# Lower-case the 'title' column and preview the result.
news_dataset = news_dataset.assign(title=news_dataset['title'].str.lower())
news_dataset.head()

Unnamed: 0,id,title,label
0,0,house dem aide: we didn’t even see comey’s let...,1
1,1,"flynn: hillary clinton, big woman on campus - ...",0
2,2,why the truth might get you fired,1
3,3,15 civilians killed in single us airstrike hav...,1
4,4,iranian woman jailed for fictional unpublished...,1


In [7]:
# Single shared Porter stemmer instance; stemming() calls port_stem.stem() on each word.
port_stem = PorterStemmer()

In [8]:
# Function for removing stopwords and stemming words
# Could try Lemmatization as it's even better than stemming but it's more complicated

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension produced a fresh list for every single word and tested membership
# with a linear scan; a frozenset gives the same answer with a constant-time lookup.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Return `content` reduced to the Porter stems of its non-stop-words.

    Steps:
      1. Every character that is not an ASCII letter (a-z, A-Z) is replaced
         by a space.
      2. The text is lower-cased and split on whitespace.
      3. Words found in NLTK's English stop-word list are discarded.
      4. Each remaining word is stemmed with the module-level `port_stem`.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The stems joined by single spaces; an empty string if nothing remains.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    # Stop-words are filtered on the lower-cased word, before stemming.
    stems = [port_stem.stem(word) for word in words if word not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [9]:
# Replace each title with its cleaned form produced by stemming(), then preview.
stemmed_titles = news_dataset['title'].map(stemming)
news_dataset['title'] = stemmed_titles
news_dataset.head()

Unnamed: 0,id,title,label
0,0,hous dem aid even see comey letter jason chaff...,1
1,1,flynn hillari clinton big woman campu breitbart,0
2,2,truth might get fire,1
3,3,civilian kill singl us airstrik identifi,1
4,4,iranian woman jail fiction unpublish stori wom...,1


In [10]:
# Split each cleaned title into a list of word tokens and store the lists in a
# single new column, 'tokenized_title'.
title_tokens = news_dataset['title'].map(word_tokenize)
news_dataset['tokenized_title'] = title_tokens
news_dataset

Unnamed: 0,id,title,label,tokenized_title
0,0,hous dem aid even see comey letter jason chaff...,1,"[hous, dem, aid, even, see, comey, letter, jas..."
1,1,flynn hillari clinton big woman campu breitbart,0,"[flynn, hillari, clinton, big, woman, campu, b..."
2,2,truth might get fire,1,"[truth, might, get, fire]"
3,3,civilian kill singl us airstrik identifi,1,"[civilian, kill, singl, us, airstrik, identifi]"
4,4,iranian woman jail fiction unpublish stori wom...,1,"[iranian, woman, jail, fiction, unpublish, sto..."
...,...,...,...,...
20795,20795,rapper trump poster child white supremaci,0,"[rapper, trump, poster, child, white, supremaci]"
20796,20796,n f l playoff schedul matchup odd new york time,0,"[n, f, l, playoff, schedul, matchup, odd, new,..."
20797,20797,maci said receiv takeov approach hudson bay ne...,0,"[maci, said, receiv, takeov, approach, hudson,..."
20798,20798,nato russia hold parallel exercis balkan,1,"[nato, russia, hold, parallel, exercis, balkan]"


In [None]:
# Features are the cleaned titles; targets are the labels.
# The frame has no 'content' column (indexing it raises KeyError) -- the
# processed text lives in 'title'.
X = news_dataset['title'].values
Y = news_dataset['label'].values

In [None]:
# Learn the TF-IDF vocabulary and weights from the documents in X, then replace
# X with the resulting feature matrix (fit and transform in one call).
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed later, so test documents influence the weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)




In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the label
# proportions the same in both parts; the fixed random_state makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Create a logistic-regression classifier with default settings and fit it on
# the training split.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict on the rows the model was fitted on and score those predictions.
# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy computed from the training-split predictions.
print('Accuracy score of the training data : ', training_data_accuracy)

In [None]:
# Predict on the held-out split and report the accuracy.
# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

In [None]:
# Spot-check: classify the single test row at position 3 and show the result.
sample_row = X_test[3]
prediction = model.predict(sample_row)
prediction

In [None]:
import pickle

# Serialise the fitted classifier to disk. The `with` block guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): only `model` is saved here; predicting on new text would also
# need the fitted `vectorizer` -- consider persisting it alongside the model.
filename = 'nlp_logistical_regression_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [70]:
# Show the stored (row, column) -> weight entries of the test row at position 3.
fourth_test_row = X_test[3]
print(fourth_test_row)

  (0, 12672)	0.26600491667436216
  (0, 11029)	0.24116450324525057
  (0, 10295)	0.2888013813520418
  (0, 9708)	0.38776119090424305
  (0, 9684)	0.37642938364730816
  (0, 6020)	0.3111117196303452
  (0, 4899)	0.33000212063735657
  (0, 3332)	0.2957706883568352
  (0, 2438)	0.25745473794854234
  (0, 2188)	0.22827557481744312
  (0, 2098)	0.26149828879624953
  (0, 1650)	0.12518294517540368


import joblib

# Save the fitted model with joblib as well, under its own file name.
filename = 'nlp_logistical_regression.sav'
joblib.dump(value=model, filename=filename)