In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import preprocess_string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('news_articles.csv',na_filter='')
df = df[df['language']=='english']
data = df[['title','text','label']]
data['label'] = np.where(data['label']=='Real',1,0)

In [3]:
data # 1- > real, 0 -> fake

Unnamed: 0,title,text,label
0,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,1
1,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,1
2,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,1
3,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,1
4,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,1
...,...,...,...
2090,prof canoes reek of genocide white privilege,,1
2091,teens walk free after gangrape conviction,,1
2092,school named for munichmassacre mastermind,,1
2093,russia unveils satan missile,,0


In [4]:
data['News'] = data['title'] + data['text']
data['News'] = data['News'].apply(lambda x: preprocess_string(x))

In [5]:
sw = stopwords.words('english')

In [6]:
def sw_remove(x):
    return [i for i in x if i not in sw]

In [7]:
data['News'] = data['News'].apply(lambda x:sw_remove(x))

In [8]:
data['title_01'] = data['title'].apply(lambda x:1 if len(x)>=1 else 0)
data['text_01'] = data['text'].apply(lambda x:1 if len(x)>=1 else 0)

In [9]:
port_stem = PorterStemmer()

In [10]:
# data['News'] = data['News'].apply(lambda x: x.split())

In [11]:
def f(x):
    return ' '.join([port_stem.stem(token) for token in x])

In [12]:
data['News'] = data['News'].apply(lambda x: f(x))

In [13]:
X = data['News']
y = data['label']

In [14]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)
X = X.toarray()

In [16]:
X[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [163]:
col = data[['title_01','text_01']].values.reshape(-1,2)
X = np.concatenate((col, X), axis=1)

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=2)

# logistic regression

In [165]:
model = LogisticRegression()
model.fit(X_train, y_train)
# accuracy score on the test data
y_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test,y_pred)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7277227722772277


In [166]:
import pickle

pickle.dump(model, open('model.pickle', 'wb'))

pickle.dump(vectorizer, open('vectorizer.pickle', 'wb'))

In [175]:
def check(text):
    model_saved = pickle.load(open('model.pickle','rb'))
    vectorizer_saved = pickle.load(open('vectorizer.pickle','rb'))
    test = pd.Series(text)
    X_processed = vectorizer_saved.transform(test)
    X_processed = X_processed.toarray()
    X_processed = np.concatenate([X_processed,np.array([[1,1]])],axis=1)
    pred = model_saved.predict(X_processed)
    if pred[0]==1:
        return 'Real'
    else:
        return 'Fake'

In [199]:
' '.join(preprocess_string('hillary clinton jumps the shark with trumps '))

'hillari clinton jump shark trump'

In [177]:
check('hillari clinton jump shark trump')

'Real'

# RandomForestClassifier

In [178]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 42)
rfc.fit(X_train, y_train)

# Predicting the Test set results
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7475247524752475


In [179]:
col = data[['title_01','text_01']].values.reshape(-1,2)
X = np.concatenate((col, X), axis=1)

In [180]:
testdf = pd.DataFrame({'Actual':y_test,'Prediction':y_pred})

In [181]:
import pandas as pd
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

testdf.replace([1,0],['Real','Fake'])

Unnamed: 0,Actual,Prediction
1469,Real,Real
42,Fake,Fake
559,Fake,Fake
165,Fake,Fake
1236,Fake,Fake
2011,Real,Fake
1918,Real,Fake
1614,Real,Fake
64,Fake,Fake
173,Fake,Fake


# DecisionTreeClassifier

In [182]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7103960396039604


# Performance tuning

In [183]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [24]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,n_jobs=-1,verbose=3)
CV_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
CV_rfc.best_params_

In [184]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 500, max_depth=8, criterion='gini')

In [185]:
rfc1.fit(X_train, y_train)

# Predicting the Test set results
y_pred = rfc1.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.6089108910891089


In [195]:
len(df)

2017

In [186]:
import pickle

pickle.dump(rfc1, open('model.pickle', 'wb'))

pickle.dump(vectorizer, open('vectorizer.pickle', 'wb'))

In [188]:
df['text'][0]

'print they should pay all the back all the money plus interest the entire family and everyone who came in with them need to be deported asap why did it take two years to bust them \nhere we go again another group stealing from the government and taxpayers a group of somalis stole over four million in government benefits over just  months \nweve reported on numerous cases like this one where the muslim refugeesimmigrants commit fraud by scamming our systemits way out of control more related'

In [196]:
X

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

In [194]:
df[df['label']!='Real']['text'][33]

'st century wire says \nwire reported on friday about the fbis surprising announcement that it would be reopening the clinton email case due to new evidence of classified information found on sex cheat anthony weiners newly estranged husband of clinton chief aid huma abedin computer which was subject to a seperate investigation will this really yield anything significant in the  days runningup to the nov th election or is this just clever democrat party smoke and mirrors it seems that washingtons political tricksters have already sprung into action \nafter comeys shock announcement a leaked memo appeared out of nowhere supplied to fox news  in which comey and the fbi seem to be going through a routine set of prescribed political moves designed to implement damage control \nelite circles fbi head james comey and friend hillary clinton \ncertainly a desperate democratic party and an even more desperate obama white house over the last  weeks obama and his wife michelle have been out campa