In [60]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import missingno as msno
import autoreload
import csv
import warnings
import os
import sys
import re

from collections import defaultdict, Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, recall_score, r2_score, classification_report


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

sb.set()
sb.set_style('darkgrid')

plt.style.use('seaborn')
plt.tight_layout()

%matplotlib inline
%reload_ext autoreload
%autoreload 2

warnings.filterwarnings('ignore')

In [20]:
# Raw string keeps the Windows backslashes literal instead of relying on Python
# passing unknown escapes such as "\O" through unchanged (a SyntaxWarning on 3.12+).
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv(r"D:\Open Classroom\Datasets\Fake News\FakeNewsNet.csv")
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [21]:
# (rows, columns) of the loaded frame.
df.shape

(23196, 5)

In [22]:
# Number of missing values in each column before any cleaning.
df.isna().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [23]:
# Fill every missing value with the literal string 'null'.
# fillna returns a new frame; rebinding the name avoids in-place mutation.
df = df.fillna('null')

In [24]:
# Re-count missing values per column after the fill step.
df.isna().sum()

title            0
news_url         0
source_domain    0
tweet_num        0
real             0
dtype: int64

In [25]:
# Distribution of the target label; the counts below show the classes are imbalanced.
df['real'].value_counts()

1    17441
0     5755
Name: real, dtype: int64

### Remove URLs and @mentions from the titles using regular expressions

In [26]:
# Remove http(s) URLs from every title in a single vectorised pass.
# Assigning the whole column back replaces the per-row `df['title'][i] = ...`
# chained-indexing writes, which pandas does not guarantee to reach the frame
# and which also assumed a default 0..n-1 index.
# re.MULTILINE is omitted: the pattern has no ^ or $ anchors, so the flag had no effect.
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
df['title'] = df['title'].str.replace(url_pattern, '', regex=True)

In [27]:
# Remove @mentions (an '@' followed by ASCII letters) from every title in one
# vectorised pass. Assigning the whole column back avoids the per-row
# `df['title'][i] = ...` chained-indexing writes, which pandas does not guarantee
# to reach the frame and which assumed a default 0..n-1 index.
# re.MULTILINE is omitted: the pattern has no ^ or $ anchors, so the flag had no effect.
df['title'] = df['title'].str.replace(r'@([a-zA-Z])+', '', regex=True)

### Feature Engineering

In [28]:
# Preview the first rows after the title clean-up.
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


### Split the Dataset into Train and Test segments

In [29]:
# Features are the cleaned titles; the target is the binary `real` label.
x = df['title']
y = df['real']

# A fixed seed makes the partition identical on every run, and stratifying on y
# keeps the class ratio the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print('x_train:', x_train.shape)
print('x_test:', x_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

x_train: (18556,)
x_test: (4640,)
y_train: (18556,)
y_test: (4640,)


### Convert the dataset to a document-term matrix using the TFIDF Vectorizer

In [30]:
# TF-IDF vectorizer with default settings; it is fitted in a later cell.
vect = TfidfVectorizer()

In [31]:
x_train_vect = vect.fit_transform(x_train)
%time x_train_vect

Wall time: 0 ns


<18556x15981 sparse matrix of type '<class 'numpy.float64'>'
	with 199127 stored elements in Compressed Sparse Row format>

In [32]:
x_test_vect = vect.fit_transform(x_test)
%time x_test_vect

Wall time: 0 ns


<4640x8654 sparse matrix of type '<class 'numpy.float64'>'
	with 49939 stored elements in Compressed Sparse Row format>

### Build and Evaluate Models

In [69]:
def model_evaluation(model, x_eval=None, y_eval=None):
    """Fit ``model`` on the training features and report classification metrics.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    x_eval, y_eval : optional
        Features and true labels to score on. When omitted, the training data
        (``x_train_vect`` / ``y_train``) is used, so a one-argument call predicts
        on the same data as before. Training-set scores are optimistic; pass
        held-out features with the same columns as ``x_train_vect`` to measure
        generalisation.

    Returns
    -------
    dict
        ``{'accuracy': float, 'f1': float, 'recall': float}`` for the evaluated data.

    Raises
    ------
    ValueError
        If only one of ``x_eval`` / ``y_eval`` is supplied.
    """
    if (x_eval is None) != (y_eval is None):
        raise ValueError('x_eval and y_eval must be provided together')
    if x_eval is None:
        x_eval, y_eval = x_train_vect, y_train

    model.fit(x_train_vect, y_train)
    pred = model.predict(x_eval)

    # Metrics compare the true labels with the predictions. r2_score is a
    # regression metric, so it is not reported for these classifiers.
    scores = {
        'accuracy': accuracy_score(y_eval, pred),
        'f1': f1_score(y_eval, pred),
        'recall': recall_score(y_eval, pred),
    }

    print(type(model).__name__)
    for metric, value in scores.items():
        print(f'  {metric}: {value:.4f}')

    return scores

In [71]:
# One instance per algorithm; a fixed seed makes the tree-based models repeatable.
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
mnb = MultinomialNB()

# Readable name -> the estimator instance that actually gets fitted, so results
# can be looked up by name and no duplicate, never-fitted instances are created.
models = {
    'DecisionTreeClassifier': dtc,
    'RandomForestClassifier': rfc,
    'GradientBoostingClassifier': gbc,
    'MultinomialNB': mnb,
}

for model in models.values():
    model_evaluation(model)