In [32]:
import numpy as np
import pandas as pd

In [33]:
# Load the labelled training CSV; the path is relative, so the file must sit in the
# working directory the notebook is run from.
df = pd.read_csv('kaggle_fake_train.csv')

In [34]:
df.shape

(20800, 5)

In [35]:
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [36]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [37]:
# 'id' is not used by any later step, so remove it. Rebinding with errors='ignore'
# (instead of an in-place drop) keeps this cell safe to re-run: the original raised
# KeyError on a second execution because 'id' had already been removed.
df = df.drop(columns='id', errors='ignore')

In [38]:
df.isna().any()

title      True
author     True
text       True
label     False
dtype: bool

In [39]:
# Discard every row that has a missing value in any column, then report the new size.
df.dropna(axis=0, how='any', inplace=True)
print(df.shape)

(18285, 4)


In [40]:
df.isna().any()

title     False
author    False
text      False
label     False
dtype: bool

In [41]:
df.shape

(18285, 4)

In [42]:
# Work on a copy so that later changes to `news` leave `df` as it was after cleaning.
news = df.copy()

In [43]:
# dropna() left gaps in the index; renumber the rows 0..n-1 so that label-based lookups
# such as news.title[i] line up with range(news.shape[0]). drop=True discards the old
# index instead of inserting it as a spurious 'index' column, which also makes the
# cell safe to re-run (the original added another column on every execution).
news.reset_index(drop=True, inplace=True)

In [44]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used below. 'stopwords' backs stopwords.words('english');
# 'wordnet' backs WordNetLemmatizer, which raises LookupError on first use when the
# corpus is missing (the original cell only downloaded 'stopwords').
# NOTE(review): some NLTK releases additionally require 'omw-1.4' for lemmatisation --
# confirm against the installed version.
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Atul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
# Cleaning the news titles
corpus = []
lm = WordNetLemmatizer()

# Loop-invariant work, done once. The original rebuilt the stop-word set for every
# single word of every title, which dominated the running time of this cell.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

for raw_title in news['title']:

  # Replace every non-letter character with a space and convert to lower case
  title = non_letters.sub(' ', raw_title).lower()

  # Tokenise on whitespace, drop the stop words, and lemmatise what remains
  words = [lm.lemmatize(word) for word in title.split() if word not in stop_words]

  # Re-join the lemmatised tokens and add the cleaned title to the corpus
  corpus.append(' '.join(words))

In [86]:
corpus[0:100]

['house dem aide even see comey letter jason chaffetz tweeted',
 'flynn hillary clinton big woman campus breitbart',
 'truth might get fired',
 'civilian killed single u airstrike identified',
 'iranian woman jailed fictional unpublished story woman stoned death adultery',
 'jackie mason hollywood would love trump bombed north korea lack trans bathroom exclusive video breitbart',
 'beno hamon win french socialist party presidential nomination new york time',
 'back channel plan ukraine russia courtesy trump associate new york time',
 'obama organizing action partner soros linked indivisible disrupt trump agenda',
 'bbc comedy sketch real housewife isi cause outrage',
 'russian researcher discover secret nazi military base treasure hunter arctic photo',
 'u official see link trump russia',
 'yes paid government troll social medium blog forum website',
 'major league soccer argentine find home success new york time',
 'well fargo chief abruptly step new york time',
 'anonymous donor pay 

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams over the cleaned titles: unigrams up to trigrams, keeping terms that
# occur in at least 3 titles and in at most 75% of them.
cv = CountVectorizer(min_df=3, max_df=0.75, max_features=None, ngram_range=(1, 3))

# Keep the SciPy sparse matrix that fit_transform returns. The original appended
# .toarray(), materialising a dense 18285 x 13351 count matrix (on the order of 2 GB)
# that is almost entirely zeros. X.shape, train_test_split and LogisticRegression all
# work on the sparse form; fitted coefficients may differ only by solver tolerance.
X = cv.fit_transform(corpus)


In [48]:
X.shape

(18285, 13351)

In [49]:
# Target vector for the classifier: the 'label' column, in the same row order as the
# titles that `corpus` (and therefore `X`) was built from.
y = news['label']

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [51]:
from sklearn.linear_model import LogisticRegression

# Construct and train in a single expression (fit returns the estimator itself), then
# echo the fitted model as the cell output.
lr_classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
lr_classifier

LogisticRegression(random_state=0)

In [52]:
# Predicted labels for the held-out split; the metrics cell scores these against y_test.
lr_y_pred = lr_classifier.predict(X_test)

In [62]:
# `classifier` is the name fake_news() predicts with. The original cell trained a second
# LogisticRegression with the same settings on the same data, so the model being served
# was a different object from the one whose predictions (lr_y_pred) are scored in the
# metrics cell. Reuse the already-fitted, evaluated model instead of refitting a copy.
classifier = lr_classifier
classifier

LogisticRegression(random_state=0)

In [63]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the held-out predictions with each metric in turn
score1, score2, score3 = [
  metric(y_test, lr_y_pred)
  for metric in (accuracy_score, precision_score, recall_score)
]

# Assemble the report first, then emit it in one write
report_lines = [
  "---- Scores ----",
  "Accuracy score is: {}%".format(round(score1*100,2)),
  "Precision score is: {}".format(round(score2,2)),
  "Recall score is: {}".format(round(score3,2)),
]
print("\n".join(report_lines))

---- Scores ----
Accuracy score is: 93.68%
Precision score is: 0.89
Recall score is: 0.97


In [54]:
def fake_news(sample_news):
  """Predict the label of a single headline with the fitted title model.

  The headline goes through the same cleaning steps as the training titles
  (non-letters replaced by spaces, lower-casing, whitespace tokenising,
  English stop-word removal, WordNet lemmatisation). It is then vectorised
  with the module-level ``cv`` and scored by the module-level ``classifier``.

  Parameters
  ----------
  sample_news : str
      Raw headline text.

  Returns
  -------
  The result of ``classifier.predict`` for the one-row input, i.e. an array
  holding a single predicted label. Callers in this notebook treat a truthy
  value as FAKE.

  Raises
  ------
  TypeError
      If ``sample_news`` is not a string (for example a NaN title read from
      a CSV). The original failed with the same exception type from inside
      ``re.sub``, only with a less helpful message.
  """
  if not isinstance(sample_news, str):
    raise TypeError(
      'fake_news expects a headline string, got {}'.format(type(sample_news).__name__))

  # Build the stop-word set once per call; the original rebuilt it for every word.
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  cleaned = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_news).lower()
  tokens = [lemmatizer.lemmatize(word) for word in cleaned.split() if word not in stop_words]
  final_news = ' '.join(tokens)

  # transform (not fit_transform): reuse the vocabulary learned on the training titles
  temp = cv.transform([final_news]).toarray()
  return classifier.predict(temp)

In [55]:
# Load the test CSV (relative path, resolved against the working directory like the
# training file). It carries no 'label' column.
df_test = pd.read_csv('kaggle_fake_test.csv')

In [56]:
df_test.columns

Index(['id', 'title', 'author', 'text'], dtype='object')

In [64]:
df_test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [57]:
# Only the headline column is needed: the vectoriser and model were fitted on titles.
news_title = df_test['title']

In [58]:
news_title.shape

(5200,)

In [65]:
news_title.head()

0    Specter of Trump Loosens Tongues, if Not Purse...
1    Russian warships ready to strike terrorists ne...
2    #NoDAPL: Native American Leaders Vow to Stay A...
3    Tim Tebow Will Attempt Another Comeback, This ...
4                      Keiser Report: Meme Wars (E995)
Name: title, dtype: object

In [75]:
# Pick one test headline (index label 70) to run through fake_news().
sample_news = news_title[70]

In [77]:
print('News: {}'.format(sample_news))
# A truthy prediction is reported as fake, anything else as real
verdict = 'FAKE news!' if fake_news(sample_news) else 'REAL news.'
print(verdict)

FAKE news!


In [84]:
# Pick another test headline (index label 200) to run through fake_news().
sample_news = news_title[200]

In [85]:
print('News: {}'.format(sample_news))
# A truthy prediction is reported as fake, anything else as real
verdict = 'FAKE news!' if fake_news(sample_news) else 'REAL news.'
print(verdict)

News: Music World Bands Together Against YouTube, Seeking Change to Law - The New York Times
REAL news.
