In [45]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
fake_news_data = pd.read_csv('FakeNewsNet.csv')
print(fake_news_data)

                                                   title  \
0      Kandi Burruss Explodes Over Rape Accusation on...   
1      People's Choice Awards 2018: The best red carp...   
2      Sophia Bush Sends Sweet Birthday Message to 'O...   
3      Colombian singer Maluma sparks rumours of inap...   
4      Gossip Girl 10 Years Later: How Upper East Sid...   
...                                                  ...   
23191  Pippa Middleton wedding: In case you missed it...   
23192  Zayn Malik & Gigi Hadid’s Shocking Split: Why ...   
23193  Jessica Chastain Recalls the Moment Her Mother...   
23194  Tristan Thompson Feels "Dumped" After Khloé Ka...   
23195  Kelly Clarkson Performs a Medley of Kendrick L...   

                                                news_url  \
0      http://toofab.com/2017/05/08/real-housewives-a...   
1      https://www.today.com/style/see-people-s-choic...   
2      https://www.etonline.com/news/220806_sophia_bu...   
3      https://www.dailymail.co.uk/news

In [5]:
fake_news_data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [6]:
fake_news_data.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [7]:
fake_news = fake_news_data.fillna('')

In [9]:
X = fake_news.drop(columns='real', axis=1)
Y = fake_news['real']

In [10]:
print(X)
print(Y)

                                                   title  \
0      Kandi Burruss Explodes Over Rape Accusation on...   
1      People's Choice Awards 2018: The best red carp...   
2      Sophia Bush Sends Sweet Birthday Message to 'O...   
3      Colombian singer Maluma sparks rumours of inap...   
4      Gossip Girl 10 Years Later: How Upper East Sid...   
...                                                  ...   
23191  Pippa Middleton wedding: In case you missed it...   
23192  Zayn Malik & Gigi Hadid’s Shocking Split: Why ...   
23193  Jessica Chastain Recalls the Moment Her Mother...   
23194  Tristan Thompson Feels "Dumped" After Khloé Ka...   
23195  Kelly Clarkson Performs a Medley of Kendrick L...   

                                                news_url  \
0      http://toofab.com/2017/05/08/real-housewives-a...   
1      https://www.today.com/style/see-people-s-choic...   
2      https://www.etonline.com/news/220806_sophia_bu...   
3      https://www.dailymail.co.uk/news

In [11]:
port_stream = PorterStemmer()

In [12]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stream.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [14]:
fake_news['title'] = fake_news['title'].apply(stemming)

In [15]:
print(fake_news['title'])

0        kandi burruss explod rape accus real housew at...
1                   peopl choic award best red carpet look
2        sophia bush send sweet birthday messag one tre...
3        colombian singer maluma spark rumour inappropr...
4        gossip girl year later upper east sider shock ...
                               ...                        
23191    pippa middleton wed case miss pippa marri lace...
23192    zayn malik gigi hadid shock split chanc reunit...
23193    jessica chastain recal moment mother boyfriend...
23194    tristan thompson feel dump khlo kardashian ref...
23195    kelli clarkson perform medley kendrick lamar h...
Name: title, Length: 23196, dtype: object


In [17]:
X = fake_news['title'].values
Y = fake_news['real'].values

In [18]:
print(X)
print(Y)

['kandi burruss explod rape accus real housew atlanta reunion video'
 'peopl choic award best red carpet look'
 'sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva'
 ... 'jessica chastain recal moment mother boyfriend slap kick genit'
 'tristan thompson feel dump khlo kardashian refus let move la home exclus'
 'kelli clarkson perform medley kendrick lamar humbl hit billboard music award']
[1 1 1 ... 1 0 1]


In [19]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [20]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 182971 stored elements and shape (23196, 12546)>
  Coords	Values
  (0, 53)	0.2721149519149911
  (0, 576)	0.32205761515065545
  (0, 1524)	0.4060028523437122
  (0, 3739)	0.37202135470266046
  (0, 5246)	0.27372476382132127
  (0, 5895)	0.4060028523437122
  (0, 8975)	0.3165911519884338
  (0, 9020)	0.24249280188561428
  (0, 9287)	0.26647854102470714
  (0, 11959)	0.2217815349642914
  (1, 643)	0.31690546637483147
  (1, 986)	0.34762560771735407
  (1, 1710)	0.3978429114527965
  (1, 1975)	0.43378159796649224
  (1, 6573)	0.3420111696667716
  (1, 8315)	0.4067948402953306
  (1, 9079)	0.387076508000102
  (2, 1069)	0.19438975915734968
  (2, 1361)	0.37698546923989756
  (2, 1528)	0.3313932058546551
  (2, 1529)	0.2679606830015344
  (2, 2132)	0.22767646060985314
  (2, 3657)	0.25142880383632443
  (2, 5100)	0.22534648956135603
  (2, 5104)	0.2418360754414745
  :	:
  (23193, 9046)	0.32490265953129527
  (23193, 10239)	0.39707689058614987
  (23194, 3

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [38]:
model = LogisticRegression()

In [39]:
model.fit(X_train,Y_train)

In [40]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [41]:
print('Accuracy Score: ', training_data_accuracy)

Accuracy Score:  0.8603147230006467


In [42]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [43]:
print('Accuracy Score: ', test_data_accuracy)

Accuracy Score:  0.8293103448275863


In [47]:
score_1 = metrics.r2_score(Y_test, X_test_prediction)

score_2 = metrics.mean_absolute_error(Y_test, X_test_prediction)

print("R squared value = ",score_1)
print("Mean Absolute error = ", score_2)

R squared value =  0.09687782311694726
Mean Absolute error =  0.1706896551724138


In [54]:
X_new = X_test[10]

prediction = model.predict(X_new)
print(prediction)

if(prediction[0]==0):
    print('News is real')
else:
    print('News is fake')


[1]
News is fake


In [52]:
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36540 stored elements and shape (4640, 12546)>
  Coords	Values
  (0, 224)	0.3066017623491687
  (0, 1326)	0.2651295501496394
  (0, 2402)	0.3748360531778298
  (0, 3728)	0.4028179283210758
  (0, 4437)	0.3958295600412114
  (0, 5716)	0.21836679858380506
  (0, 6577)	0.2864495277411444
  (0, 8179)	0.30042931329262174
  (0, 9440)	0.3092241878725657
  (0, 9852)	0.2462550078735655
  (1, 668)	0.2568705353962885
  (1, 1105)	0.30826334501527003
  (1, 2001)	0.3498089320066428
  (1, 4793)	0.3347599748677849
  (1, 8491)	0.3299565614027961
  (1, 9218)	0.2883917947132534
  (1, 10026)	0.32861627865347465
  (1, 10641)	0.3384071432648236
  (1, 12138)	0.3483015313203653
  (1, 12162)	0.2621021535214766
  (2, 1676)	0.4932572981416907
  (2, 7885)	0.6111722045474342
  (2, 9158)	0.39598756584494227
  (2, 11281)	0.47577265780494987
  (3, 535)	0.3546123579711796
  :	:
  (4637, 9115)	0.3852023982283259
  (4637, 9410)	0.3042683129975527
  (4637, 9926)	0.3

In [48]:
print(Y_test[0])

1
