In [7]:
import numpy as np
import pandas as pd
import re
from  nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
news = pd.read_csv('/content/fake_or_real_news.csv', engine='python', on_bad_lines='skip')

In [10]:
news.shape

(596, 4)

In [11]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [12]:
news.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [28]:
news['content'] = news['title'] + ' ' + news['text']

In [29]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...


In [30]:
#LABEL ENCODING
from sklearn.preprocessing import LabelEncoder
model = LabelEncoder()
news['label'] = model.fit_transform(news['label'])

In [31]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...


In [32]:
#now doing steaming
#action, act, acting == act (root word)

In [33]:
port = PorterStemmer()

In [34]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [35]:
news['content'] = news['content'].apply(stemming)

In [36]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,smell hillari fear daniel greenfield shillman ...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,watch exact moment paul ryan commit polit suic...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,kerri go pari gestur sympathi u secretari stat...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,berni support twitter erupt anger dnc tri warn...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,battl new york primari matter primari day new ...


In [37]:
#TRAIN TEST SLPIT
x = news['content']
y = news['label']

In [38]:
print(x)

0      smell hillari fear daniel greenfield shillman ...
1      watch exact moment paul ryan commit polit suic...
2      kerri go pari gestur sympathi u secretari stat...
3      berni support twitter erupt anger dnc tri warn...
4      battl new york primari matter primari day new ...
                             ...                        
591    jame comey taken know much clinton jame comey ...
592    noun verb donald trump rubio seek seiz murphi ...
593    true scandal tortur chelsea man true scandal t...
594    opec see oil price explod barrel right oil mar...
595    clinton inc watch pain msm report clinton corr...
Name: content, Length: 596, dtype: object


In [39]:
print(y)

0      0
1      0
2      1
3      0
4      1
      ..
591    0
592    0
593    0
594    1
595    0
Name: label, Length: 596, dtype: int64


In [47]:
#Convert text to numer
vectorizer = TfidfVectorizer()
x = news['content']
x = vectorizer.fit_transform(x)

In [48]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 162619 stored elements and shape (596, 15591)>
  Coords	Values
  (0, 12700)	0.06776778349863495
  (0, 6280)	0.21876644934879688
  (0, 4875)	0.1269537759509474
  (0, 3302)	0.02875729517739698
  (0, 5787)	0.03807779583644815
  (0, 12410)	0.04053107245904079
  (0, 7297)	0.02130217948792544
  (0, 4908)	0.02304280695246348
  (0, 5289)	0.019106410860551804
  (0, 2207)	0.0168612809175668
  (0, 9274)	0.02837039258647166
  (0, 15496)	0.030735295872513453
  (0, 15411)	0.025753591570275115
  (0, 5142)	0.05840323068272158
  (0, 10993)	0.02304280695246348
  (0, 7092)	0.02005373581832197
  (0, 4984)	0.032031295686206926
  (0, 13221)	0.053819151971411114
  (0, 4269)	0.05858710890750769
  (0, 11701)	0.028339852109276496
  (0, 2518)	0.20027984070094562
  (0, 5680)	0.0814779480125425
  (0, 15029)	0.07085349202047089
  (0, 4869)	0.5869393359726809
  (0, 15356)	0.01650872345388878
  :	:
  (594, 7748)	0.03300867917867482
  (594, 5115)	0.02416273

In [49]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [50]:
#TRANING THE MODEL
model = LogisticRegression()
model = model.fit(x_train, y_train)

In [59]:
#NOW CHECK ACCUR
prediction = model.predict(x_train)
predicaccc = accuracy_score(prediction, y_train)
print(f'accuracy on traning data:  {predicacc*100 :.0f}%')

accuracy on traning data:  88%


In [55]:
#NOW CHECK FOR TEST DATA
predicrion = model.predict(x_test)
predicacc = accuracy_score(predicrion, y_test)
print(f'accuracy on test data: {predicacc*100:.0f}%')

accuracy on test data: 87.5


In [60]:
#MAKE A PREDIC SUSTEM

In [68]:
x_new = x_test[19]
predict = model.predict(x_new)
print(predict)
if(predict[0] == 0):
  print('REAL NEWS')
else:
  print('FAKE NEWS')

[0]
REAL NEWS


In [69]:
x_new = x_test[9]
predict = model.predict(x_new)
print(predict)
if(predict[0] == 0):
  print('REAL NEWS')
else:
  print('FAKE NEWS')

[1]
FAKE NEWS
