Goal : Analyze whether the news is fake or not.

Data Description : Link for data: https://www.kaggle.com/c/fake-news/data

Information about the features:

id: unique id for a news article

title: the title of a news article

author: author of the news article

text: the text of the article; could be incomplete

label: a label that marks the article as potentially unreliable
0--real
1--fake

Data Importing & Description

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
data=pd.read_csv("/content/test.csv")

In [4]:
data.shape

(5200, 4)

In [5]:
data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [7]:
data.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [8]:
data.fillna("",inplace=True)

In [9]:
data.isnull().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [10]:
data["content"]=data["author"]+" "+data["title"]
data.head()

Unnamed: 0,id,title,author,text,content
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",David Streitfeld Specter of Trump Loosens Tong...
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,Russian warships ready to strike terrorists n...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,Common Dreams #NoDAPL: Native American Leaders...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",Daniel Victor Tim Tebow Will Attempt Another C...
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Truth Broadcast Network Keiser Report: Meme Wa...


In [12]:
data1=pd.read_csv("/content/submit.csv")
data1.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [14]:
y=data1.label
y.head()

0    0
1    1
2    0
3    1
4    1
Name: label, dtype: int64

In [15]:
x=data.content
x.head()

0    David Streitfeld Specter of Trump Loosens Tong...
1     Russian warships ready to strike terrorists n...
2    Common Dreams #NoDAPL: Native American Leaders...
3    Daniel Victor Tim Tebow Will Attempt Another C...
4    Truth Broadcast Network Keiser Report: Meme Wa...
Name: content, dtype: object

In [18]:
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")
print(stopwords.words("english")) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bo

In [19]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [21]:
import re #RegularExpression library

In [22]:
def stemming(content):
  stemmed_content=re.sub("[^a-zA-Z]",' ',content)
  stemmed_content=stemmed_content.lower()
  stemmed_content=stemmed_content.split()
  stemmed_content=[ps.stem(word) for word in stemmed_content if not word in stopwords.words("english")]
  stemmed_content=" ".join(stemmed_content)
  return stemmed_content

In [23]:
data["content"]=data["content"].apply(stemming)

In [25]:
print(data.content)

0       david streitfeld specter trump loosen tongu pu...
1       russian warship readi strike terrorist near al...
2       common dream nodapl nativ american leader vow ...
3       daniel victor tim tebow attempt anoth comeback...
4        truth broadcast network keiser report meme war e
                              ...                        
5195    jodi rosen bangladeshi traffic jam never end n...
5196    sheryl gay stolberg john kasich sign one abort...
5197    mike mcphate california today exactli sushi ne...
5198                us marin deploy russian border norway
5199        teddi wayn awkward sex onscreen new york time
Name: content, Length: 5200, dtype: object


In [26]:
X=data["content"].values
Y=y.values

In [27]:
print(X)

['david streitfeld specter trump loosen tongu purs string silicon valley new york time'
 'russian warship readi strike terrorist near aleppo'
 'common dream nodapl nativ american leader vow stay winter file lawsuit polic'
 ... 'mike mcphate california today exactli sushi new york time'
 'us marin deploy russian border norway'
 'teddi wayn awkward sex onscreen new york time']


In [28]:
print(Y)

[0 1 0 ... 0 1 0]


In [29]:
print(X.shape)
print(Y.shape)

(5200,)
(5200,)


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
tf=TfidfVectorizer()

In [33]:
X1=tf.fit_transform(X)
print(X1)

  (0, 8217)	0.08565162006825205
  (0, 9086)	0.08756899768504828
  (0, 5528)	0.08477318149045779
  (0, 8618)	0.2951833364547816
  (0, 7416)	0.2999075906455375
  (0, 7825)	0.3555115824143384
  (0, 6435)	0.3555115824143384
  (0, 8258)	0.3555115824143384
  (0, 4746)	0.3555115824143384
  (0, 8391)	0.11042633230471806
  (0, 7632)	0.3555115824143384
  (0, 7815)	0.33924845737676024
  (0, 1960)	0.20372848702181864
  (1, 186)	0.36689240142839846
  (1, 5485)	0.37981022352550325
  (1, 8135)	0.35260162515090443
  (1, 7823)	0.36689240142839846
  (1, 6583)	0.40045820302192875
  (1, 8826)	0.4600630644689105
  (1, 6999)	0.30028557064663775
  (2, 6196)	0.2207192031076798
  (2, 4560)	0.30535137235977566
  (2, 2913)	0.2867830893352259
  (2, 8964)	0.295064720779198
  (2, 7741)	0.2982257606996168
  :	:
  (5196, 9086)	0.08533742989568321
  (5196, 5528)	0.08261286098643077
  (5197, 2712)	0.5050057718512321
  (5197, 8237)	0.3546424451054057
  (5197, 5054)	0.38652719828192134
  (5197, 1136)	0.3372429743783571
 

In [34]:
from sklearn.model_selection import train_test_split

In [43]:
x_train,x_test,y_train,y_test=train_test_split(X1,Y,test_size=.20,stratify=Y,random_state=2)

In [38]:
from sklearn.linear_model import LogisticRegression 

In [39]:
LR=LogisticRegression()

In [44]:
LR.fit(x_train,y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
x_train_prediction=LR.predict(x_train)

In [48]:
from sklearn.metrics import accuracy_score

In [49]:
accuracy_score(x_train_prediction,y_train)

0.828125

In [52]:
x_test_prediction=LR.predict(x_test)

In [53]:
accuracy_score(x_test_prediction,y_test)

0.7932692307692307

Predictive System

In [54]:
x_new=x_train[0]
prediction=LR.predict(x_new)
print(prediction)
if (prediction==0):
  print("News is real")
else:
  print("News is not real")  

[1]
News is not real


In [56]:
print(y_train[0])

1
