In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news/submit.csv
/kaggle/input/fake-news/train.csv
/kaggle/input/fake-news/test.csv


#### About the Data:
    1. id: unique ID for a news article
    2. title: the title of a news article
    3. author: author of the news article
    4. text: the text of the article, could be incomplete
    5. label: a label that marks whether the news article is real or fake {1: Fake news, 0: Real news}


In [2]:
## Importing the Dependencies
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
# In case this notebook is run on a personal system (outside Kaggle),
# uncomment the lines below to download the NLTK stop-word corpus.
# NOTE(review): WordNetLemmatizer is used later in this notebook and presumably
# also needs nltk.download('wordnet') -- confirm.
# import nltk
# nltk.download('stopwords')

In [4]:
# printing NLTK's list of English stop words (the words filtered out during pre-processing)
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### Data Pre-processing

In [5]:
# loading the training CSV of the fake-news dataset into a DataFrame
# (relative path: resolves against the notebook's working directory)
news_dataset = pd.read_csv('../input/fake-news/train.csv')

In [6]:
# (rows, columns): the number of news articles and the number of columns
news_dataset.shape

(20800, 5)

In [7]:
# first five rows of the dataset
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
# counting the null values in each column
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# replacing the null values (found in title, author and text) with empty strings,
# so that the string concatenation done later does not produce NaN
news_dataset = news_dataset.fillna("")

In [10]:
# re-checking the null counts after the replacement (all should now be 0)
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [11]:
# merging the author and title columns into one text feature, joined by a single space
news_dataset['author_and_title'] = (
    news_dataset['author'].add(' ').add(news_dataset['title'])
)
news_dataset['author_and_title'].head()

0    Darrell Lucus House Dem Aide: We Didn’t Even S...
1    Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2    Consortiumnews.com Why the Truth Might Get You...
3    Jessica Purkiss 15 Civilians Killed In Single ...
4    Howard Portnoy Iranian woman jailed for fictio...
Name: author_and_title, dtype: object

In [12]:
# splitting the data into the target (dependent variable) and the features (independent variables)
y = news_dataset['label']
X = news_dataset.drop(columns='label')

In [13]:
# previewing the first rows of the features and of the target, separated by a blank line
print(X.head(), y.head(), sep='\n\n')


   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1  Ever get the feeling your life circles the rou...   
2  Why the Truth Might Get You Fired October 29, ...   
3  Videos 15 Civilians Killed In Single US Airstr...   
4  Print \nAn Iranian woman has been sentenced to...   

                                    author_and_title  
0  Darrell Lucus House Dem Aide: We Didn’t Even S...  
1  Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...  
2  Consortiumnews

#### Stemming and Lemmatizing

In [14]:
import re

In [29]:
# Stemming
import nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def Stemming(data):
    """Clean a text string and reduce each remaining word to its Porter stem.

    Steps: replace every character outside a-z / A-Z with a space, lowercase
    the result, split it on whitespace, drop English stop words, stem each
    surviving word with the notebook-level ``porter_stemmer`` and join the
    stems back together with single spaces.

    Parameters
    ----------
    data : str
        Raw text to normalise (in this notebook: the merged author + title).

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string if no token survives.
    """
    # Evaluate the stop-word list once per call and keep it in a set.
    # The original evaluated stopwords.words('english') for every single token
    # and searched the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', data)  # only keeping a-z and A-Z
    tokens = letters_only.lower().split()  # lowercase, then split on whitespace
    stemmed_tokens = [
        porter_stemmer.stem(token)
        for token in tokens
        if token not in english_stopwords
    ]

    return ' '.join(stemmed_tokens)  # convert the list back to a string


In [28]:
# Lemmatizing
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def Lemmatizing(data):
    """Clean a text string and lemmatize each remaining word.

    Steps: replace every character outside a-z / A-Z with a space, lowercase
    the result, split it on whitespace, drop English stop words, lemmatize
    each surviving word with the notebook-level ``wordnet_lemmatizer`` and
    join the lemmas back together with single spaces.

    Parameters
    ----------
    data : str
        Raw text to normalise (in this notebook: the merged author + title).

    Returns
    -------
    str
        Space-separated lemmatized tokens; an empty string if no token survives.
    """
    # Evaluate the stop-word list once per call and keep it in a set.
    # The original evaluated stopwords.words('english') for every single token
    # and searched the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', data)  # only keeping a-z and A-Z
    tokens = letters_only.lower().split()  # lowercase, then split on whitespace
    lemmatized_tokens = [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    return ' '.join(lemmatized_tokens)

In [30]:
# applying the stemming function to the merged author + title text
# (lemmatizing is applied separately in the next cell)
news_dataset['author_and_title_stemmed'] = news_dataset['author_and_title'].apply(Stemming)

In [31]:
# applying the lemmatizing function to the merged author + title text
news_dataset['author_and_title_lemmatizing'] = news_dataset['author_and_title'].apply(Lemmatizing)

In [32]:
# comparing the stemmed and lemmatized columns with the original text
news_dataset.head()

Unnamed: 0,id,title,author,text,label,author_and_title,author_and_title_stemmed,author_and_title_lemmatizing
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...,darrel lucu hous dem aid even see comey letter...,darrell lucus house dem aide even see comey le...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...",daniel j flynn flynn hillari clinton big woman...,daniel j flynn flynn hillary clinton big woman...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...,consortiumnew com truth might get fire,consortiumnews com truth might get fired
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...,jessica purkiss civilian kill singl us airstri...,jessica purkiss civilian killed single u airst...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...,howard portnoy iranian woman jail fiction unpu...,howard portnoy iranian woman jailed fictional ...


In [33]:
# listing the column names, including the text columns added above
news_dataset.columns

Index(['id', 'title', 'author', 'text', 'label', 'author_and_title',
       'author_and_title_stemmed', 'author_and_title_lemmatizing'],
      dtype='object')

In [54]:
# separating the features and the labels, i.e. the independent and dependent variables
# (alternative feature column: the stemmed text)
#X = news_dataset['author_and_title_stemmed'].values
X = news_dataset['author_and_title_lemmatizing'].values
y = news_dataset['label'].values

In [55]:
# previewing the lemmatized text that will be vectorized
print(X)

['darrell lucus house dem aide even see comey letter jason chaffetz tweeted'
 'daniel j flynn flynn hillary clinton big woman campus breitbart'
 'consortiumnews com truth might get fired' ...
 'michael j de la merced rachel abrams macy said receive takeover approach hudson bay new york time'
 'alex ansary nato russia hold parallel exercise balkan'
 'david swanson keep f alive']


In [56]:
# converting textual data to numeric TF-IDF features
# NOTE(review): the vectorizer is fitted on all of X before the train/test split
# performed later in the notebook, so its vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [37]:
# printing the sparse TF-IDF matrix as (row, column) value entries
print(X_vectorized)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

In [38]:
# (number of articles, number of TF-IDF features)
X_vectorized.shape

(20800, 17128)

In [57]:
# splitting the vectorized data into train (80%) and test (20%) sets
x_train, x_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, stratify=y, random_state=2)
# stratify=y keeps the proportion of each label (approximately) the same in the train and test sets

In [58]:
# creating the logistic regression model with its default settings
# (it is fitted on the training data in the next cell)
model = LogisticRegression()

In [59]:
# fitting the model on the training split; the bare `model` on the last line
# makes the notebook display the estimator
model.fit(x_train, y_train)
model

LogisticRegression()

In [60]:
# As this is a classification problem, accuracy is used as the metric.
# accuracy_score is documented as accuracy_score(y_true, y_pred); the value is the
# same in either order, but the documented order matches the classification_report
# call used elsewhere in this notebook.
train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, train_prediction)
training_data_accuracy

0.9868990384615385

In [61]:
# classification report for training data; printed so that the newlines in the
# returned string render as a table instead of a quoted one-line repr
print(classification_report(y_train, train_prediction))

'              precision    recall  f1-score   support\n\n           0       0.99      0.98      0.99      8310\n           1       0.98      0.99      0.99      8330\n\n    accuracy                           0.99     16640\n   macro avg       0.99      0.99      0.99     16640\nweighted avg       0.99      0.99      0.99     16640\n'

In [62]:
# Accuracy on the held-out test split.
# accuracy_score is documented as accuracy_score(y_true, y_pred); the value is the
# same in either order, but the true labels are passed first to follow the API.
test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, test_prediction)
test_data_accuracy

0.9776442307692308

In [63]:
# making a predictive system: classify a single article taken from the test split
x_new = x_test[3]
prediction = model.predict(x_new)
print(prediction)

# label 0 is reported as real news; any other label is reported as fake
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[0]
The news is Real
