In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Loading Data

In [2]:
#Loading data
#Dataframe1=fake
#Dataframe2=true
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

In [3]:
#adding a flag to track fake and real
fake['target'] = 'fake'
true['target'] = 'true'

In [4]:
#concatenating the data frames
data = pd.concat([fake, true]).reset_index(drop = True)

In [5]:
#shuffle the data to prevent bias
from sklearn.utils import shuffle
data = shuffle(data)
data = data.reset_index(drop=True)

# Data Cleaning

In [6]:
data.shape

(44898, 5)

In [7]:
data.describe

<bound method NDFrame.describe of                                                    title  \
0      Yemen's Saleh says ready for 'new page' with S...   
1      Former soccer star Kaladze runs for mayor in G...   
2      Bosnian pensioners stage street protests for p...   
3      India, China need to do more to avoid border d...   
4      Prospects for House vote on gun control measur...   
...                                                  ...   
44893  Austrian far right to control foreign, interio...   
44894  Ramping up tensions over North Korea may have ...   
44895  THE VIEW Brings On Bill O’Reilly’s Sexual Hara...   
44896  Trump says process for finding new FBI chief i...   
44897  At least 30 Burundian refugees die in clashes ...   

                                                    text       subject  \
0      ADEN (Reuters) - Former Yemeni President Ali A...     worldnews   
1      TBILISI (Reuters) - Kakha Kaladze climbed to t...     worldnews   
2      SARAJEVO (Reuter

In [8]:
data.info

<bound method DataFrame.info of                                                    title  \
0      Yemen's Saleh says ready for 'new page' with S...   
1      Former soccer star Kaladze runs for mayor in G...   
2      Bosnian pensioners stage street protests for p...   
3      India, China need to do more to avoid border d...   
4      Prospects for House vote on gun control measur...   
...                                                  ...   
44893  Austrian far right to control foreign, interio...   
44894  Ramping up tensions over North Korea may have ...   
44895  THE VIEW Brings On Bill O’Reilly’s Sexual Hara...   
44896  Trump says process for finding new FBI chief i...   
44897  At least 30 Burundian refugees die in clashes ...   

                                                    text       subject  \
0      ADEN (Reuters) - Former Yemeni President Ali A...     worldnews   
1      TBILISI (Reuters) - Kakha Kaladze climbed to t...     worldnews   
2      SARAJEVO (Reuters)

In [9]:
#checking the missing values in dataset
data.isnull().sum()

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [10]:
#checking for the duplicate data
sum(data.duplicated())

209

In [11]:
#droping the duplicate values by using drop 
# and then re-checking the duplicated values and shape of dataset
data.drop_duplicates(inplace=True)
print(sum(data.duplicated()))
print(data.shape)

0
(44689, 5)


In [12]:
#re-checking the duplicate data
sum(data.duplicated())

0

In [13]:
#cchecking the first five rows of the data set....
data.head()

Unnamed: 0,title,text,subject,date,target
0,Yemen's Saleh says ready for 'new page' with S...,ADEN (Reuters) - Former Yemeni President Ali A...,worldnews,"December 2, 2017",True
1,Former soccer star Kaladze runs for mayor in G...,TBILISI (Reuters) - Kakha Kaladze climbed to t...,worldnews,"October 19, 2017",True
2,Bosnian pensioners stage street protests for p...,SARAJEVO (Reuters) - Thousands of pensioners f...,worldnews,"October 25, 2017",True
3,"India, China need to do more to avoid border d...",NEW DELHI (Reuters) - Indian Prime Minister Na...,worldnews,"September 5, 2017",True
4,Prospects for House vote on gun control measur...,WASHINGTON (Reuters) - Prospects dimmed on Mon...,politicsNews,"July 12, 2016",True


In [14]:
#dropping 'date' and 'title' attributes  
data.drop(["date"],axis=1,inplace=True) #we won’t use it for the analysis
data.drop(["title"],axis=1,inplace=True) #we need only text

In [15]:
#Converting the text to lowercase:
data['text'] = data['text'].apply(lambda x: x.lower())

In [16]:
#check
data.head()

Unnamed: 0,text,subject,target
0,aden (reuters) - former yemeni president ali a...,worldnews,True
1,tbilisi (reuters) - kakha kaladze climbed to t...,worldnews,True
2,sarajevo (reuters) - thousands of pensioners f...,worldnews,True
3,new delhi (reuters) - indian prime minister na...,worldnews,True
4,washington (reuters) - prospects dimmed on mon...,politicsNews,True


In [17]:
#Remove punctuation:
import string
def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str
data['text'] = data['text'].apply(punctuation_removal)

In [18]:
#Remove stopwords:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() 
                                                      if word not in (stop)]))

[nltk_data] Downloading package stopwords to C:\Users\Ishita
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
#check
data.head()

Unnamed: 0,text,subject,target
0,aden reuters former yemeni president ali abdul...,worldnews,True
1,tbilisi reuters kakha kaladze climbed top worl...,worldnews,True
2,sarajevo reuters thousands pensioners across b...,worldnews,True
3,new delhi reuters indian prime minister narend...,worldnews,True
4,washington reuters prospects dimmed monday us ...,politicsNews,True
