In [7]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords
from datetime import datetime
#nltk.download('stopwords')
#nltk.download('punkt')


def parse_date(x):
    for fmt in ("%B %d, %Y", "%b %d, %Y"):
        try:
            return datetime.strptime(x.strip(), fmt)
        except ValueError:
            continue
    return None

# Load the data
#Combine the two datasets and shuffle them
data1=pd.read_csv('../data/raw/Fake.csv',delimiter=',')
data2=pd.read_csv('../data/raw/True.csv',delimiter=',')
data1['label']=0
data2['label']=1

data=pd.concat([data1,data2],ignore_index=True)
data=data.sample(frac=1).reset_index(drop=True)

data.loc[data["date"] == "14-Feb-18", "date"] = "February 14, 2018"
data.loc[data["date"] == "15-Feb-18", "date"] = "February 15, 2018"
data.loc[data["date"] == "16-Feb-18", "date"] = "February 16, 2018"
data.loc[data["date"] == "17-Feb-18", "date"] = "February 17, 2018"
data.loc[data["date"] == "18-Feb-18", "date"] = "February 18, 2018"
data.loc[data["date"] == "19-Feb-18", "date"] = "February 19, 2018"

data["date"]=data["date"].apply(parse_date)

x_train=data.drop(columns=['label'])
y_train=data['label']

print(x_train.head())
print(y_train.head())

                                               title  \
0  Competing efforts to end South Sudan's war pro...   
1  House Democratic leader Pelosi says intel pane...   
2   The Koch Brothers And Warren Buffett Try To T...   
3  DEMOCRAT LAWMAKER Puts Forth Bill Requiring Wi...   
4  NOT KIDDING: Call A Transexual “He” If He Want...   

                                                text       subject       date  
0  NAIROBI (Reuters) - Competing efforts to end S...     worldnews 2017-09-08  
1  WASHINGTON (Reuters) - U.S. House of Represent...  politicsNews 2017-03-28  
2  Billionaire Investor Warren Buffett has portra...          News 2016-03-02  
3  Leave it to a Democrat to waste everyone s tim...     left-news 2016-02-17  
4  The LGBT mafia has never been more threatening...     left-news 2015-12-28  
0    1
1    1
2    0
3    0
4    0
Name: label, dtype: int64


In [8]:
#preprocessing the text data

#Lowering the text
x_train['title']=x_train['title'].str.lower()
x_train['text']=x_train['text'].str.lower()
x_train['subject']=x_train['subject'].str.lower()

#tokenization
tokenized=[nltk.word_tokenize(i) for i in x_train['text']]

#removing stopwords and non-alphabetic characters
#stemming the words
ps=nltk.PorterStemmer()

stopwords=set(stopwords.words('english'))

for i in range(len(tokenized)):
    tokenized[i]=[word if word.isalpha() else '' for word in tokenized[i]]
    tokenized[i]=[word if word not in stopwords else '' for word in tokenized[i] ]
    tokenized[i]=[ps.stem(word) if word else '' for word in tokenized[i]]


#joining the tokens back to form the text
for i in range(len(tokenized)):
    tokenized[i]=' '.join(tokenized[i])
x_train['text']=tokenized

print(x_train['text'].head())


0    nairobi  reuter   compet effort  end south sud...
1    washington  reuter    hous  repres democrat le...
2    billionair investor warren buffett  portray   ...
3    leav    democrat  wast everyon  time  money  p...
4     lgbt mafia  never   threaten  power   call  t...
Name: text, dtype: object


In [None]:
#filling missing values
x_train['title']=x_train['title'].fillna('')

x_train['text']=x_train['text'].fillna('')

x_train['subject']=x_train['subject'].fillna('')

x_train['date']=x_train['date'].ffill()


#Saving the preprocessed data
x_train.to_csv('../data/processed/x_train.csv',index=False)
y_train.to_csv('../data/processed/y_train.csv',index=False)