In [15]:
from datetime import datetime
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')


def parse_date(x):
    """Parse a raw date cell into a ``datetime``.

    The following formats are tried in order:

    * ``"%B %d, %Y"`` -- e.g. ``"December 31, 2017"``
    * ``"%b %d, %Y"`` -- e.g. ``"Dec 31, 2017"``
    * ``"%d-%b-%y"``  -- e.g. ``"19-Feb-18"``

    Parameters
    ----------
    x : object
        Raw value from the ``date`` column. Leading/trailing whitespace is
        ignored when ``x`` is a string.

    Returns
    -------
    datetime or None
        The parsed date, or ``None`` when ``x`` is not a string (for example
        NaN/None for a missing cell) or matches none of the formats above.
    """
    # Non-string values (NaN, None, numbers) have no .strip(); report them as
    # unparseable instead of raising AttributeError inside DataFrame.apply.
    if not isinstance(x, str):
        return None

    text = x.strip()
    for fmt in ("%B %d, %Y", "%b %d, %Y", "%d-%b-%y"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            # Wrong format for this value; try the next one.
            continue
    return None

# Load the data
# Label convention: 0 = row from Fake.csv, 1 = row from True.csv.
data1 = pd.read_csv('../data/raw/Fake.csv', delimiter=',')
data2 = pd.read_csv('../data/raw/True.csv', delimiter=',')
data1['label'] = 0
data2['label'] = 1

# Combine the two datasets and shuffle them.
# The fixed seed keeps the shuffled row order identical across kernel
# restarts, so every downstream output is reproducible.
SEED = 42
data = pd.concat([data1, data2], ignore_index=True)
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Dates written as "14-Feb-18" ... "19-Feb-18" are rewritten to the
# "Month DD, YYYY" form before parsing.
for day in range(14, 20):
    data.loc[data["date"] == f"{day}-Feb-18", "date"] = f"February {day}, 2018"

# Values parse_date cannot interpret come back as None (missing).
data["date"] = data["date"].apply(parse_date)

# Split features from the target column.
x_train = data.drop(columns=['label'])
y_train = data['label']

print(x_train.head())
print(y_train.head())

                                               title  \
0   A Panicked Trump Ran Around The White House A...   
1   WATCH: Will Ferrell Skewers Trump As He Repri...   
2   Watch Heartbreaking Cries From An Abused Pupp...   
3  U.S. intel committee subpoenas comedian in Rus...   
4  MESSAGE TO THE LEFT ON IMMIGRATION: If You Dis...   

                                                text    subject       date  
0  This incredibly awkward story comes from New Y...       News 2017-06-30  
1  Last night, during the official White House Co...       News 2017-04-30  
2  A video of a terrified and previously abused p...       News 2016-01-02  
3  (Reuters) - A New York comedian has been compe...  worldnews 2017-11-30  
4  IT S A MUST READ! The first month of the Trump...   politics 2017-02-27  
0    0
1    0
2    0
3    1
4    0
Name: label, dtype: int64


In [16]:
# Preprocessing the text data

# Fill missing values in the string columns with '' and lower-case them.
for col in ('title', 'text', 'subject'):
    x_train[col] = x_train[col].fillna('').str.lower()

# NOTE(review): the rows were shuffled when the data was loaded, so a forward
# fill copies the date of an unrelated article into each missing slot (and a
# missing date in the very first row stays missing). Confirm this is intended.
x_train['date'] = x_train['date'].ffill()

ps = nltk.PorterStemmer()

# Bind the stop-word set to its own name. Re-using the name `stopwords` would
# overwrite the imported nltk corpus object, and re-running this cell would
# then fail with "'set' object has no attribute 'words'".
stop_words = set(stopwords.words('english'))

# Tokenize each article, keep only alphabetic tokens that are not stop words,
# stem them, and join them back into a single space-separated string.
tokenized = []
for document in x_train['text']:
    stems = [
        ps.stem(word)
        for word in nltk.word_tokenize(document)
        if word.isalpha() and word not in stop_words
    ]
    # Articles with no surviving tokens are stored as a single space.
    tokenized.append(' '.join(stems) if stems else ' ')

x_train['text'] = tokenized

print(x_train['text'].head())


0    incred awkward stori come new york post report...
1    last night offici white hous correspond dinner...
2    video terrifi previous abus puppi shown love f...
3    reuter new york comedian compel appear hous in...
4    must read first month trump administr alreadi ...
Name: text, dtype: object


In [17]:
# Saving the preprocessed data
processed_dir = Path('../data/processed')
# to_csv does not create missing folders, so make sure the output directory
# exists (a no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to either file.
x_train.to_csv(processed_dir / 'x_train.csv', index=False)
y_train.to_csv(processed_dir / 'y_train.csv', index=False)