In [12]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords
from datetime import datetime
#nltk.download('stopwords')
#nltk.download('punkt')

# Function to parse dates with mixed formats
def parse_mixed_date(x):
    for fmt in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(x, fmt)
        except ValueError:
            continue
    return pd.NaT


# Load the data
#Combine the two datasets and shuffle them
data1=pd.read_csv('../data/raw/Fake.csv',delimiter=',')
data2=pd.read_csv('../data/raw/True.csv',delimiter=',')
data1['label']=0
data2['label']=1

data=pd.concat([data1,data2],ignore_index=True)
data=data.sample(frac=1).reset_index(drop=True)
data["date"]=data["date"].apply(parse_mixed_date)

x_train=data.drop(columns=['label'])
y_train=data['label']

print(x_train.head())

print(y_train.head())

                                               title  \
0         REDUX 1963? The Deep State vs Donald Trump   
1  CATHERINE ENGELBRECHT: “An Attack On One Right...   
2  WATCH VETERAN Embarrass Trump Hater In Kansas ...   
3  PENTAGON REQUEST DENIED: [Video] ARMED CITIZEN...   
4  U.S. House lawmaker to release revamped Dodd-F...   

                                                text          subject  \
0  Patrick Henningsen 21st Century WireWatching t...          US_News   
1  This woman is a hero! Catherine Engelbrecht st...  Government News   
2  This veteran exposes Trump hating protester in...        left-news   
3  Ordinary citizens (many of them are proud vete...  Government News   
4  WASHINGTON (Reuters) - The head of the U.S. Ho...     politicsNews   

        date  
0 2017-01-15  
1 2016-04-06  
2 2016-03-17  
3 2015-07-25  
4        NaT  
0    0
1    0
2    0
3    0
4    1
Name: label, dtype: int64


In [13]:
#preprocessing the text data

#Lowering the text
x_train['title']=x_train['title'].str.lower()
x_train['text']=x_train['text'].str.lower()
x_train['subject']=x_train['subject'].str.lower()

#tokenization
tokenized=[nltk.word_tokenize(i) for i in x_train['text']]

#removing stopwords and non-alphabetic characters
#stemming the words
ps=nltk.PorterStemmer()

stopwords=set(stopwords.words('english'))

for i in range(len(tokenized)):
    tokenized[i]=[word if word.isalpha() else '' for word in tokenized[i]]
    tokenized[i]=[word if word not in stopwords else '' for word in tokenized[i] ]
    tokenized[i]=[ps.stem(word) if word else '' for word in tokenized[i]]


#joining the tokens back to form the text
for i in range(len(tokenized)):
    tokenized[i]=' '.join(tokenized[i])
x_train['text']=tokenized

print(x_train['text'].head())


0    patrick henningsen  centuri wirewatch  week  s...
1     woman   hero  catherin engelbrecht stood    i...
2     veteran expos trump hate protest  kansa citi ...
3    ordinari citizen  mani    proud veteran   fill...
4    washington  reuter    head    hous  repres com...
Name: text, dtype: object


In [14]:
#Saving the preprocessed data
x_train.to_csv('../data/processed/x_train.csv',index=False)
y_train.to_csv('../data/processed/y_train.csv',index=False)