In [87]:
import pandas as pd
from nltk.corpus import stopwords

In [88]:
fakedf = pd.read_csv('../data/Fake.csv')
truedf = pd.read_csv('../data/True.csv')
fakedf.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [89]:
truedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [90]:
fakedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


### Steps to consider for Preprocessing
1. Remove Null values
2. Remove unnecessary columns and combine title and text columns, if needed
3. Apply target variable (Fake, True) to the dataset
4. Combine the Fake and True datasets
5. Tokenize and remove stopwords. Will do lemmatization later to check its impact on model performance
6. Convert text into Word2Vec Embedding (Pretrained model)

In [91]:
truedf.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [92]:
fakedf.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [93]:
truedf['article'] = truedf['title'] + " " + truedf['text']
truedf['target'] = 0
fakedf['article'] = fakedf['title'] + " " + fakedf['text']
fakedf['target'] = 1

In [94]:
truedf.drop(columns=['title','text','subject', 'date'], axis=1, inplace=True)
fakedf.drop(columns=['title','text','subject', 'date'], axis=1, inplace=True)

In [95]:
# truedf = truedf.iloc[0:100,]
# fakeedf = fakedf.iloc[0:100,]

In [96]:
articledf = pd.concat([truedf, fakedf], ignore_index=True)
articledf.head()

Unnamed: 0,article,target
0,"As U.S. budget fight looms, Republicans flip t...",0
1,U.S. military to accept transgender recruits o...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,0
3,FBI Russia probe helped by Australian diplomat...,0
4,Trump wants Postal Service to charge 'much mor...,0


In [97]:
articledf.tail()

Unnamed: 0,article,target
44893,McPain: John McCain Furious That Iran Treated ...,1
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,1
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,1
44896,How to Blow $700 Million: Al Jazeera America F...,1
44897,10 U.S. Navy Sailors Held by Iranian Military ...,1


In [98]:
stop = set(stopwords.words('english'))
word_count = 0
for i, article in enumerate(articledf['article']):
    sentence_list = article.split(" ")
    filtered_words = [word.lower() for word in sentence_list if word.lower() not in stop]
    word_count += len(filtered_words)
    article = " ".join(filtered_words)
    articledf.iloc[i,0] = article

In [99]:
print(word_count)

11606239


#### Can add lemmatization later

In [100]:
articledf.to_csv('../data/article_preprocessed.csv', index=False)