In [38]:
import os
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from string import digits 
import string
from textblob import TextBlob
import numpy as np
import matplotlib.pyplot as plt

#nltk.download('punkt')

In [39]:
## Loading data set
# unzipping dataset
import zipfile
with zipfile.ZipFile("../dataset/fake-news.zip", 'r') as zip_ref:
    zip_ref.extractall("../dataset/")

In [40]:
##Loading dataset
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

### Understanding Data

Understanding Data Types and Missing Information

In [41]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [42]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [43]:
train.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [44]:
## Check for missing values
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

_Author column has more than 5% missing values - cannot drop all rows. 
Imputing with empty text_

In [45]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [46]:
## Checking the structure of data
train.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [2]:
## Replacing null columns with empty string
train = train.replace(np.nan, '', regex=True)
test = test.replace(np.nan, '', regex=True)

In [64]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              5200 non-null   int64  
 1   title           5200 non-null   object 
 2   author          5200 non-null   object 
 3   text            5200 non-null   object 
 4   original_text   5200 non-null   object 
 5   text_words      5200 non-null   object 
 6   text_digit_cnt  5200 non-null   int64  
 7   clean_text_lem  5200 non-null   object 
 8   clean_text      5200 non-null   object 
 9   title_polarity  5200 non-null   float64
 10  text_polarity   5200 non-null   float64
 11  ttl_wrds        5200 non-null   int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 487.6+ KB


In [49]:
## Checking null values after imputing
print(train.isnull().sum())
print(test.isnull().sum())

id        0
title     0
author    0
text      0
label     0
dtype: int64
id        0
title     0
author    0
text      0
dtype: int64


## Data Processing and Cleaning

**Hypothesis:** Fake news would have more polarizing words in article and title, and be longer and have less reliable citations

### Feature Engineering 

To check above hypothesis, add following features:
    - total words in article
    - sentiment polarity of title
    - sentiment polarity of article
    - number of words in article with all digits

---------------
Steps:
    1. cleaning and processing 
    2. remove stop words
        cnt digits in columns
    3. cnt words in all columns (3 features)
    4. find action words
    5. calculate sentiment

In [50]:
## Getting column names
text_cols = train.columns[1:-1]

In [51]:
## Saving the orignal text column before modifications
train['original_text'] = train['text']
test['original_text'] = test['text']

In [52]:
## Changing colums to lowercase
for col in text_cols:
    train[col] = train.apply(lambda x: x[col].lower(),axis=1)
    test[col] = test.apply(lambda x: x[col].lower(),axis=1)

In [53]:
## removing punctuations
for col in text_cols:
    train[col] = train.apply(lambda x: x[col].translate(str.maketrans('','',string.punctuation)),axis=1)
    test[col] = test.apply(lambda x: x[col].translate(str.maketrans('','',string.punctuation)),axis=1)    

In [54]:
## Word tokenization
train['text_words'] = train.apply(lambda x: word_tokenize(x['text']),axis=1)
test['text_words'] = test.apply(lambda x: word_tokenize(x['text']),axis=1)

In [55]:
## Create new features 
## Number of digits ( to see if citations are made)

def cnt_digits(sentence):
    return sum(c.isdigit() for c in sentence)
        
train['text_digit_cnt'] = train.apply(lambda x: cnt_digits(x.text_words),axis=1)
test['text_digit_cnt'] = test.apply(lambda x: cnt_digits(x.text_words),axis=1)

In [56]:
## removing digits
for col in text_cols:
    train[col] = train.apply(lambda x: x[col].translate(str.maketrans('', '', digits)) ,axis=1)
    test[col] = test.apply(lambda x: x[col].translate(str.maketrans('', '', digits) ) ,axis=1)

In [57]:
## tokenzize again after digits removal
train['text_words'] = train.apply(lambda x: word_tokenize(x['text']),axis=1)
test['text_words'] = test.apply(lambda x: word_tokenize(x['text']),axis=1)

In [58]:
## lemmatize verbs and nouns
## in final model only nouns have been lemmatize, verbs used to indicate polarity

def lemmat_words(words,pos):
    lemmated = [lemmatizer.lemmatize(word,pos) for word in words]
    return lemmated

lemmatizer = WordNetLemmatizer()
train['clean_text_lem'] = train.apply(lambda x: lemmat_words(x['text_words'],pos = wordnet.NOUN),axis=1)
test['clean_text_lem'] = test.apply(lambda x: lemmat_words(x['text_words'],pos = wordnet.NOUN),axis=1)

In [59]:
##remove stopwords
import nltk
#nltk.download('stopwords')

## stop words from nltk module
stop_words = list(set(stopwords.words('english')))

def rm_stopwords(text_words):
    stopped = [w for w in text_words if not w in stop_words]
    return stopped

train['clean_text'] = train.apply(lambda x: rm_stopwords(x['clean_text_lem']),axis=1)
test['clean_text'] = test.apply(lambda x: rm_stopwords(x['clean_text_lem']),axis=1)

In [61]:
## Calculate Sentiment
def polarity(words):
    text = ' '.join([w for w in words])
    blob = TextBlob(text)
    return(blob.polarity)

train['title_polarity'] = train.apply(lambda x: polarity(x['title']),axis=1)
train['text_polarity'] = train.apply(lambda x: polarity(x['clean_text']),axis=1)
test['title_polarity'] = test.apply(lambda x: polarity(x['title']),axis=1)
test['text_polarity'] = test.apply(lambda x: polarity(x['clean_text']),axis=1)

In [62]:
## length of article
train['ttl_wrds'] = train.apply(lambda x: len(x.text_words),axis = 1)
test['ttl_wrds'] = test.apply(lambda x: len(x.text_words),axis = 1)

In [63]:
train.head(5)

Unnamed: 0,id,title,author,text,label,original_text,text_words,text_digit_cnt,clean_text_lem,clean_text,title_polarity,text_polarity,ttl_wrds
0,0,house dem aide we didn’t even see comey’s lett...,darrell lucus,house dem aide we didn’t even see comey’s lett...,1,House Dem Aide: We Didn’t Even See Comey’s Let...,"[house, dem, aide, we, didn, ’, t, even, see, ...",6,"[house, dem, aide, we, didn, ’, t, even, see, ...","[house, dem, aide, ’, even, see, comey, ’, let...",0.0,0.026726,857
1,1,flynn hillary clinton big woman on campus bre...,daniel j flynn,ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,"[ever, get, the, feeling, your, life, circles,...",9,"[ever, get, the, feeling, your, life, circle, ...","[ever, get, feeling, life, circle, roundabout,...",0.0,0.077613,742
2,2,why the truth might get you fired,consortiumnewscom,why the truth might get you fired october \n...,1,"Why the Truth Might Get You Fired October 29, ...","[why, the, truth, might, get, you, fired, octo...",8,"[why, the, truth, might, get, you, fired, octo...","[truth, might, get, fired, october, tension, i...",0.0,0.083994,1293
3,3,civilians killed in single us airstrike have ...,jessica purkiss,videos civilians killed in single us airstrik...,1,Videos 15 Civilians Killed In Single US Airstr...,"[videos, civilians, killed, in, single, us, ai...",13,"[video, civilian, killed, in, single, u, airst...","[video, civilian, killed, single, u, airstrike...",0.0,0.021485,555
4,4,iranian woman jailed for fictional unpublished...,howard portnoy,print \nan iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,"[print, an, iranian, woman, has, been, sentenc...",2,"[print, an, iranian, woman, ha, been, sentence...","[print, iranian, woman, ha, sentenced, six, ye...",0.0,0.047143,160


In [65]:
##Saving data
train.to_csv('../dataset/train_cleaned.csv',index= False)
test.to_csv('../dataset/test_cleaned.csv',index= False)