## Import Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
plt.style.use('ggplot')

In [3]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
print(f'There are {data.shape[0]} Rows and {data.shape[1]} columns in our data')

There are 20800 Rows and 5 columns in our data


In [5]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
data[data['title'].isnull()]

Unnamed: 0,id,title,author,text,label
53,53,,Dairy✓ᵀᴿᵁᴹᴾ,Sounds like he has our president pegged. What ...,1
120,120,,Anonymous,"Same people all the time , i dont know how you...",1
124,124,,SeekSearchDestory,"You know, outside of any morality arguments, i...",1
140,140,,Anonymous,There is a lot more than meets the eye to this...,1
196,196,,Raffie,They got the heater turned up on high.,1
...,...,...,...,...,...
20568,20568,,Cathy Milne,"Amusing comment Gary! “Those week!” So, are ...",1
20627,20627,,Ramona,"No she doesn’t have more money than God, every...",1
20636,20636,,Dave Lowery,Trump all the way!,1
20771,20771,,Letsbereal,DYN's Statement on Last Week's Botnet Attack h...,1


there are 558 null row in our data , so let's handle it

In [7]:
data = data[data['title'].notna()]

In [8]:
data.isnull().sum()

id           0
title        0
author    1957
text        39
label        0
dtype: int64

In [9]:
data['content'] = data['author'] + " " + data['title']

In [10]:
data.drop(['title','author','text'],axis=1,inplace=True)

In [11]:
data.head()

Unnamed: 0,id,label,content
0,0,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,1,Consortiumnews.com Why the Truth Might Get You...
3,3,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,1,Howard Portnoy Iranian woman jailed for fictio...


In [12]:
data['content'].iloc[0]

'Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

## Data preprocessing

### 1) conver letters into lower case 

In [130]:
data['content'] = data['content'].astype(str).apply(lambda x:x.lower())

In [131]:
data.head()

Unnamed: 0,id,label,content
0,0,1,darrell lucus house dem aide: we didn’t even s...
1,1,0,"daniel j. flynn flynn: hillary clinton, big wo..."
2,2,1,consortiumnews.com why the truth might get you...
3,3,1,jessica purkiss 15 civilians killed in single ...
4,4,1,howard portnoy iranian woman jailed for fictio...


### 2) Remove punctuation 

In [132]:
punctuation = string.punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [133]:
def removePunctuation(text):
    text = [word for word in text if word not in punctuation]
    return "".join(text)

In [134]:
removePunctuation('hello ahmed !!')

'hello ahmed '

In [135]:
data['content'] = data['content'].apply(removePunctuation)

In [136]:
data['content'].head()

0    darrell lucus house dem aide we didn’t even se...
1    daniel j flynn flynn hillary clinton big woman...
2    consortiumnewscom why the truth might get you ...
3    jessica purkiss 15 civilians killed in single ...
4    howard portnoy iranian woman jailed for fictio...
Name: content, dtype: object

### 3) Tokenize Words

In [137]:
from nltk.tokenize import word_tokenize

In [138]:
data['content'] = data['content'].apply(word_tokenize)

In [139]:
data.head()

Unnamed: 0,id,label,content
0,0,1,"[darrell, lucus, house, dem, aide, we, didn, ’..."
1,1,0,"[daniel, j, flynn, flynn, hillary, clinton, bi..."
2,2,1,"[consortiumnewscom, why, the, truth, might, ge..."
3,3,1,"[jessica, purkiss, 15, civilians, killed, in, ..."
4,4,1,"[howard, portnoy, iranian, woman, jailed, for,..."


### 4) Remove Stopwords

In [140]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [141]:
def removeStopWords(text):
    text= [word for word in text if word not in stop_words]
    return text

In [142]:
# berfore removing stopwords you have to tokenize the data 
removeStopWords(['this','is','ahmed','basem'])

['ahmed', 'basem']

In [143]:
data['content'] = data['content'].apply(removeStopWords)

In [145]:
data.head()

Unnamed: 0,id,label,content
0,0,1,"[darrell, lucus, house, dem, aide, ’, even, se..."
1,1,0,"[daniel, j, flynn, flynn, hillary, clinton, bi..."
2,2,1,"[consortiumnewscom, truth, might, get, fired]"
3,3,1,"[jessica, purkiss, 15, civilians, killed, sing..."
4,4,1,"[howard, portnoy, iranian, woman, jailed, fict..."


### 5) Stemming

In [147]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [149]:
def wordStemming(text):
    text = [stemmer.stem(word) for word in text]
    return text

In [150]:
data['content'] = data['content'].apply(wordStemming)

In [151]:
data.head()

Unnamed: 0,id,label,content
0,0,1,"[darrel, lucu, hous, dem, aid, ’, even, see, c..."
1,1,0,"[daniel, j, flynn, flynn, hillari, clinton, bi..."
2,2,1,"[consortiumnewscom, truth, might, get, fire]"
3,3,1,"[jessica, purkiss, 15, civilian, kill, singl, ..."
4,4,1,"[howard, portnoy, iranian, woman, jail, fictio..."


### 6) Lemmatizing

In [152]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [153]:
def wordLemmatizing(text):
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

In [154]:
data['content'] = data['content'].apply(wordLemmatizing)

In [155]:
data.head()

Unnamed: 0,id,label,content
0,0,1,"[darrel, lucu, hous, dem, aid, ’, even, see, c..."
1,1,0,"[daniel, j, flynn, flynn, hillari, clinton, bi..."
2,2,1,"[consortiumnewscom, truth, might, get, fire]"
3,3,1,"[jessica, purkiss, 15, civilian, kill, singl, ..."
4,4,1,"[howard, portnoy, iranian, woman, jail, fictio..."


In [157]:
data['content'] = data['content'].apply(lambda x : " ".join(x))

In [158]:
data.head()

Unnamed: 0,id,label,content
0,0,1,darrel lucu hous dem aid ’ even see comey ’ le...
1,1,0,daniel j flynn flynn hillari clinton big woman...
2,2,1,consortiumnewscom truth might get fire
3,3,1,jessica purkiss 15 civilian kill singl u airst...
4,4,1,howard portnoy iranian woman jail fiction unpu...


In [159]:
data.to_csv('processedData.csv')