# Data cleansing

#### This **.ipynb** file converts the text to lowercase, removes columns that are <br>not necessary, and removes punctuation and stopwords.

Removes 'date' and 'title' columns

In [24]:
data.shape

(44898, 2)

In [25]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so the row order (and the previews
# displayed further down) is reproducible across Restart & Run All.
# The seed value itself is arbitrary.
data = shuffle(data, random_state=42)

In [26]:
data.head()

Unnamed: 0,text,label
16012,BEIRUT (Reuters) - With Islamic State near def...,True
5198,WASHINGTON (Reuters) - U.S. Treasury Secretary...,True
15314,BEIJING (Reuters) - U.S. President Donald Trum...,True
346,"(Reuters) - Richard Cordray, a Democrat whose ...",True
5264,"NORTH CHARLESTON, S.C. (Reuters) - U.S. Senato...",True


Makes all letters lowercase in text column

In [27]:
# Lowercase every entry of the 'text' column with the vectorised .str
# accessor; the result overwrites the column, so the original casing is
# not kept anywhere in `data` after this cell.
# NOTE(review): the later stop-word filter uses an exact `word not in
# stopwords` match, so it presumably relies on this step having run first
# -- confirm the stop-word list is all lowercase.
data['text'] = data['text'].str.lower()

In [28]:
data.head()

Unnamed: 0,text,label
16012,beirut (reuters) - with islamic state near def...,True
5198,washington (reuters) - u.s. treasury secretary...,True
15314,beijing (reuters) - u.s. president donald trum...,True
346,"(reuters) - richard cordray, a democrat whose ...",True
5264,"north charleston, s.c. (reuters) - u.s. senato...",True


Removes punctuation characters (e.g. `,` `.` `(` `)` `-`) from the text column

In [29]:
from string import punctuation

# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single pass over the text instead of a
# Python-level loop with a substring search per character.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Characters are deleted, not replaced by a space, so ``"u.s."`` becomes
    ``"us"`` and whitespace around removed characters is left untouched.
    Only the characters listed in ``string.punctuation`` are affected.

    Parameters
    ----------
    text : str
        Text to clean. Non-string values (e.g. ``None`` or the ``NaN`` that
        pandas uses for missing text) are returned unchanged instead of
        raising, so the function is safe to ``apply`` to a column that
        contains missing entries.

    Returns
    -------
    str
        The text without punctuation characters (or the original object
        when ``text`` is not a string).
    """
    if not isinstance(text, str):
        return text
    return text.translate(_PUNCTUATION_TABLE)

# Equivalent one-line form of the character filter, kept for reference:
#     ''.join(char for char in text if char not in punctuation)

# Strip punctuation from every row; the cleaned strings replace the 'text'
# column, so `data` no longer holds the punctuated text after this line.
data['text'] = data['text'].apply(remove_punctuation)

In [30]:
data.head()

Unnamed: 0,text,label
16012,beirut reuters with islamic state near defeat...,True
5198,washington reuters us treasury secretary stev...,True
15314,beijing reuters us president donald trump tol...,True
346,reuters richard cordray a democrat whose resi...,True
5264,north charleston sc reuters us senator tim sc...,True


Removes stopwords (e.g. as, at, be, both) from the text column

In [31]:
import nltk
# Fetch the NLTK 'stopwords' package into the local nltk_data directory.
# When the package is already present the call only reports that it is
# up to date (see the cell output).
nltk.download('stopwords')
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the object imported from
# nltk.corpus to the English stop-word collection it returns. From here on
# `stopwords` is that collection -- remove_stopwords() reads it by this name.
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/belis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words dropped.

    The text is split on whitespace and re-joined with single spaces, so
    runs of spaces, tabs or newlines collapse to one space in the result.
    Matching is exact and case-sensitive: a word is dropped only if it is
    equal to an entry of the stop-word collection.

    Parameters
    ----------
    text : str
        Text to filter. Non-string values (e.g. ``None`` or the ``NaN`` that
        pandas uses for missing text) are returned unchanged instead of
        raising.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stopwords``
        collection is used, which keeps ``.apply(remove_stopwords)`` working
        as before.

    Returns
    -------
    str
        The remaining words joined by single spaces (or the original object
        when ``text`` is not a string).
    """
    if not isinstance(text, str):
        return text
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests; scanning a list for every
    # word of every row is what made the original loop slow.
    excluded = set(stop_words)
    return ' '.join(word for word in text.split() if word not in excluded)

# Equivalent one-line form of the word filter, kept for reference:
#     ' '.join(word for word in text.split() if word not in stopwords)

# Drop stop words from every row; the filtered strings replace the 'text'
# column, so `data` no longer holds the unfiltered text after this line.
data['text'] = data['text'].apply(remove_stopwords)

In [33]:
data.head()

Unnamed: 0,text,label
16012,beirut reuters islamic state near defeat syria...,True
5198,washington reuters us treasury secretary steve...,True
15314,beijing reuters us president donald trump told...,True
346,reuters richard cordray democrat whose resigna...,True
5264,north charleston sc reuters us senator tim sco...,True
