# Data cleansing

#### This **.ipynb** file converts the text to lowercase, removes columns that are <br>not necessary, and removes punctuation and stopwords.

Removes 'date' and 'title' columns

In [9]:
# Report the (rows, columns) dimensions of the frame before the text cleaning below.
data.shape

(44898, 2)

In [10]:
from sklearn.utils import shuffle

# Randomly permute the rows of `data`. The fixed random_state makes the
# resulting row order (and therefore every preview shown below) reproducible
# when the notebook is restarted and run from the top.
data = shuffle(data, random_state=42)

In [11]:
# Preview the first rows after shuffling.
data.head()

Unnamed: 0,text,label
8965,NEW YORK (Thomson Reuters Foundation) - Aborti...,true
2441,Donald Trump does not care about Jewish people...,fake
10658,"(Reuters) - Five Democratic U.S. senators, inc...",true
3881,An active shooter situation is developing in L...,fake
10896,PARIS (Reuters) - U.S. Republican presidential...,true


Converts all letters in the text column to lowercase

In [12]:
# Lowercase every string in the text column using the .str accessor.
data['text'] = data['text'].str.lower()

In [13]:
# Preview the first rows to confirm the text is now lowercase.
data.head()

Unnamed: 0,text,label
8965,new york (thomson reuters foundation) - aborti...,true
2441,donald trump does not care about jewish people...,fake
10658,"(reuters) - five democratic u.s. senators, inc...",true
3881,an active shooter situation is developing in l...,fake
10896,paris (reuters) - u.s. republican presidential...,true


Removes punctuation characters (such as `,`, `.`, `(`, `)` and `-`) from the text column

In [14]:
from string import punctuation

# Translation table that deletes every character in string.punctuation.
# Built once here so it is not recomputed for each row of the column.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any of the ASCII punctuation characters listed in
        ``string.punctuation``. Characters are deleted, not replaced with a
        space, so ``"u.s."`` becomes ``"us"``. All other characters, including
        whitespace and non-ASCII punctuation such as curly quotes, are kept.
    """
    # str.translate does the filtering in a single pass, replacing the
    # per-character Python loop with a membership test on each character.
    return text.translate(_PUNCTUATION_TABLE)

''' 
or with comprehension list
def remove_punctuation(text):
    return ''.join([char for char in text if char not in punctuation])
'''

# Run remove_punctuation on each value of the text column and store the result back.
data['text'] = data['text'].apply(remove_punctuation)

In [15]:
# Preview the first rows to confirm punctuation has been removed.
data.head()

Unnamed: 0,text,label
8965,new york thomson reuters foundation abortion ...,true
2441,donald trump does not care about jewish people...,fake
10658,reuters five democratic us senators including...,true
3881,an active shooter situation is developing in l...,fake
10896,paris reuters us republican presidential cont...,true


Removes stopwords (such as "as", "at", "be" and "both") from the text column

In [16]:
import nltk
# Fetch the NLTK 'stopwords' package; when it is already installed this only
# prints an "already up-to-date" message.
nltk.download('stopwords')
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the object imported above to
# the value returned by .words('english'). Later cells use `stopwords` as that
# collection of English stopwords, not as the corpus object.
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/belis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stopwords`` dropped.

    Parameters
    ----------
    text : str
        The string to clean. It is tokenised with ``str.split()``, i.e. on
        runs of whitespace.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Original spacing is not
        preserved, and an input made only of stopwords yields ``''``.

    Notes
    -----
    Matching is exact and case-sensitive against the module-level
    ``stopwords`` collection.

    NOTE(review): in this notebook punctuation is stripped from the text
    before this step runs. If the stopword list contains apostrophe forms
    such as "don't", their stripped forms ("dont") will presumably not match
    and will stay in the text. Confirm whether that is intended.
    """
    # Snapshot the stopword collection into a set once per call so each
    # membership test is a hash lookup instead of a linear scan of the list.
    stopword_set = set(stopwords)
    return ' '.join(word for word in text.split() if word not in stopword_set)

''' 
or with comprehension list
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopwords])
'''

# Run remove_stopwords on each value of the text column and store the result back.
data['text'] = data['text'].apply(remove_stopwords)

In [18]:
# Preview the first rows to confirm stopwords have been removed.
data.head()

Unnamed: 0,text,label
8965,new york thomson reuters foundation abortion r...,true
2441,donald trump care jewish people would strongly...,fake
10658,reuters five democratic us senators including ...,true
3881,active shooter situation developing los angele...,fake
10896,paris reuters us republican presidential conte...,true
