## Exploratory Data Analysis

#### Understand the data - Clean the data - Analyse the relationships within the data


In [1]:
import pandas as pd

# Read the train and test splits; both paths are relative to the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [2]:
# Show the column labels of the training split.
train_df.columns

Index(['id', 'label', 'tweet'], dtype='object')

## 1. Data Exploration
#### 1.1 Check for missing data and rectify

In [3]:
# Check each split separately for missing tweets; True means at least one NaN.
# (The second check previously repeated train_df, so test_df was never inspected.)
print(train_df.tweet.isna().any())
print(test_df.tweet.isna().any())

False
False


Neither check reports any missing values in the `tweet` column.

#### 1.2 Data cleaning

In [4]:
# Seed the working text column of each split with the raw tweet text.
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['tweet']

# collect the hashtags of a tweet
def getHashTags(text):
    """Return the hashtags found in ``text`` as one space-separated string.

    A hashtag is any whitespace-delimited token that begins with ``#``; the
    leading ``#`` is dropped and the rest of the token is kept unchanged
    (trailing punctuation included). Returns an empty string when no token
    starts with ``#``.
    """
    stripped_tags = [token[1:] for token in text.split() if token.startswith('#')]
    # strip() removes the edge spaces produced by a lone "#" token.
    return ' '.join(stripped_tags).strip()

# Store each training tweet's hashtags (without the leading '#') in their own column.
train_df['hashTags'] = train_df['cleaned_text'].apply(getHashTags)

In [5]:
#remove url
import re

# A URL is taken to be "http" followed by every character up to the next whitespace.
url_pattern = re.compile(r'http\S+')
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda tweet_text: url_pattern.sub('', tweet_text)
)

In [6]:
#expand contractions (replaces the apostrophe forms with full words)

def removeApostrophes(text):
    """Expand common English contractions in ``text`` and return the result.

    Replacement is plain, case-sensitive substring substitution applied in
    the order the entries are listed below. Known limitation: every ``'s``
    becomes `` is``, so possessives are expanded as well.
    """
    # Order matters: the irregular whole-word forms must be handled before the
    # generic "n't" suffix, otherwise "can't"/"won't"/"shan't" would turn into
    # "ca not"/"wo not"/"sha not".
    aposMap = {
        "'s": ' is',
        "'re" : ' are',
        "'m" : ' am',
        "can't" : 'cannot',
        "won't" : 'will not',
        "shan't" : 'shall not',
        "ain't" : 'is not',
        "n't" : ' not'
    }

    # str.replace is a no-op when the pattern is absent, so no membership test is needed.
    for apos, expansion in aposMap.items():
        text = text.replace(apos, expansion)

    return text

# Expand contractions in every training tweet.
train_df['cleaned_text'] = train_df['cleaned_text'].apply(removeApostrophes)

In [7]:
# keep ASCII only: characters that cannot be encoded as ASCII are silently dropped
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda tweet_text: tweet_text.encode('ascii', errors='ignore').decode('ascii')
)

In [8]:
#strip punctuation and digits
import re

def removePunctuations(text):
    """Return ``text`` without punctuation characters and ASCII digits.

    First every character that is neither a word character (``\\w``) nor
    whitespace is deleted, then the digits ``0``-``9`` are deleted.
    Underscores survive because ``\\w`` matches them; whitespace is left
    untouched.
    """
    withoutSymbols = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'[0-9]', '', withoutSymbols)

# Strip punctuation and digits from every training tweet.
train_df['cleaned_text'] = train_df['cleaned_text'].apply(removePunctuations)

In [10]:
#remove stopwords
from nltk.corpus import stopwords

# A set gives constant-time membership tests while filtering every token.
stop_words = set(stopwords.words('english'))
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda tweet_text: ' '.join(token for token in tweet_text.split() if token not in stop_words)
)


In [11]:
# Drop the stand-alone token "user" (what "@user" mentions become once the
# punctuation has been stripped). Only whole tokens are removed: a substring
# replace would also damage longer words ("users" -> "s") and leave stray spaces.
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda tweet_text: ' '.join(token for token in tweet_text.split() if token != 'user')
)
train_df

Unnamed: 0,id,label,tweet,cleaned_text,hashTags
0,1,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfu...,run
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cannot use cause offer wh...,lyft disapointed getthanked
2,3,0,bihday your majesty,bihday majesty,
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur,model
4,5,0,factsguide: society now #motivation,factsguide society motivation,motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talking leave chaos pay disp...,allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny,
7,8,0,the next school year is the year for exams.ð...,next school year year exams cannot think schoo...,school exams hate imagine actorslife revolutio...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land allin cavs champions cleveland cleve...,allin cavs champions cleveland clevelandcavaliers
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome gr,gr8


In [12]:
# keep only the tokens that the en_US dictionary recognises

import enchant

english_dict = enchant.Dict("en_US")
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda tweet_text: ' '.join(filter(english_dict.check, tweet_text.split()))
)


Unnamed: 0,id,label,tweet,cleaned_text,hashTags
0,1,0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,run
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks credit cannot use cause offer wheelchai...,lyft disapointed getthanked
2,3,0,bihday your majesty,majesty,
3,4,0,#model i love u take with u all the time in ...,model love u take u time,model
4,5,0,factsguide: society now #motivation,society motivation,motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talking leave chaos pay disp...,allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow,
7,8,0,the next school year is the year for exams.ð...,next school year year exams cannot think schoo...,school exams hate imagine actorslife revolutio...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land champions,allin cavs champions cleveland clevelandcavaliers
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome gr,gr8


In [14]:
from nltk.stem.porter import PorterStemmer

# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda tweet_text: ' '.join(map(stemmer.stem, tweet_text.split()))
)

In [15]:
# Display the training frame after stemming.
train_df 

Unnamed: 0,id,label,tweet,cleaned_text,hashTags
0,1,0,@user when a father is dysfunctional and is s...,father dysfunct selfish drag kid dysfunct run,run
1,2,0,@user @user thanks for #lyft credit i can't us...,thank credit cannot use cau offer wheelchair van,lyft disapointed getthanked
2,3,0,bihday your majesty,majesti,
3,4,0,#model i love u take with u all the time in ...,model love u take u time,model
4,5,0,factsguide: society now #motivation,societi motiv,motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare big talk leav chao pay disput get,allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @use...,camp tomorrow,
7,8,0,the next school year is the year for exams.ð...,next school year year exam cannot think school...,school exams hate imagine actorslife revolutio...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land champion,allin cavs champions cleveland clevelandcavaliers
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcom gr,gr8
