## Exploratory Data Analysis

In [30]:
import pandas as pd

# Read the train and test splits from the local data/ directory.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [3]:
# Display the column labels of the training frame.
train_df.columns

Index(['id', 'label', 'tweet'], dtype='object')

## 1. Data Exploration
#### 1.1 Check for missing data and rectify

In [3]:
# Check the tweet text of BOTH splits for missing values:
# first line is the training frame, second line is the test frame.
print(train_df.tweet.isna().any())
print(test_df.tweet.isna().any())

False
False


Both checks print `False`, so the `tweet` column contains no missing values.

#### 1.2 Data cleaning

In [61]:
# Seed a `cleaned_text` working column from the raw `tweet` column in both
# frames; the cleaning steps below overwrite this column, not `tweet`.
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['tweet']

# get the hashtags
def getHashTags(text):
    """Return the hashtags found in ``text`` as one space-separated string.

    A hashtag is any whitespace-delimited token that starts with ``#``; the
    leading ``#`` is dropped and the rest of the token is kept verbatim.
    Returns an empty string when no token starts with ``#``.
    """
    stripped_tags = (token[1:] for token in text.split() if token.startswith('#'))
    return ' '.join(stripped_tags).strip()

# Extract the hashtags of every training tweet into their own column.
train_df['hashTags'] = train_df['cleaned_text'].apply(getHashTags)

In [62]:
# remove url
# `re` is imported in this cell because the notebook's only other
# `import re` sits in a later cell; without it this cell raises NameError
# on a fresh kernel (Restart & Run All).
import re

# Delete every token that starts with "http" (this also covers "https"),
# up to the next whitespace character.
train_df['cleaned_text'] = train_df.cleaned_text.apply(lambda text : re.sub(r'http\S+', '', text))

In [63]:
# expand apostrophe contractions (e.g. "can't" -> "cannot", "i'm" -> "i am")

def removeApostrophes(text):
    """Expand common English apostrophe contractions in ``text``.

    Replacements are plain substring substitutions applied in the order the
    mapping is written, so the irregular whole-word forms ("can't", "won't",
    "shan't", "ain't") must come before the generic "n't" rule; otherwise
    "won't" would turn into "wo not".

    Note that "'s" is always expanded to " is", so possessives are expanded
    the same way as "it's".

    Parameters
    ----------
    text : str
        The text to expand.

    Returns
    -------
    str
        ``text`` with every listed contraction replaced by its expansion.
    """
    aposMap = {
        "'s": ' is',
        "'re" : ' are',
        "'m" : ' am',
        "can't" : 'cannot',
        "won't" : 'will not',
        "shan't" : 'shall not',
        "ain't" : 'is not',
        "n't" : ' not'
    }

    # str.replace is a no-op when the pattern is absent, so no membership
    # check is needed before each substitution.
    for apos, expansion in aposMap.items():
        text = text.replace(apos, expansion)

    return text

# Run the apostrophe handling over every training tweet.
train_df['cleaned_text'] = train_df['cleaned_text'].apply(removeApostrophes)

In [68]:
# encoding decoding

def _drop_non_ascii(text):
    """Return ``text`` with every non-ASCII character removed."""
    ascii_bytes = text.encode('ascii', 'ignore')  # 'ignore' silently drops what cannot be encoded
    return ascii_bytes.decode('ascii')

train_df['cleaned_text'] = train_df['cleaned_text'].apply(_drop_non_ascii)

In [69]:

#remove punctuations
import re

def removePunctuations(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters (``\\w``) include letters, digits and the underscore, so
    underscores are kept; whitespace is left as it is.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

# Strip punctuation from every training tweet.
train_df['cleaned_text'] = train_df['cleaned_text'].apply(removePunctuations)

In [70]:
#remove stopwords
from nltk.corpus import stopwords 
# from wordcloud import STOPWORDS
# A set gives constant-time membership tests for the per-word filter below.
stop_words = set(stopwords.words('english'))
# Keep only the words that are not stop words and re-join them with single spaces.
train_df['cleaned_text'] = train_df['cleaned_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)


In [60]:
test1 = '#model   i love u take with u all the time in urÃ°ÂŸÂ“Â±!!! Ã°ÂŸÂ˜Â™Ã°ÂŸÂ˜ÂŽÃ°ÂŸÂ‘Â„Ã°ÂŸÂ‘Â…Ã°ÂŸÂ’Â¦Ã°ÂŸÂ’Â¦Ã°ÂŸÂ’Â¦  '

# Encode to ASCII bytes, silently dropping every character that cannot be
# represented. This is done inline because `decodeEncoder` is not defined
# anywhere earlier in the notebook, so calling it fails on a fresh kernel.
# NOTE(review): assumes `decodeEncoder` did exactly this encode step - confirm.
test1 = test1.encode('ascii', 'ignore')

print(test1.decode('ascii'))

#model   i love u take with u all the time in ur!!!   


In [71]:
# Preview the first rows to compare the raw `tweet` text with the derived
# `cleaned_text` and `hashTags` columns.
train_df.head(12)

Unnamed: 0,id,label,tweet,cleaned_text,hashTags
0,1,0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...,run
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cannot use cause ...,lyft disapointed getthanked
2,3,0,bihday your majesty,bihday majesty,
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur,model
4,5,0,factsguide: society now #motivation,factsguide society motivation,motivation
5,6,0,[2/2] huge fan fare and big talking before the...,22 huge fan fare big talking leave chaos pay d...,allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @use...,user camping tomorrow user user user user user...,
7,8,0,the next school year is the year for exams.ð...,next school year year exams cannot think schoo...,school exams hate imagine actorslife revolutio...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land allin cavs champions cleveland cleve...,allin cavs champions cleveland clevelandcavaliers
9,10,0,@user @user welcome here ! i'm it's so #gr...,user user welcome gr8,gr8
