In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [86]:
df = pd.read_csv('twitt30k.csv')
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


In [87]:
df.shape

(30000, 2)

In [88]:
df_copy = df.copy()

In [89]:
df_copy['word counts'] = df_copy['twitts'].apply(lambda x : len(x.split()))
df_copy.head()

Unnamed: 0,twitts,sentiment,word counts
0,@robbiebronniman Sounds like a great night.,1,6
1,Damn the person who stolde my wallet !!!!! Ma...,1,18
2,Greetings from the piano bench (photo) http:/...,1,7
3,@drewryanscott i love it!! i love you!! haha f...,1,25
4,"@kissthestars Pretty pretty pretty please, pak...",0,18


# Character counts

In [90]:
def char_counts(x):
    x = x.split()
    x = ''.join(x)
    return len(x)

In [91]:
df_copy['char counts'] = df_copy['twitts'].apply(lambda x : char_counts(x))
df_copy.head()

Unnamed: 0,twitts,sentiment,word counts,char counts
0,@robbiebronniman Sounds like a great night.,1,6,38
1,Damn the person who stolde my wallet !!!!! Ma...,1,18,73
2,Greetings from the piano bench (photo) http:/...,1,7,57
3,@drewryanscott i love it!! i love you!! haha f...,1,25,110
4,"@kissthestars Pretty pretty pretty please, pak...",0,18,113


# Average word length

In [92]:
df_copy['average_word_len'] = df_copy['char counts']/df_copy['word counts']
df_copy.head()

Unnamed: 0,twitts,sentiment,word counts,char counts,average_word_len
0,@robbiebronniman Sounds like a great night.,1,6,38,6.333333
1,Damn the person who stolde my wallet !!!!! Ma...,1,18,73,4.055556
2,Greetings from the piano bench (photo) http:/...,1,7,57,8.142857
3,@drewryanscott i love it!! i love you!! haha f...,1,25,110,4.4
4,"@kissthestars Pretty pretty pretty please, pak...",0,18,113,6.277778


# Stop word counts

In [93]:
from nltk.corpus import stopwords

In [94]:
stop_words = stopwords.words('english')

In [95]:
df['twitts'] = df['twitts'].apply(lambda x : ' '.join([t for t in x.split() if t not in stop_words]))

In [96]:
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like great night.,1
1,Damn person stolde wallet !!!!! May karma come...,1
2,Greetings piano bench (photo) http://twitpic.c...,1
3,@drewryanscott love it!! love you!! haha forge...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


# Count # and @

In [97]:
def remove(x):
    x = x.split()
    for i in range(len(x)):
        if(x[i].startswith('@')):
            x[i] = x[i].replace('@','')
        elif(x[i].startswith('#')):
            x[i].replace('#','')
    return ' '.join(x)

In [98]:
remove('@robbiebronniman Sounds like great night')

'robbiebronniman Sounds like great night'

In [99]:
df['twitts'] = df['twitts'].apply(lambda x : remove(x))

In [100]:
df.head()

Unnamed: 0,twitts,sentiment
0,robbiebronniman Sounds like great night.,1
1,Damn person stolde wallet !!!!! May karma come...,1
2,Greetings piano bench (photo) http://twitpic.c...,1
3,drewryanscott love it!! love you!! haha forget...,1
4,"kissthestars Pretty pretty pretty please, paki...",0


## If numeric digits are present in twitts

In [101]:
df_copy.head()

Unnamed: 0,twitts,sentiment,word counts,char counts,average_word_len
0,@robbiebronniman Sounds like a great night.,1,6,38,6.333333
1,Damn the person who stolde my wallet !!!!! Ma...,1,18,73,4.055556
2,Greetings from the piano bench (photo) http:/...,1,7,57,8.142857
3,@drewryanscott i love it!! i love you!! haha f...,1,25,110,4.4
4,"@kissthestars Pretty pretty pretty please, pak...",0,18,113,6.277778


In [102]:
df_copy['numeric_counts'] = df_copy['twitts'].apply(lambda x : len([t for t in x.split() if t.isdigit()]))

In [103]:
df_copy[df_copy['numeric_counts']==5]

Unnamed: 0,twitts,sentiment,word counts,char counts,average_word_len,numeric_counts
9281,OMG my battery is empty ..... !!! aaaaah and i...,0,20,74,3.7,5
12969,I'm so excited to go home.. 13 hours going to...,1,27,99,3.666667,5
29770,"7 children 1 baby, 82 women, 126 men on the fl...",0,17,62,3.647059,5


In [104]:
df_copy.iloc[12969]

twitts              I'm so excited to go home..  13 hours going to...
sentiment                                                           1
word counts                                                        27
char counts                                                        99
average_word_len                                             3.666667
numeric_counts                                                      5
Name: 12969, dtype: object

In [105]:
def remove_digit(x):
    x = x.split()
    for t in range(0,len(x)):
        if(x[t].isdigit()):
            x[t] = ''
    return ' '.join(x)
    

In [106]:
df['twitts'] = df['twitts'].apply(lambda x : remove_digit(x))

# Lower case conversion

In [107]:
df['twitts'] = df['twitts'].apply(lambda x : str(x).lower())

In [108]:
df.iloc[12969]

twitts       i'm excited go home..  hours going hk,  hours ...
sentiment                                                    1
Name: 12969, dtype: object

# Contraction to Expansion

In [109]:
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "would not",
'dis': 'this',
'bak': 'back',
'brng': 'bring',
"won't":"would not",
'dis':'this',
"bak":"back",
"brng":'bring',
"i'v" : 'i have'
}

In [110]:
def expand(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key,value)
        return x
    else:
        return x

In [111]:
df['twitts'] = df['twitts'].apply(lambda x : expand(x))

# Email removals and count

In [112]:
import re

In [113]:
# re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',x)

In [114]:
df_copy['emails'] = df_copy['twitts'].apply(lambda x : re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',x))

In [115]:
df_copy['email_count'] = df_copy['emails'].apply(lambda x : len(x))

In [116]:
df_copy[df_copy['email_count']==1]

Unnamed: 0,twitts,sentiment,word counts,char counts,average_word_len,numeric_counts,emails,email_count
9757,just seeing all my@ambercamp...about to answer...,0,13,77,5.923077,0,[my@ambercamp...about],1
14619,@maritorres Cool! Just send an email to contac...,1,18,112,6.222222,0,[contactus@bondno9.com],1
19972,@hot2definc phone was stolen bro.... campbell...,0,6,60,10.0,0,[campbell.relations@gmail.com],1
24514,@joystiq May I have a code please I'v been wan...,0,20,92,4.6,0,[solarboy300@yahoo.com],1
24686,Looking to speak to a Met Life agent. Please c...,1,20,102,5.1,0,[lunasin@live.com],1
28377,Aw just got the cutest background this little ...,1,18,106,5.888889,0,[popstar.team@yahoo.com],1


In [117]:
df['twitts'] = df['twitts'].apply(lambda x : re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)'," ",x))

# Removal of special characters

In [118]:
df['twitts'] = df['twitts'].apply(lambda x : re.sub(r'[^\w]+'," ",x))

# Remove URLs

In [119]:
df['twitts'] = df['twitts'].apply(lambda x : re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'," ",x))

In [120]:
df.head()

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like great night,1
1,damn person stolde wallet may karma come back ...,1
2,greetings piano bench photo http twitpic com 6...,1
3,drewryanscott love it love you haha forget hug...,1
4,kissthestars pretty pretty pretty please pakid...,0


In [121]:
df['twitts'] = df['twitts'].apply(lambda x : ' '.join(x.split()))

# Removal of HTML tags

In [123]:
from bs4 import BeautifulSoup

In [127]:
df['twitts'] = df['twitts'].apply(lambda x : BeautifulSoup(x,'lxml').get_text().strip())

# Removal of Accented Characters

In [128]:
import unicodedata

In [132]:
def remove_accented(x):
    x = unicodedata.normalize('NFKD',x).encode('ascii','ignore').decode('utf-8','ignore')
    return x

In [133]:
df.tail()

Unnamed: 0,twitts,sentiment
29995,calumfan1 way related photoshop,0
29996,swiz_nz really wow thats crap,0
29997,at lexus hs250h press event again cannot tell ...,0
29998,karmicunderpath ooooh there is nice thought,1
29999,mariap91 i would usually ask sun school now si...,1


In [136]:
df['twitts'] = df['twitts'].apply(lambda x : remove_accented(x))

# Conversion into base form of words

In [140]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

In [141]:
def convert_base(x):
    words = word_tokenize(x)
    l = []
    for w in words:
        base = stemmer.stem(w)
        l.append(base)
    return ' '.join(l)
    

In [144]:
df['twitts'] = df['twitts'].apply(lambda x : convert_base(x))

# Common Occurring words Removal

In [160]:
text = ' '.join(df['twitts'])
len(text)

1338776

In [161]:
text = text.split()

In [162]:
freq_comm = pd.Series(text).value_counts()

In [163]:
fre = freq_comm[:20]

In [164]:
df['twitts'] = df['twitts'].apply(lambda x : ' '.join([w for w in x.split() if w not in fre]))

In [165]:
text1 = ' '.join(df['twitts'])
len(text1)

1245327