## 1. Intro

In [141]:
import pandas as pd
import numpy as np
import spacy

In [142]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [143]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitter4000.csv', encoding='latin1')
df

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0
...,...,...
3995,i just graduated,1
3996,Templating works; it all has to be done,1
3997,mommy just brought me starbucks,1
3998,@omarepps watching you on a House re-run...lov...,1


In [144]:
df['sentiment'].value_counts()

sentiment
0    2000
1    2000
Name: count, dtype: int64

## 2. Preprocessing

### Word Counts

In [145]:
df['word_counts'] = df['twitts'].apply(lambda x : len(str(x).split()))
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts
821,@ChamBasa whyy? what's wrong? &gt;&lt;,0,5
2671,"Beauty only grabs attention, but personality c...",1,14
2739,@2DaWesternSky Yay!! Thank you,1,4
1269,@tommcfly hahaha aww poor gi! take me take me!...,0,13
2998,@mrskeith7 http://twitpic.com/7iqwi - ugh so l...,1,17


In [146]:
print(f"max: {df['word_counts'].max()}")
print(f"min: {df['word_counts'].min()}")

max: 32
min: 1


In [147]:
df[df['word_counts'] == 1]

Unnamed: 0,twitts,sentiment,word_counts
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


### Characters Counts

In [148]:
df['chars_counts'] = df['twitts'].apply(lambda x: len(''.join(str(x).split())))
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,chars_counts
2731,"Well, nice of you to finally show up, Sexy McB...",1,10,43
501,@drewseeley it's not easy to see your show whe...,0,13,56
3312,"@isiswisdom OOPS, I'm already following. I did...",1,10,56
3411,@allyrockstar Asian girls in general win,1,6,35
2146,woohoo! Off to Cebu tomorrow for some lazy day...,1,19,92


### Average Word Length

In [149]:
df['avg_word_length'] = df['chars_counts'] / df['word_counts'] 
df.head()

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3
1,back in miami. waiting to unboard ship,0,7,32,4.571429
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538


### Stop Words Count

In [150]:
print(stopwords)
print(len(stopwords))

{'’ve', 'on', 'sixty', 'never', 'in', 'none', 'ever', 'sometimes', 'should', 'into', 'others', 'whence', 'fifteen', 'myself', 'has', 'toward', 'formerly', 'nowhere', 'also', 'ten', 'with', 'yet', 'before', 'yourself', '‘ve', 'enough', 'rather', 'what', 'anyway', 'ca', 'six', 'of', 'n’t', 'often', 'using', 'further', 'made', 'ours', 'least', 'seemed', "'d", 'at', '’m', 'whose', 'were', 'therein', 'any', 'back', 'whereafter', 'whether', 'off', 'the', 'either', 'everything', 'something', 'quite', 'done', 'though', 'see', 'beforehand', 'have', 'becoming', 'such', 'throughout', 'former', 'since', 'us', 'due', 'herein', 'themselves', 'because', 'other', 'i', 'out', '’s', 'for', 'unless', 'whenever', 'via', "'ll", "n't", 'next', 'him', 'seeming', 'get', 'twenty', 'how', 'nobody', 'himself', 'eight', 'perhaps', 'give', 'bottom', 'sometime', 'herself', 'thereupon', 'up', 'last', 'beyond', 'we', 'across', 'your', 'than', 'whatever', 'mostly', 'say', 'side', 'whereas', 'anyhow', 'same', 'take', '

In [151]:
df['stop_words_len'] = df['twitts'].apply(lambda x: (len([word for word in x.split() if word in stopwords])))
df.head()

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len
0,is bored and wants to watch a movie any sugge...,0,10,43,4.3,5
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.5,0
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.5,3
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116,4.461538,13


### Count #HashTags and @Mentions

In [152]:
df['hashtags_counts'] = df['twitts'].apply(lambda x: len([word for word in x.split() if word.startswith("#")]))

In [153]:
df['mentions_counts'] = df['twitts'].apply(lambda x: len([word for word in x.split() if word.startswith("@")]))

In [154]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts
3430,@riandawson Thx 4 the awesum show. u guys neve...,1,27,110,4.074074,6,0,1
3255,Working at the &quot;The Bunker PB.Com&quot; t...,1,23,121,5.26087,8,0,0
54,@j_kmrprez yeah I want be able to until next y...,0,10,47,4.7,4,0,1
1064,why is it so freaking hot in my room?,0,9,29,3.222222,6,0,0
488,"i miss are life plan, everything used to be pe...",0,15,61,4.066667,10,0,0


### If numeric digits are present in twitts

In [155]:
df['numerics_count'] = df['twitts'].apply(lambda x: len([word for word in x.split() if word.isdigit()]))
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count
2175,@Mareeclo thank you!!!! thank you!!!!!! goo...,1,12,65,5.416667,3,0,1,0
2994,@lobster_tony My housemates were texting like ...,1,22,115,5.227273,8,0,1,0
2794,@jmiddlin cupcake camp? sounds like fun!!,1,6,36,6.0,0,0,1,0
2878,Happy Mother's Day @jlara5,1,4,23,5.75,0,0,1,0
332,how do you stop someone who's obsessing on you...,0,26,112,4.307692,13,0,0,0


### UPPER case words count

In [156]:
df['upper_count'] = df['twitts'].apply(lambda x: len([word for word in x.split() if word.isupper()]))
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count
2553,@grant78uk ya I know what you mean on that for...,1,27,107,3.962963,11,0,1,0,2
3168,"@rannau thanks bb We miss you too, its hard t...",1,28,102,3.642857,15,0,1,0,1
2864,@ColorblindFish I can't wait!!!! I want full s...,1,8,50,6.25,1,0,1,0,2
1085,Watching skins &amp; eating toast...lonely,0,5,38,7.6,0,0,0,0,0
3287,@NissieTR good night nissieeee.....have a nice...,1,7,50,7.142857,1,0,1,0,0


## 3. Preprocessing and Cleaning

## Lower Case Conversion

In [157]:
df['twitts'] = df['twitts'].apply(lambda x: str(x).lower())
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count
1366,i can't fall asleep,0,4,16,4.0,0,0,0,0,1
779,@blakeshelton i know how that is,0,6,27,4.5,4,0,1,0,0
3740,@morningsidemom good! that means i'm not the o...,1,17,75,4.411765,6,0,1,0,0
1827,"@mamazilla771 oh man, this is terrible news. ...",0,12,66,5.5,2,0,2,0,0
326,argh i wish my hair was longish again i really...,0,15,56,3.733333,5,0,0,0,2


### Contraction to Expansion

In [158]:
corrections_dict = {
    "a'ight": "alright",
    "ain't": "am not",
    "amn't": "am not",
    " n ": "and",
    "arencha": "are not you",
    "aren't": "are not",
    "bout": "about",
    "can't": "cannot",
    "cap'n": "captain",
    "cause": "because",
    "cuz": "because",
    "cept": "except",
    "c'mon": "come on",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "cuppa": "cup of",
    "daren't": "dare not",
    "daresn't": "dare not",
    "dasn't": "dare not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "dunno": "do not know",
    "d'ya": "do you",
    "e'en": "even",
    "e'er": "ever",
    "em": "them",
    "everybody's": "everybody is",
    "everyone's": "everyone is",
    "everything's": "everything is",
    "finna": "fixing to",
    "fo'c'sle": "forecastle",
    "gainst": "against",
    "g'day": "good day",
    "gimme": "give me",
    "giv'n": "given",
    "gi'z": "give us",
    "gonna": "going to",
    "gon't": "go not",
    "gotta": "got to",
    "hadn't": "had not",
    "had've": "had have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "yes'nt": "yes not",
    "he's": "he is",
    "here's": "here is",
    "how'd": "how would",
    "howdy": "how do you do",
    "how'll": "how will",
    "how're": "how are",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'd'nt": "I would not",
    "I'd'nt've": "I would not have",
    "If'n": "If and when",
    "I'll": "I will",
    "I'm": "I am",
    "Imma": "I am about to",
    "I'm'o": "I am going to",
    "innit": "isn't it",
    "Ion": "I do not",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "Idunno": "I do not know",
    "kinda": "kind of)",
    "let's": "let us",
    "loven't": "love not",
    "mayn't": "may not",
    "may've": "may have",
    "methinks": "I think",
    "mightn't": "might not",
    "might've": "might have",
    "mine's": "mine is",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "must've": "must have",
    "neath": "beneath",
    "needn't": "need not",
    "nal": "and all",
    "ne'er": "never",
    "o'clock": "of the clock",
    "o'er": "over",
    "ol'": "old",
    "round": "around",
    "'s": "is",
    "shan'": "shall not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "so've": "so have",
    "that'll": "that will",
    "that're": "that are",
    "that's": "that is",
    "there'll": "there will",
    "there're": "there are",
    "there's": "there is",
    "these're": "these are",
    "these've": "these have",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "this's": "this is",
    "those're": "those are",
    "those've": "those have",
    "thout": "without",
    "til": "until",
    "tis": "it is",
    "to've": "to have",
    "tryna": "trying to",
    "twas": "it was",
    "tween": "between",
    "twere": "it were",
    "w'all": "we all",
    "w'at": "we at",
    "wanna": "want to",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "whatcha": "what are you",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when'd": "when did",
    "where'd": "where did",
    "where'll": "where will",
    "where're": "where are",
    "where's": "where is",
    "where've": "where have",
    "which've": "which have",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'at": "you at",
    "yever": "have you ever",
    "y'know": "you know",
    "yessir": "yes sir",
    "you'd": "you had",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "i'm": "i am",
    "dis": "this",
    "bak": "back",
    "brng": "bring"
}

In [159]:
def con_to_exp(x):
    if type(x) is str:
        for key in corrections_dict:
            value = corrections_dict[key]
            x = x.replace(key, value)
        return x
    else:
        return x

In [160]:
%%timeit
df['twitts'] = df['twitts'].apply(lambda x: con_to_exp(x))

685 ms ± 49.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [161]:
df.sample(5)


Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count
1969,its all @brownskinbunnies fault that i am stuc...,0,17,81,4.764706,6,0,1,0,0
2621,@jordanknight good knight hun! hope jon gave y...,1,13,64,4.923077,2,0,1,0,0
2945,@nadineee persoand allly xmen won for me swin...,1,16,75,4.6875,4,0,1,0,1
78,has used more moisturer today than all the oth...,0,17,93,5.470588,9,0,0,0,0
3090,@nickymcb mebe....hey...i do not blame him...w...,1,9,69,7.666667,1,0,1,0,0


### Count and Remove Emails

In [162]:
import re

In [163]:
df[df['twitts'].str.contains('hotmail.com')]

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count
3713,@securerecs arghh me please markbradbury_16@h...,1,5,51,10.2,0,0,1,0,0


In [164]:
df.iloc[3713]['twitts']

'@securerecs arghh me please  markbradbury_16@hotmail.com'

In [170]:
df['emails'] = df['twitts'].apply(lambda x: re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', x))

df['emails_count'] = df['emails'].apply(lambda x: len(x))

In [171]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count,emails,emails_count
2668,@chocoboy1der - earl! i need you 2 vote 4 me. ...,1,11,51,4.636364,1,0,1,2,1,[],0
3909,@briannakay i'll be there soon!,1,5,27,5.4,2,0,1,0,0,[],0
1977,@officialmelb i miss your hot butt miss.mel. ...,0,11,54,4.909091,4,0,1,0,0,[],0
357,@janexdoe do not worry it will thisappear easi...,0,11,63,5.727273,2,0,1,0,0,[],0
3070,"@psychic09 oh ok, then do not worry aaaaaaaaab...",1,13,58,4.461538,6,0,1,0,0,[],0


In [173]:
df[df['emails_count'] > 0]

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count,emails,emails_count
3713,@securerecs arghh me please markbradbury_16@h...,1,5,51,10.2,0,0,1,0,0,[markbradbury_16@hotmail.com],1


In [175]:
df['twitts'] = df['twitts'].apply(lambda x:re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', "", x))
df[df['emails_count'] > 0]

Unnamed: 0,twitts,sentiment,word_counts,chars_counts,avg_word_length,stop_words_len,hashtags_counts,mentions_counts,numerics_count,upper_count,emails,emails_count
3713,@securerecs arghh me please,1,5,51,10.2,0,0,1,0,0,[markbradbury_16@hotmail.com],1
