### Processing & Cleaning

In [2]:
data = "https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitter4000.csv"

In [3]:
import pandas as pd
import numpy as np
import spacy as sp

In [4]:
df = pd.read_csv(data)

In [5]:
df.head()

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [14]:
df = df.rename(columns={"twitts":"tweets"})

In [15]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [16]:
df['sentiment'].value_counts()

1    2000
0    2000
Name: sentiment, dtype: int64

### Word Counts

In [17]:
len('this is text'.split())

3

In [18]:
df['word_counts'] = df['tweets'].apply(lambda x:len(str(x).split()))

In [21]:
df.head(10)

Unnamed: 0,tweets,sentiment,word_counts
0,is bored and wants to watch a movie any sugge...,0,10
1,back in miami. waiting to unboard ship,0,7
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12
3,ughhh i am so tired blahhhhhhhhh,0,6
4,@mandagoforth me bad! It's funny though. Zacha...,0,26
5,"brr, i'm so cold. at the moment doing my assig...",0,17
6,@kevinmarquis haha yep but i really need to sl...,0,22
7,eating some ice-cream while I try to see @pete...,0,14
8,@phatty84 just hella bored at work lol,0,7
9,Food poisoning blowssss,0,3


In [23]:
df.word_counts.max(),df.word_counts.min()

(32, 1)

In [24]:
df[df.word_counts==1]

Unnamed: 0,tweets,sentiment,word_counts
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


### Character Count

In [30]:
def char_counts(x):
    ####TO skip white spaces####
    s = x.split()
    x = ''.join(s)
    return len(x)

In [33]:
df['char_counts'] = df.tweets.apply(lambda x : char_counts(str(x)))

In [34]:
df.head(10)

Unnamed: 0,tweets,sentiment,word_counts,char_counts
0,is bored and wants to watch a movie any sugge...,0,10,43
1,back in miami. waiting to unboard ship,0,7,32
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54
3,ughhh i am so tired blahhhhhhhhh,0,6,27
4,@mandagoforth me bad! It's funny though. Zacha...,0,26,116
5,"brr, i'm so cold. at the moment doing my assig...",0,17,87
6,@kevinmarquis haha yep but i really need to sl...,0,22,94
7,eating some ice-cream while I try to see @pete...,0,14,88
8,@phatty84 just hella bored at work lol,0,7,32
9,Food poisoning blowssss,0,3,21


### Average word length

In [35]:
df['avg_word_length'] = df.char_counts/df.word_counts

In [36]:
df.sample(5)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length
1684,"Bradley still woke up crying with sore throat,...",0,20,92,4.6
1056,Okay I Was Going To Sleep; But Isabella's Cryi...,0,12,48,4.0
305,I have a torn something in my knee,0,8,27,3.375
3660,@gtrbush SENT YOU SOME PICS CHECK YOUR EMAIL,1,8,37,4.625
1341,@leylacarter Me gusta Twittear dejameeeeeeeee,0,5,41,8.2


### Stop Words Count

In [37]:
print(stopwords)

{'we', 'whose', 'say', 'therein', 'perhaps', 'therefore', 'and', 'whereas', 'empty', 'mine', 'has', 'would', 'unless', 'been', 'seems', 'until', 'latter', 'for', 'him', 'in', 'fifteen', 'thereupon', 'yourself', "'d", 'eleven', 'moreover', 'elsewhere', 'top', 'noone', 'have', 'her', 'being', 'at', 'below', 'besides', 'nothing', 'along', 'now', 'nine', 'could', 'other', 'enough', 'mostly', 'be', 'part', 'others', 'everyone', 'someone', 'always', 'various', 'whether', 'who', 'either', 'again', "'ve", 'herself', 'well', 'hers', 'except', 'even', 'else', 'same', 'what', 'own', 'upon', 'were', 'least', 'toward', 'last', 'somehow', "'ll", 'call', 'she', 'anyway', 'give', 'once', 'around', 'not', 'become', 'does', 'somewhere', 'whereupon', 'whither', '‘s', 'eight', 'neither', 'by', 'doing', '‘m', 'hundred', 'almost', 'to', 'none', 'sometimes', 'really', '’ve', "n't", 'if', 'one', 'bottom', 'might', 'whereafter', 'keep', 'via', 'seem', 'latterly', 'otherwise', 'up', 'amongst', 'it', 'themselves

In [38]:
len(stopwords)

326

In [39]:
df['stop_words_count'] = df['tweets'].apply(lambda x : len(list(t for t in x.split() if t in stopwords)))

In [40]:
df.sample(5)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length,stop_words
465,just got computer back but i still need to tak...,0,13,49,3.769231,10
1464,i dont like the heat anymore im sooo tired &a...,0,13,57,4.384615,3
696,Iï¿½m in Valencia (Spain) in my house...Iï¿½m ...,0,8,47,5.875,3
2416,@chrisdrackett My new portfoliodesign will mak...,1,12,73,6.083333,4
341,Primavera all morning. Bugs in the system = gr...,0,18,84,4.666667,7


### Count # Tags and @ Mentions

In [41]:
df['hashtags_count'] = df.tweets.apply(lambda x : len(list(t  for t in x.split() if t.startswith('#'))))

In [42]:
df['mention_count'] = df.tweets.apply(lambda x : len(list(t  for t in x.split() if t.startswith('@'))))

In [47]:
df.sample(7)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length,stop_words,hashtags_count,mention_count
442,@DexterAddict Oh pls share when you've made su...,0,15,69,4.6,4,0,1
3133,just had my daily Loose Women fix although it ...,1,16,75,4.6875,7,0,1
3038,"@run350 Girl, U outta see it. LOL I have 3 awk...",1,27,111,4.111111,8,0,1
2570,@LexiThaBoss Thanx babe. Love back 2 you,1,7,34,4.857143,2,0,1
2196,Had a good night out last night after work. Lo...,1,24,114,4.75,11,0,0
2665,"@stokely @johubris @andrewbarnett thanks, you ...",1,6,45,7.5,1,0,3
959,Zomg - totally came into work 30 mins early. I...,0,20,71,3.55,8,0,0


### Numeric digit check!

In [48]:
df['numeric_count'] = df.tweets.apply(lambda x : len(list(t for t in x.split() if t.isdigit())))

In [50]:
df.sample(6)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length,stop_words,hashtags_count,mention_count,numeric_count
3556,@djbriancua yeah thats a good start. back then...,1,28,109,3.892857,12,0,1,2
2352,"my brother sleeps long,might go swimming 2day ...",1,16,81,5.0625,4,0,0,1
3511,Hey @songzyuuup yessssssssssssssssssssssss (...,1,7,79,11.285714,0,0,1,0
1816,@roxanne_ong 12 more days? so fast... i'm gonn...,0,11,49,4.454545,2,0,1,1
1821,I did it! Went to sleep before 12....unfortuna...,0,22,90,4.090909,13,0,0,0
411,"@mikefoong @hantu Unfortunately, that Friday ...",0,17,97,5.705882,5,0,2,0


### Letter cases

In [51]:
df['upper_case_count'] = df.tweets.apply(lambda x: len(list(t for t in x.split() if t.isupper())))

In [52]:
df['tweets'] = df['tweets'].apply(lambda x:str(x).lower())

In [53]:
df.sample(5)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length,stop_words,hashtags_count,mention_count,numeric_count,upper_case_count
13,padres come back from being down 6-0 &amp; we ...,0,18,82,4.555556,8,0,0,0,0
3712,eating chocolates,1,2,16,8.0,0,0,0,0,0
2632,the cake looks amazing al pink,1,6,25,4.166667,0,0,0,0,0
1375,all the managers are gone today!!! yay! i thin...,0,15,57,3.8,8,0,0,0,0
713,why oh why did i volunteer to write my final a...,0,23,96,4.173913,9,0,0,1,0


### count and remove Emails

In [54]:
import re

In [55]:
x = '@ghas is # tabea @ajsam.com,as r adityasrichandan3098@gmail.com,mdaw-0fa'

In [70]:
re.findall(r'([a-z0-9._-]+@[a-z0-9._-]+\.[a-z0-9_-]+)',x)

['adityasrichandan3098@gmail.com']

In [71]:
df['emails'] = df.tweets.apply(lambda x : re.findall(r'([a-z0-9._-]+@[a-z0-9._-]+\.[a-z0-9_-]+\b)',x))

In [72]:
df['emails_count'] = df.emails.apply(lambda x:len(x))

In [73]:
df.sample(20)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length,stop_words,hashtags_count,mention_count,numeric_count,upper_case_count,emails,emails_count
1612,@shaycarl i'm gonna make you a banner out of ...,0,24,108,4.5,10,0,1,0,2,[],0
1318,gots golf in the morning. i love the sport but...,0,21,75,3.571429,13,0,0,0,1,[],0
309,"merlot rocks, but the wife doesn't like the sc...",0,14,64,4.571429,5,0,0,0,0,[],0
2736,@kkrseattle @jmedero either gonna be ac or p ...,1,15,73,4.866667,4,0,2,0,2,[],0
2116,@caseybrothers nope holding a nice hot cup of ...,1,12,57,4.75,4,0,1,0,0,[],0
344,@reminiscesmith im only on 1800 lol,0,6,30,5.0,2,0,1,1,0,[],0
2497,#ff keep an eye on the claw and the new u2 tou...,1,26,106,4.076923,12,1,2,0,1,[],0
3080,"hooray,.. it's saturday..!",1,3,24,8.0,0,0,0,0,0,[],0
2799,new phone tomorrow!!!,1,3,19,6.333333,0,0,0,0,0,[],0
207,found out i have a throat infection,0,7,29,4.142857,4,0,0,0,0,[],0


In [74]:
df[df['emails_count']>0]

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_length,stop_words,hashtags_count,mention_count,numeric_count,upper_case_count,emails,emails_count
3713,@securerecs arghh me please markbradbury_16@h...,1,5,51,10.2,0,0,1,0,0,[markbradbury_16@hotmail.com],1


In [78]:
df['emails'] = df.tweets.apply(lambda x : re.sub(r'([a-z0-9._-]+@[a-z0-9._-]+\.[a-z0-9_-]+\b)'," ",x))

In [79]:
df[df['emails_count']>0].tweets

3713    @securerecs arghh me please  markbradbury_16@h...
Name: tweets, dtype: object