# Background
The dataset 

### Importing Packages

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading Dataset

In [2]:
train = pd.read_csv("Data/train.csv")
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## Data Wrangling

### Dropping ID column

In [3]:
train = train[['text','selected_text', 'sentiment']]

In [4]:
train.head()

Unnamed: 0,text,selected_text,sentiment
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,my boss is bullying me...,bullying me,negative
3,what interview! leave me alone,leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


### Tokenizing the Data & Remove Capitalization

In [16]:
train['text'] = train.text.astype(str).str.lower()
tokenizer = RegexpTokenizer('\w+')
train['tokens'] = train['text'].apply(tokenizer.tokenize)
train.head()

Unnamed: 0,text,selected_text,sentiment,tokens
0,"i`d have responded, if i were going","I`d have responded, if I were going",neutral,"[i, d, have, responded, if, i, were, going]"
1,sooo sad i will miss you here in san diego!!!,Sooo SAD,negative,"[sooo, sad, i, will, miss, you, here, in, san,..."
2,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me]"
3,what interview! leave me alone,leave me alone,negative,"[what, interview, leave, me, alone]"
4,"sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[sons, of, why, couldn, t, they, put, them, on..."


### Removing Stopwords from Tokens List

In [17]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
df['D'] = df.C.isin(firsts).astype(int)

In [26]:
train['sample']=train.tokens.isin(stopwords).any()

In [27]:
train.head()

Unnamed: 0,text,selected_text,sentiment,tokens,sample
0,"i`d have responded, if i were going","I`d have responded, if I were going",neutral,"[i, d, have, responded, if, i, were, going]",False
1,sooo sad i will miss you here in san diego!!!,Sooo SAD,negative,"[sooo, sad, i, will, miss, you, here, in, san,...",False
2,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me]",False
3,what interview! leave me alone,leave me alone,negative,"[what, interview, leave, me, alone]",False
4,"sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[sons, of, why, couldn, t, they, put, them, on...",False


## Sentiment DataFrames

Here, we will break the larger dataframe into smaller dataframes of neutral, negative, and positive sentiments. We will also produce word clouds to see which words influence sentiment classification.

### Neutral Sentiment

In [34]:
neutral_tokens = []
neutral = train[train['sentiment']=='neutral']

In [19]:
neutral.head()

Unnamed: 0,text,selected_text,sentiment,tokens
0,"i`d have responded, if i were going","I`d have responded, if I were going",neutral,"[i, d, have, responded, if, i, were, going]"
5,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,"[http, www, dothebouncy, com, smf, some, shame..."
7,soooo high,Soooo high,neutral,"[soooo, high]"
8,both of you,Both of you,neutral,"[both, of, you]"
10,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral,"[as, much, as, i, love, to, be, hopeful, i, re..."


In [35]:
for item in neutral['tokens']:
    for token in item:
        neutral_tokens.append(token)

### Positive Sentiment

In [39]:
pos_tokens = []
pos = train[train['sentiment']=='positive']

In [21]:
pos.head()

Unnamed: 0,text,selected_text,sentiment,tokens
6,2am feedings for the baby are fun when he is a...,fun,positive,"[2am, feedings, for, the, baby, are, fun, when..."
9,journey!? wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,"[journey, wow, u, just, became, cooler, hehe, ..."
11,i really really like the song love story by ta...,like,positive,"[i, really, really, like, the, song, love, sto..."
21,playing ghost online is really interesting. th...,interesting.,positive,"[playing, ghost, online, is, really, interesti..."
25,"the free fillin` app on my ipod is fun, im add...","the free fillin` app on my ipod is fun, im add...",positive,"[the, free, fillin, app, on, my, ipod, is, fun..."


In [40]:
for item in pos['tokens']:
    for token in item:
        pos_tokens.append(token)

### Negative Sentiment

In [42]:
neg_tokens = []
neg = train[train['sentiment']=='negative']

In [23]:
neg.head()

Unnamed: 0,text,selected_text,sentiment,tokens
1,sooo sad i will miss you here in san diego!!!,Sooo SAD,negative,"[sooo, sad, i, will, miss, you, here, in, san,..."
2,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me]"
3,what interview! leave me alone,leave me alone,negative,"[what, interview, leave, me, alone]"
4,"sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[sons, of, why, couldn, t, they, put, them, on..."
12,my sharpie is running dangerously low on ink,DANGERously,negative,"[my, sharpie, is, running, dangerously, low, o..."


In [43]:
for item in neg['tokens']:
    for token in item:
        neg_tokens.append(token)