### Read the dataset (tweets.csv)

In [1]:
import pandas as pd
data = pd.read_csv('tweets.csv', engine='python', warn_bad_lines=False, error_bad_lines=False)

In [2]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
data.dtypes

tweet_text                                            object
emotion_in_tweet_is_directed_at                       object
is_there_an_emotion_directed_at_a_brand_or_product    object
dtype: object

##### Preprocess the text and add the preprocessed text in a column with name `text` in the dataframe.

In [4]:
def preprocess(text):
    try:
        return text.decode('ascii')
    except Exception as e:
        print(e)
        return ""

In [5]:
data.tweet_text.shape

(9092,)

In [6]:
temp = '@sxtxstate great stuff on Fri #SXSW: Marissa M'

In [7]:
preprocess(temp)

u'@sxtxstate great stuff on Fri #SXSW: Marissa M'

In [9]:
data["text"] = pd.DataFrame([preprocess(text) for text in data.tweet_text], index=data.index)

'float' object has no attribute 'decode'
'ascii' codec can't decode byte 0x89 in position 54: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 98: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 110: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 10: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 10: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 10: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 10: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 10: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 0: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 0: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 0: ordinal not in range(128)
'ascii' codec can't decode byte 0x89 in position 0: ordinal not in range(128)
'ascii' codec 

In [10]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@jessedee Know about @fludapp ? Awesome iPad/i...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@swonderlin Can not wait for #iPad 2 also. The...
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@sxsw I hope this year's festival isn't as cra...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@sxtxstate great stuff on Fri #SXSW: Marissa M...


In [13]:
data.isnull().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5801
is_there_an_emotion_directed_at_a_brand_or_product       0
text                                                     0
dtype: int64

### Consider only rows having Positive emotion and Negative emotion and remove other rows from the dataframe.

In [18]:
gb = data.groupby(by='is_there_an_emotion_directed_at_a_brand_or_product')

In [19]:
gb.count()

Unnamed: 0_level_0,tweet_text,emotion_in_tweet_is_directed_at,text
is_there_an_emotion_directed_at_a_brand_or_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I can't tell,156,9,156
Negative emotion,570,519,570
No emotion toward brand or product,5387,91,5388
Positive emotion,2978,2672,2978


In [34]:
#dropping rows with emotion as I can't tell or No emotion toward brand or product
mod_data = data[data['is_there_an_emotion_directed_at_a_brand_or_product'] != "I can't tell"]
mod_data = mod_data[mod_data['is_there_an_emotion_directed_at_a_brand_or_product'] != "No emotion toward brand or product"]


In [44]:
mod_data.groupby(by='is_there_an_emotion_directed_at_a_brand_or_product').count()

Unnamed: 0_level_0,tweet_text,emotion_in_tweet_is_directed_at,text
is_there_an_emotion_directed_at_a_brand_or_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative emotion,570,519,570
Positive emotion,2978,2672,2978


### Represent text as numerical data using `CountVectorizer` and get the document term frequency matrix

#### Use `vect` as the variable name for initialising CountVectorizer.

In [45]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vect = CountVectorizer(ngram_range=(1, 2))

### Find number of different words in vocabulary

In [53]:
data_dtm = vect.fit_transform(mod_data['text'])

In [54]:
data_dtm.shape

(3548, 31363)

In [56]:
print vect.get_feature_names()[-50:]

[u'zaarly referrals', u'zaarlyiscoming', u'zaarlyiscoming winning', u'zagg', u'zagg keyboard', u'zaggle', u'zaggle showed', u'zap', u'zap lt', u'zappos', u'zappos amp', u'zappos and', u'zappos likability', u'zappos likeability', u'zazzlesxsw', u'zazzlesxsw link', u'zazzlesxsw sxsw', u'zazzlesxsw sxswi', u'ze', u'ze frank', u'zelda', u'zelda on', u'zeldman', u'zeldman autocorrects', u'zero', u'zero juice', u'zimride', u'zimride etc', u'zing', u'zing check', u'zip', u'zip it', u'zite', u'zite the', u'zms', u'zms mention', u'zombies', u'zombies sxsw', u'zombies what', u'zomg', u'zomg its', u'zomg mention', u'zomg rt', u'zone', u'zone right', u'zoom', u'zoom in', u'zoom to', u'zzzs', u'zzzs iphone']


#### Tip: To see all available functions for an Object use dir

In [57]:
dir(vect)

['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getstate__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_limit_features',
 '_sort_features',
 '_validate_vocabulary',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',
 'fit_transform',
 'fixed_vocabulary_',
 'get_feature_names',
 'get_params',
 'get_stop_words',
 'input',
 'inverse_transform',
 'lowercase',
 'max_df',
 'max_features',
 'min_df',
 'ngram_range',
 'preprocessor',
 'set_params',
 'stop_words',
 'stop_words_',
 'strip_accents',
 'token_pattern',
 'tokenizer',
 'transform',
 'vocabulary',
 'vocabulary_']

### Find out how many Positive and Negative emotions are there.

Hint: Use value_counts on that column

In [49]:
mod_data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

Positive emotion    2978
Negative emotion     570
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

### Change the labels for Positive and Negative emotions as 1 and 0 respectively.

Hint: use map on that column and give labels

In [61]:
y = {'Positive emotion' : 1 , 'Negative emotion':0}

In [64]:
mod_data['label'] = mod_data['is_there_an_emotion_directed_at_a_brand_or_product'].map(y)

In [65]:
mod_data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,text,label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,@jessedee Know about @fludapp ? Awesome iPad/i...,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,@swonderlin Can not wait for #iPad 2 also. The...,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,@sxsw I hope this year's festival isn't as cra...,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,@sxtxstate great stuff on Fri #SXSW: Marissa M...,1


### Define feature set as `text` column and above labels as target  and divide into train and test datasets

In [67]:
from sklearn.cross_validation import train_test_split

X = mod_data['text']
y = mod_data['label']

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## **Predicting the sentiment:**


### Use Naive Bayes and Logistic Regression and their accuracy scores for predicting the sentiment of the given text

In [69]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [70]:
vect = CountVectorizer(ngram_range=(1, 2))

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_class)

0.8666666666666667


In [71]:
logreg = LogisticRegression()
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)

print metrics.accuracy_score(y_test, y_pred_class)

0.8798122065727699
