# Read Brand Sentiment Twitter Data

In [116]:
from __future__ import unicode_literals
from __future__ import division

import pandas as pd
import numpy as np
path = '../data/'

In [117]:
## tweet_text, emotion_in_tweet_is_directed_at, is_there_an_emotion_directed_at_a_brand_or_product 
train_file = 'judge-1377884607_tweet_product_company.csv'

In [118]:
def decode_unicode(data):
    unicode_error_counter = 0
    tweets = []
    labels = []
    for tweet, label in data:
        try:
            tweets.append(tweet.decode('utf8'))
            labels.append(label)
        except:
            unicode_error_counter += 1
    return tweets, labels, unicode_error_counter

In [119]:
def read_data(fn):
    df = pd.read_csv(fn)
    tweets = df['tweet_text']
    labels = df['is_there_an_emotion_directed_at_a_brand_or_product']
    
    tweets = tweets[pd.notnull(tweets)]
    labels = labels[pd.notnull(labels)]

    data = zip(tweets.get_values(), labels.get_values())
    tweets, labels, errors = decode_unicode(data)
            
    return tweets, labels, errors

In [120]:
text, target, errors = read_data(path+train_file)
print 'rows', len(text), len(target), 'unicode errors', errors
print text[0:1], '\n'
print target[0:1], '\n'

rows 8613 8613 unicode errors 479
[u'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'] 

['Negative emotion'] 



### Feature Extraction

In [121]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

count_vect.fit(text)

print count_vect.vocabulary_.get(u'3g')
print count_vect.transform(["I love my iphone!!!"])
#print counts[0][0]

164
  (0, 4467)	1
  (0, 5052)	1
  (0, 5563)	1


### Train

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

counts = count_vect.transform(text)

nb.fit(counts[0:6000], target[0:6000])

### Predict

In [122]:
print nb.predict(count_vect.transform(["I love my iphone!!!"]))

['Positive emotion']


### Validation

In [130]:
predictions = nb.predict(counts)
print 'Accuracy on training data:',sum(predictions[6000:8613] == target[6000:8613]) / len(target[6000:8613])

from sklearn import cross_validation

scores = cross_validation.cross_val_score(nb, counts, target, cv=10)
print scores
print scores.mean()

Accuracy on training data: 0.551090700344
[ 0.52375435  0.51448436  0.51216686  0.51911935  0.53077816  0.54302326
  0.56046512  0.54651163  0.52209302  0.51744186]
0.528983796174


### Baseline

In [131]:
from sklearn.dummy import DummyClassifier
dc = DummyClassifier(strategy='most_frequent')

from sklearn import cross_validation

scores = cross_validation.cross_val_score(dc, counts, target, cv=10)
print scores
print scores.mean()

[ 0.59212051  0.59212051  0.59212051  0.59212051  0.59233449  0.59302326
  0.59302326  0.59302326  0.59302326  0.59302326]
0.592593281324


### Feature Selection

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

p = Pipeline(steps=[('counts', CountVectorizer(ngram_range=(1, 2))),
                ('feature_selection', SelectKBest(chi2, k=10000)),
                ('multinomialnb', MultinomialNB())])

p.fit(text, target)

Pipeline(steps=[(u'counts', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...t 0x7f9f3605ede8>)), (u'multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [136]:
scores = cross_validation.cross_val_score(p, text, target, cv=10)
print scores
print scores.mean()

[ 0.5561993   0.53997683  0.53070684  0.50984936  0.54936121  0.51860465
  0.57093023  0.55232558  0.55348837  0.52325581]
0.540469818815


### Hyperparameter Optimization (GridSearch)

In [None]:

from sklearn.grid_search import GridSearchCV

parameters = {
    'counts__max_df': (0.5, 0.75, 1.0),
    'counts__min_df': (1, 2, 3),
    'counts__ngram_range': ((1,1), (1,2)),
    'feature_selection__k': (100, 200, 300, 'all')
    }

grid_search = GridSearchCV(p, parameters, n_jobs=1, verbose=1, cv=10)

grid_search.fit(text, target)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


### Testing

In [80]:
## tweet_text, emotion_in_tweet_is_directed_at, is_there_an_emotion_directed_at_a_brand_or_product 
test_file = 'crowdflower-data'

In [81]:
# test_text, test_target, _ = read_data(path+test_file)

# print test_target[0:1], '\n'
# print test_text[0:1], '\n'