In [1]:
import sys
import os
import time
import codecs
import pandas as pd
from pprint import pprint

sys.path.append("/Users/Bya/git/predictEPL/utils/")
sys.path.append("/Users/Bya/git/predictEPL/config/")

import emolex
import paths
import tokenizer
import useful_methods as my_methods
import train_datas

In [2]:
%pylab inline
%matplotlib inline

import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve

Populating the interactive namespace from numpy and matplotlib


### Choose the data set (each "Step 1" cell below builds its own `dfTwitter`)

In [3]:
##########################################################
# Step 1. Prepare Data => Soccer Hash Emolex
#
# Builds `dfTwitter` with the columns 'tweet', 'sentiment' (0 = negative,
# 1 = positive) and 'length' (number of characters in the tweet).

# Load the soccer Emolex dictionary via the project helper; the two returned
# values are not used again inside this cell.
dic_emolex_soccer, y = emolex.EmolexSoccerDic()

# Read Hash Emolex CSV
dfHashEmolex = train_datas.HashEmolexAllRead()

# Labeling Emolex 8 cat => POS, NEG
NEGATIVE_EMOTIONS = {'anger', 'disgust', 'fear', 'sadness', 'negative'}
POSITIVE_EMOTIONS = {'anticipation', 'joy', 'positive', 'trust'}

texts = []
sentiments = []

# Walk the two columns in parallel instead of calling `.iloc[i]` twice per
# row, which builds a full row Series on every access.
for text, sentiment_csv in zip(dfHashEmolex['text'], dfHashEmolex['sentiments']):
    # 'sentiments' holds a comma-separated list of emotion names. The first
    # name that belongs to either group decides the label; rows without any
    # listed emotion are left out of the result.
    for sent in sentiment_csv.split(','):
        if sent in NEGATIVE_EMOTIONS:
            texts.append(text)
            sentiments.append(0)  # 0 is negative
            break

        elif sent in POSITIVE_EMOTIONS:
            texts.append(text)
            sentiments.append(1)  # 1 is positive
            break

# Create New POS, NEG dataframe (explicit `columns` keeps the column order
# fixed regardless of the pandas version).
dfTwitter = pd.DataFrame(
    {'tweet': texts, 'sentiment': sentiments},
    columns=['tweet', 'sentiment'],
)
dfTwitter['length'] = [len(text) for text in dfTwitter['tweet']]

All Words: 14152


In [22]:
##########################################################
# Step 1. Prepare Data => SemEval Tweet PN
#
# Builds `dfTwitter` with a numeric 'sentiment' column and a 'length' column.

# Load the data frame through the project helper.
dfTwitter = train_datas.TweetPnEqualRead()

# Textual label -> integer label.
label_dic = dict(positive=1, negative=0)

# Replace every textual label by its number; a label that is missing from
# `label_dic` raises KeyError.
dfTwitter.sentiment = [label_dic[name] for name in dfTwitter.sentiment.tolist()]

# 'length' column: number of characters of each tweet.
dfTwitter['length'] = list(map(len, dfTwitter['tweet']))

In [28]:
##########################################################
# Step 1. Prepare Data => Movie Short reviews
#
# Builds `dfTwitter` from the short movie review CSV.

# Change the working directory to the review folder so that the CSV can be
# opened by its bare file name. The new working directory stays in effect
# for the cells that run afterwards.
os.chdir(paths.READ_PATH_REVIEW_SHORT)
dfTwitter = my_methods.csv_dic_df("short_movie_reviews.csv")

# Copy the review text into a 'tweet' column (the column name the
# vectorizing and training cells read), then add its character count.
dfTwitter['tweet'] = dfTwitter['review']
dfTwitter['length'] = list(map(len, dfTwitter['tweet']))

### Tuning Starts

In [29]:
##########################################################
# Step 2: Data to Vectors

# Bag Of Word(bof), Porter Stemmer
print("Bag Of Word(bof), Porter Stemmer:")
%time bow_transformer = CountVectorizer(analyzer=tokenizer.Stem).fit(dfTwitter['tweet'])



# The bag-of-words counts for the entire Tweets corpus are a large,
# sparse matrix:
tweets_bow = bow_transformer.transform(dfTwitter['tweet'])
print('\nsparse matrix shape:', tweets_bow.shape)
print('number of non-zeros:', tweets_bow.nnz)
print('sparsity: %.2f%%' % (100.0 * tweets_bow.nnz / (tweets_bow.shape[0] * tweets_bow.shape[1])))



# And finally, after the counting,
# the term weighting and normalization can be done with TF-IDF,
# using scikit-learn's TfidfTransformer:
tfidf_transformer = TfidfTransformer().fit(tweets_bow)


# To transform the entire bag-of-words corpus into TF-IDF corpus at once:
tweets_tfidf = tfidf_transformer.transform(tweets_bow)
print("\nTF-IDF(bow)\n", tweets_tfidf.shape)

Bag Of Word(bof), Porter Stemmer:
CPU times: user 9 s, sys: 134 ms, total: 9.14 s
Wall time: 9.46 s

sparse matrix shape: (10662, 14686)
number of non-zeros: 118455
sparsity: 0.08%

TF-IDF(bow)
 (10662, 14686)


In [30]:
##########################################################
# Step 3: Run Experiments & Tune Params
#
# Splits the data into train/test parts, wraps vectorizing, TF-IDF weighting
# and the Naive Bayes classifier into one Pipeline, and prepares a grid
# search over the available tokenizer variants. The search itself is only
# started when `grid.fit(...)` is called.

# Fixed seed so that the train/test split (and every score computed from it)
# is the same on each run of the notebook.
RANDOM_SEED = 42

# Split data Train and Test data (80% / 20%)
tweets_train, tweets_test, sentiment_train, sentiment_test = \
    train_test_split(dfTwitter['tweet'], dfTwitter['sentiment'],
                     test_size=0.2, random_state=RANDOM_SEED)

print("Train data: ", len(tweets_train), "\nTest data: ", len(tweets_test),
      "\nAll data: ", len(sentiment_train) + len(sentiment_test))



# Let's recap the entire pipeline up to this point,
# putting the steps explicitly into scikit-learn's Pipeline:
pipeline = Pipeline([
    # strings to token integer counts; the analyzer given here is replaced
    # by each candidate from `params` during the grid search
    ('bow', CountVectorizer(analyzer=tokenizer.Stem)),
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])


##############################################################
# Tune Parameters
params = {
    # Another option that could be searched (not active):
    #     'tfidf__use_idf': (True, False),
    'bow__analyzer': (tokenizer.StemNoStops, tokenizer.LemmaNoStops,
                      tokenizer.StemNoEnglishStops, tokenizer.LemmaNoEnglishStops,
                      tokenizer.StemNoSoccerStops, tokenizer.LemmaNoSoccerStops,
                      tokenizer.Stem, tokenizer.Lemma),
}

grid = GridSearchCV(
    pipeline,  # pipeline from above
    params,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(sentiment_train, n_folds=5),  # 5 stratified folds built from the training labels
)

Train data:  8529 
Test data:  2133 
All data:  10662


In [31]:
########################################################
# Run the grid search on the training split; `%time` reports how long the
# fit takes. The object returned by `grid.fit` is kept as `nb_detector`.
%time nb_detector = grid.fit(tweets_train, sentiment_train)
# Show, for every analyzer tried, the mean and standard deviation of the
# cross-validation score.
pprint(nb_detector.grid_scores_)

CPU times: user 7.88 s, sys: 237 ms, total: 8.11 s
Wall time: 2min 23s
[mean: 0.77113, std: 0.00800, params: {'bow__analyzer': <function StemNoStops at 0x115e54c80>},
 mean: 0.76809, std: 0.00577, params: {'bow__analyzer': <function LemmaNoStops at 0x115edd378>},
 mean: 0.77113, std: 0.00783, params: {'bow__analyzer': <function StemNoEnglishStops at 0x115e38f28>},
 mean: 0.76844, std: 0.00622, params: {'bow__analyzer': <function LemmaNoEnglishStops at 0x115edd400>},
 mean: 0.76129, std: 0.00669, params: {'bow__analyzer': <function StemNoSoccerStops at 0x115e38e18>},
 mean: 0.76257, std: 0.00573, params: {'bow__analyzer': <function LemmaNoSoccerStops at 0x115edd488>},
 mean: 0.76175, std: 0.00716, params: {'bow__analyzer': <function Stem at 0x115edd268>},
 mean: 0.76140, std: 0.00545, params: {'bow__analyzer': <function Lemma at 0x115edd2f0>}]


In [32]:
# Display the parameter combination selected as best by the grid search.
nb_detector.best_params_

{'bow__analyzer': <function tokenizer.StemNoStops>}

In [None]:
##########################################################
# Step 4: Save Detector
#
# Writes `nb_detector` (the object returned by `grid.fit`) to a pickle file.

# NOTE: this is an absolute, machine-specific path, and os.chdir changes the
# working directory for every cell that runs afterwards.
os.chdir("/Users/Bya/Dropbox/Research/datas/Detecter/")
# The `with` block closes the file even if pickling fails.
# Only load this file back with pickle when it comes from a trusted source:
# unpickling can execute arbitrary code.
with open('nb_tweets_pn_detector.pkl', 'wb') as fout:
    pickle.dump(nb_detector, fout)