In [1]:
%pylab inline
%matplotlib inline

import sys
import os
import time
import codecs
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pickle


sys.path.append("/Users/Bya/git/predictEPL/utils/")
sys.path.append("/Users/Bya/git/predictEPL/config/")

import emolex
import paths
import tokenizer
import useful_methods as my_methods
import train_datas

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score

### Choose : DATA

In [3]:
##########################################################
# Step 1. Prepare Data => Soccer Hash Emolex
dic_emolex_soccer, y = emolex.EmolexSoccerDic()

# Read Hash Emolex CSV
dfHashEmolex = train_datas.HashEmolexAllRead()

# Labeling Emolex 8 cat => POS, NEG
texts = []
sentiments = []

for i in range(len(dfHashEmolex)):
    text = dfHashEmolex.iloc[i]['text']
    sentiment = dfHashEmolex.iloc[i]['sentiments'].split(',')
    
    for sent in sentiment:
        if sent in ['anger', 'disgust', 'fear', 'sadness', 'negative']:
            texts.append(text)
            sentiments.append(0) # 0 is negative
            break

        elif sent in ['anticipation', 'joy', 'positive', 'trust']:
            texts.append(text)
            sentiments.append(1) # 1 is negative
            break

# Create New POS, NEG dataframe
dfTwitter = pd.DataFrame(columns=['tweet', 'sentiment'])
dfTwitter['tweet'] = texts
dfTwitter['sentiment'] = sentiments
dfTwitter['length'] = [len(text) for text in dfTwitter['tweet']]

print("POS: ", len(dfTwitter[dfTwitter.sentiment == 1]))
print("NEG: ", len(dfTwitter[dfTwitter.sentiment == 0]))
print("\nAll: ", len(dfTwitter))

All Words: 14136
POS:  3207
NEG:  4008

All:  7215


### Tunning Starts

In [13]:
##########################################################
# Step 2: Split Datas


# Split data Train and Test data
tweets_train, tweets_test, sentiment_train, sentiment_test = \
    train_test_split(dfTwitter['tweet'], dfTwitter['sentiment'], test_size=0.2)

print("Train data: ", len(tweets_train), "\nTest data: ", len(tweets_test),
      "\nAll data: ", len(sentiment_train) + len(sentiment_test))




#########################################################
# Step 3: Set Parameters for Classifier

# Classifier Pipeline
pipeline = Pipeline([
   ('vect', TfidfVectorizer()),
   ('clf', LogisticRegression())
])

# Hyper Parameters
parameters = {
    'vect__analyzer': (
#         tokenizer.StemNoStops,
#                        tokenizer.StemNoEnglishStops,
#                        tokenizer.StemNoSoccerStops,
#                        tokenizer.StemNoNegation,
                       tokenizer.Stem,
#                        tokenizer.LemmaNoStops,
#                        tokenizer.LemmaNoEnglishStops,
#                        tokenizer.LemmaNoSoccerStops,
#                        tokenizer.LemmaNoNegation,
                       tokenizer.Lemma),
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
#     'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}

# Grid Search
grid_search = GridSearchCV(
    pipeline,  # pipeline from above
    parameters,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(sentiment_train, n_folds=5),  # what type of cross validation to use
)

Train data:  5772 
Test data:  1443 
All data:  7215


In [14]:
#########################################################
# Step 4: Compute Classifier

%time grid_search.fit(tweets_train, sentiment_train)

print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
   print('\t%s: %r' % (param_name, best_parameters[param_name]))


predictions = grid_search.predict(tweets_test)
print("\n\n\n[Test Results]:\n")
print('Accuracy:', accuracy_score(sentiment_test, predictions))
print('Precision:', precision_score(sentiment_test, predictions))
print('Recall:', recall_score(sentiment_test, predictions))

CPU times: user 46.1 s, sys: 6.48 s, total: 52.6 s
Wall time: 3h 18min 59s
Best score: 0.886
Best parameters set:
	clf__C: 10
	clf__penalty: 'l1'
	vect__analyzer: <function Lemma at 0x10736b378>
	vect__max_df: 0.25
	vect__max_features: 5000
	vect__ngram_range: (1, 1)
	vect__norm: 'l2'



[Test Results]:

Accuracy: 0.878724878725
Precision: 0.873040752351
Recall: 0.855606758833


In [15]:
print('Best parameters set:')
grid_search.best_params_

Best parameters set:


{'clf__C': 10,
 'clf__penalty': 'l1',
 'vect__analyzer': <function tokenizer.Lemma>,
 'vect__max_df': 0.25,
 'vect__max_features': 5000,
 'vect__ngram_range': (1, 1),
 'vect__norm': 'l2'}

In [12]:
pprint(grid_search.grid_scores_)

[mean: 0.83056, std: 0.01476, params: {'vect__ngram_range': (1, 1), 'vect__analyzer': <function StemNoStops at 0x1072c8e18>},
 mean: 0.83056, std: 0.01476, params: {'vect__ngram_range': (1, 2), 'vect__analyzer': <function StemNoStops at 0x1072c8e18>},
 mean: 0.85031, std: 0.01524, params: {'vect__ngram_range': (1, 1), 'vect__analyzer': <function StemNoNegation at 0x10736b268>},
 mean: 0.85031, std: 0.01524, params: {'vect__ngram_range': (1, 2), 'vect__analyzer': <function StemNoNegation at 0x10736b268>},
 mean: 0.85100, std: 0.01335, params: {'vect__ngram_range': (1, 1), 'vect__analyzer': <function Stem at 0x10736b2f0>},
 mean: 0.85100, std: 0.01335, params: {'vect__ngram_range': (1, 2), 'vect__analyzer': <function Stem at 0x10736b2f0>},
 mean: 0.83039, std: 0.01778, params: {'vect__ngram_range': (1, 1), 'vect__analyzer': <function LemmaNoStops at 0x10736b400>},
 mean: 0.83039, std: 0.01778, params: {'vect__ngram_range': (1, 2), 'vect__analyzer': <function LemmaNoStops at 0x10736b400>}

In [None]:
##########################################################
# Step 3: Save Detecter

os.chdir("/Users/Bya/Dropbox/Research/datas/Detecter/")
with open('log_tweets_pn_detector-160114.pkl', 'wb') as fout:
    pickle.dump(grid_search, fout)