In [1]:
%pylab inline
%matplotlib inline

# Global Imports
import sys
import os
import time
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint

# Local Imports
sys.path.append("../utils/")
sys.path.append("../config/")

import tokenizer
import useful_methods
import train_datas

Populating the interactive namespace from numpy and matplotlib


ImportError: No module named 'tokenizer'

In [None]:
# Scikit-Learn imports
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.learning_curve import learning_curve

### Step 1. Data Load

In [None]:
##########################################################

# Data Load
#
# Reads the hashtag/Emolex tweet table and collapses its Emolex sentiment
# categories into a binary polarity dataset `df` with columns
# 'text' and 'label' (1 = positive, 0 = negative).

# Prepare Data => Soccer Hash Emolex
# NOTE(review): `emolex` is not imported by any cell visible in this notebook,
# so this line raises NameError on a fresh kernel unless the module is imported
# elsewhere. Neither returned value is used by the cells shown here.
dic_emolex_soccer, y = emolex.EmolexSoccerDic()

# Read Hash Emolex CSV
dfHashEmolex = train_datas.HashEmolexAllRead()

# Emolex categories that map to each polarity class
NEGATIVE_SENTIMENTS = frozenset(['anger', 'disgust', 'fear', 'sadness', 'negative'])
POSITIVE_SENTIMENTS = frozenset(['anticipation', 'joy', 'positive', 'trust'])

# Labeling Emolex 8 cat => POS, NEG
texts = []
labels = []

# Walk the two columns directly instead of doing a positional `.iloc[i][...]`
# lookup (which materialises a whole row) twice per record.
# assumes 'sentiments' is a comma-separated string without surrounding
# spaces — TODO confirm against the CSV
for text, sentiments in zip(dfHashEmolex['text'], dfHashEmolex['sentiments']):
    sentiment = sentiments.split(',')

    # The first category that belongs to either set decides the label;
    # rows whose categories match neither set are skipped.
    for sent in sentiment:
        if sent in NEGATIVE_SENTIMENTS:
            texts.append(text)
            labels.append(0) # 0 is negative
            break

        elif sent in POSITIVE_SENTIMENTS:
            texts.append(text)
            labels.append(1) # 1 is positive
            break


# Create New POS, NEG dataframe
df = pd.DataFrame({'text': texts, 'label': labels}, columns=['text', 'label'])

print("\n\nPOS: ", len(df[df.label == 1]))
print("NEG: ", len(df[df.label == 0]))
print("\nAll: ", len(df))

In [None]:
# Directory that receives the derived POS/NEG dataset. The default is the
# original author's machine-specific location; set the TWEETS_PN_DIR
# environment variable to redirect the export on any other machine.
TWEETS_PN_DIR = os.environ.get(
    "TWEETS_PN_DIR",
    "/Users/Bya/Dropbox/Research/datas/TweetsPN/"
)

# Hand the labelled frame to the project helper together with the target
# directory and base file name.
useful_methods.DFtoCSV(
    df=df,
    pathToSave=TWEETS_PN_DIR,
    fileName="tweet_hash_emolex_pn",
    index=False
)

### Step 2. Data Split

In [None]:
##########################################################

# Split data Train and Test data
# A fixed seed keeps the 80/20 partition identical across kernel restarts, so
# the cross-validation and test scores reported below are reproducible.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=SPLIT_SEED
)

print(
    "Train data: \t", len(X_train),
    "\nTest data: \t", len(X_test),
    "\nAll data: \t", len(y_train) + len(y_test)
)

### Step 3. Define Classifiers

In [None]:
##########################################################

# [Naive Bayes]

# Stage 1: raw strings -> TF-IDF weighted term vectors.
# The analyzer is a callable from the project-local `tokenizer` module; the
# grid below swaps it between the Stem and Lemma variants.
tfidf_nb = TfidfVectorizer(
    analyzer=tokenizer.Lemma,
    use_idf=True,       # inverse-document-frequency reweighting on
    max_df=1.0,         # document-frequency threshold
    max_features=None,  # no cap on vocabulary size
)

# Stage 2: multinomial Naive Bayes trained on those vectors
pipeline_nb = Pipeline(steps=[
    ('vect', tfidf_nb),
    ('clf', MultinomialNB()),
])

# Candidate settings explored by the search
params_nb = {
    'vect__analyzer': (tokenizer.Stem, tokenizer.Lemma),
}

# Stratified 5-fold splitter built from the training labels
folds_nb = StratifiedKFold(y_train, n_folds=5)

# Cross-validated search: optimise accuracy, use every core, and refit the
# best combination on all the data passed to fit()
grid_nb = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=params_nb,
    scoring='accuracy',
    cv=folds_nb,
    n_jobs=-1,
    refit=True,
)

In [None]:
##########################################################

# [Logistic Regression]

# Feature extraction step: text -> TF-IDF vectors. The analyzer callable comes
# from the project-local `tokenizer` module and is one of the tuned parameters.
tfidf_log = TfidfVectorizer(
    analyzer=tokenizer.Lemma,
    use_idf=True,       # apply inverse-document-frequency weights
    max_df=1.0,         # document-frequency threshold
    max_features=None,  # keep the full vocabulary
)

# Full pipeline: vectorizer followed by a default LogisticRegression
pipeline_log = Pipeline(steps=[
    ('vect', tfidf_log),
    ('clf', LogisticRegression()),
])

# Parameter grid: only the analyzer is varied
params_log = {
    'vect__analyzer': (tokenizer.Stem, tokenizer.Lemma),
}

# 5-fold stratified cross-validation over the training labels
folds_log = StratifiedKFold(y_train, n_folds=5)

# Accuracy-driven grid search on all cores; the winning configuration is
# refit on the complete data given to fit()
grid_log = GridSearchCV(
    estimator=pipeline_log,
    param_grid=params_log,
    scoring='accuracy',
    cv=folds_log,
    n_jobs=-1,
    refit=True,
)

In [None]:
##########################################################

# [Decision Trees]
# Note: despite the `_dt` names, the estimator built here is a
# RandomForestClassifier (an ensemble of trees), not a single
# DecisionTreeClassifier.

# Text -> TF-IDF vectors; analyzer callable taken from the project-local
# `tokenizer` module and varied by the grid below
tfidf_dt = TfidfVectorizer(
    analyzer=tokenizer.Lemma,
    use_idf=True,       # inverse-document-frequency reweighting on
    max_df=1.0,         # document-frequency threshold
    max_features=None,  # vocabulary size not limited
)

# Tree ensemble; the split criterion set here is also a tuned parameter
forest_dt = RandomForestClassifier(criterion='entropy')

pipeline_dt = Pipeline(steps=[
    ('vect', tfidf_dt),
    ('clf', forest_dt),
])

# Grid: two analyzers x two split criteria
params_dt = {
    'vect__analyzer': (tokenizer.Stem, tokenizer.Lemma),
    'clf__criterion': ('entropy', 'gini'),
}

# Stratified 5-fold splitter over the training labels
folds_dt = StratifiedKFold(y_train, n_folds=5)

# Grid search scored by accuracy, parallelised over all cores, refitting the
# best combination at the end
grid_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=folds_dt,
    n_jobs=-1,
    refit=True,
)

In [None]:
##########################################################

# [Support Vector Machines]

# TF-IDF feature extraction; the analyzer callable (project-local `tokenizer`
# module) is the parameter explored by the search
tfidf_svm = TfidfVectorizer(
    analyzer=tokenizer.Lemma,
    use_idf=True,       # inverse-document-frequency reweighting on
    max_df=1.0,         # document-frequency threshold
    max_features=None,  # vocabulary size not limited
)

# Linear-kernel SVC with C=1 (the library default). `probability` is left
# unset, so this estimator is configured without probability estimates.
svc_svm = SVC(kernel='linear', C=1)

pipeline_svm = Pipeline(steps=[
    ('vect', tfidf_svm),
    ('clf', svc_svm),
])

# Tuning grid: analyzer only
params_svm = {
    'vect__analyzer': (tokenizer.Stem, tokenizer.Lemma),
}

# Stratified 5-fold splitter over the training labels
folds_svm = StratifiedKFold(y_train, n_folds=5)

# Accuracy-scored grid search on all cores with a final refit of the best
# parameter combination
grid_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=params_svm,
    scoring='accuracy',
    cv=folds_svm,
    n_jobs=-1,
    refit=True,
)

### Step 4: Compute Tuning

In [None]:
########################################################

# Run the Naive Bayes grid search on the training split; the %time magic
# reports how long the fit took.
%time grid_nb.fit(X_train, y_train)

# print params
# NOTE(review): DetecterParams is defined in a later cell of this notebook; on
# a fresh kernel that cell must be executed first or this call raises NameError.
DetecterParams(grid_nb, title="NB", all_tunes=True)

In [None]:
########################################################

# Run the Logistic Regression grid search on the training split; the %time
# magic reports how long the fit took.
%time grid_log.fit(X_train, y_train)

# print params
# NOTE(review): DetecterParams is defined in a later cell of this notebook; on
# a fresh kernel that cell must be executed first or this call raises NameError.
DetecterParams(grid_log, title="Log", all_tunes=True)

In [None]:
########################################################

# Run the tree-ensemble grid search (`grid_dt`) on the training split; the
# %time magic reports how long the fit took.
%time grid_dt.fit(X_train, y_train)

# print params
# NOTE(review): DetecterParams is defined in a later cell of this notebook; on
# a fresh kernel that cell must be executed first or this call raises NameError.
DetecterParams(grid_dt, title="DT", all_tunes=True)

In [None]:
########################################################

# Run the SVM grid search on the training split; the %time magic reports how
# long the fit took.
%time grid_svm.fit(X_train, y_train)

# print params
# NOTE(review): DetecterParams is defined in a later cell of this notebook; on
# a fresh kernel that cell must be executed first or this call raises NameError.
DetecterParams(grid_svm, title="SVM", all_tunes=True)

### Step 5: Compare Detectors

In [None]:
# Report classification metrics of the fitted logistic-regression search on
# the held-out test split.
# NOTE(review): DetecterMetrics is defined in a later cell of this notebook;
# run that cell first on a fresh kernel.
DetecterMetrics(X_test, y_test, grid_log, title="Test")

In [None]:
# Draw the ROC curve of the fitted logistic-regression search on the test split.
# NOTE(review): PlotRocAuc is defined in a later cell of this notebook; run
# that cell first on a fresh kernel.
PlotRocAuc(X_test, y_test, grid_log, title="Log")

In [None]:
# Print Test Prediction
def DetecterMetrics(features, labels, detecter, title=""):
    """Print a classification report and the accuracy of a fitted detector.

    Parameters
    ----------
    features : samples handed to ``detecter.predict``.
    labels : true labels the predictions are compared against.
    detecter : any object exposing ``predict(features)``.
    title : text inserted into the header line of the printout.

    Returns
    -------
    None. Everything is written to standard output.
    """
    predicted = detecter.predict(features)

    header = "[%s Results]: \n" % title
    print(header)

    report = classification_report(labels, predicted)
    print(report)

    accuracy = accuracy_score(labels, predicted)
    print('Accuracy: ', accuracy)

In [None]:
# Print Training Results
def DetecterParams(detecter, title="", all_tunes=False):
    """Print the outcome of a fitted parameter search.

    Parameters
    ----------
    detecter : fitted search object exposing ``best_score_`` and
        ``best_params_``, plus ``grid_scores_`` or ``cv_results_`` when
        ``all_tunes`` is requested.
    title : text inserted into the header line of the summary.
    all_tunes : when True, the scores of every evaluated parameter
        combination are pretty-printed before the summary.

    Returns
    -------
    None. Everything is written to standard output.
    """
    if all_tunes:
        print("[All Params Results]:\n")
        # `grid_scores_` only exists on the legacy grid-search API; newer
        # fitted searches expose the per-candidate results as `cv_results_`.
        if hasattr(detecter, 'grid_scores_'):
            pprint(detecter.grid_scores_)
        else:
            pprint(detecter.cv_results_)
        print("\n")

    print("[%s Detecter Params]: \n" % title)
    print("Best Score: ", detecter.best_score_)
    print("Best Params: ", detecter.best_params_)

In [None]:
# Receiver Operating Characteristic = ROC curve
# Visualizes a classifier's performance
# for all values of the discrimination threshold.
# fall out: F = FP / (TN + FP)
# AUC (area under the curve)
def PlotRocAuc(features, labels, detecter, title=""):
    """Plot the ROC curve of a fitted binary detector and show its AUC.

    Parameters
    ----------
    features : samples to score.
    labels : true binary labels of those samples.
    detecter : fitted object exposing ``predict_proba(features)`` (second
        column is used as the score) or, failing that,
        ``decision_function(features)``.
    title : text appended to the figure title.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Score every sample. Estimators configured without probability output
    # (e.g. an SVC left at probability=False) have no predict_proba; their
    # decision_function values rank samples just as well for a ROC curve.
    if hasattr(detecter, 'predict_proba'):
        scores = detecter.predict_proba(features)[:, 1]
    else:
        scores = detecter.decision_function(features)

    # calculate Fall Out & Recall (the thresholds themselves are not needed)
    false_positive_rate, recall, _ = roc_curve(labels, scores)

    # ROC AUC
    roc_auc = auc(false_positive_rate, recall)

    # Plot
    plt.title('Receiver Operating Characteristic: ' + title)
    plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)

    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

In [None]:
##########################################################
# Step 6: Save Detecter

# No earlier cell binds `log_detector`, so on a fresh kernel the dump below
# failed with NameError. Persist the pipeline that the logistic-regression
# search refit on the best parameter combination (refit=True).
log_detector = grid_log.best_estimator_

# NOTE(review): machine-specific absolute path; os.chdir also changes the
# working directory for every cell executed afterwards.
os.chdir("/Users/Bya/Dropbox/Research/datas/Detecter/")
with open('log_tweets_pn_detector-20160114.pkl', 'wb') as fout:
    pickle.dump(log_detector, fout)

### Classify

In [None]:
# Read Detecter
# SECURITY: unpickling executes code embedded in the file; only load pickles
# that were produced by this notebook / a trusted source.
# NOTE(review): machine-specific absolute path; os.chdir also changes the
# working directory for every cell executed afterwards.
os.chdir("/Users/Bya/Dropbox/Research/datas/Detecter/")
with open('log_tweets_pn_detector-20160114.pkl', 'rb') as f:
    # Public API for what was previously done by instantiating the private
    # pickle._Unpickler and setting its `encoding` attribute by hand.
    log_detector_reloaded = pickle.load(f, encoding='utf-8')

# `classifier` is the name the classification cell below works with
classifier = log_detector_reloaded
print(log_detector_reloaded)

In [None]:
# Show the reloaded classifier's output next to the stored sentiment for the
# first ten index labels (0..9) of `dfTwitter`.
# NOTE(review): `dfTwitter` is not created by any cell visible in this
# notebook; it must be loaded before this cell runs.
for index in range(10):
    tweet = dfTwitter.tweet[index]
    label = dfTwitter.sentiment[index]
    print("\n\n================================")
    print("[Tweet]:\n", tweet)
    print("[Sentiment]: ", label)
    print("\n[Classifier]:")
    # The pipeline's vectorizer expects an iterable of documents, not a bare
    # string, so the single tweet is wrapped in a one-element list.
    sample = [tweet]
    print (classifier.predict_proba(sample)[0], classifier.predict(sample)[0])