In [None]:
# General imports
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import collections
import os
from time import time

# Data Science
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sb

# Natural Language processing
import nltk

# Algorithms / estimators
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Process
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

# Corpus
from documentModel import DocumentModel as DM
from export_results import *

from scipy import stats

def is_significant(model_1, model_2, significance = 0.05):
    
    p_value = stats.wilcoxon(recall_avg[model_1], recall_avg[model_2])[1]
    print(p_value)
    if p_value < significance:
        print("It is statically significant")
    else:    
        print("It is NOT statically significant")

def save_image(image, url='../images/', name = 'default'):
    image.savefig(url + name)    

In [None]:
stop_words = ['a', 'bajo', 'en', 'para','un', 'la', 'el', 'los', 'las', 'su', 'sus', 'través', 'al','con', \
             'más', 'muy', 'cual', 'poco', 'que']

print("Transforming annotated files into training datasets...")
dm = DM()
fito_dataset = dm.get_sentences(0)
X, y = fito_dataset["data"], fito_dataset["target"]

print("OK")

In [None]:
import itertools

fn_c = [1, 5, 10]
fp_c = [1, 5, 10]

colors = "rgbmy"
i = 1
plt.figure(figsize=(18, 22))

for element in itertools.product(fn_c, fp_c):
    plt.subplot(3,3,i)
    for model, color in zip(models, colors):
        fpr = 1 - np.array(precision_avg[model[0]])
        fnr = 1 - np.array(recall_avg[model[0]])
        cost = np.add(fpr * element[1], fnr * element[0])
        plt.plot(costs, cost, 
                 color=color, label=model[0])
        plt.xlabel("fn: %s fp:%s" % (element[0], element[1]))
        plt.xlim([2, 30])
        plt.title("COST")
    i += 1

    plt.ylabel("COST")
    plt.legend(loc='better')
plt.show()

## Chaining PCA and Naïve Bayes

In [None]:
from sklearn import decomposition
from sklearn.naive_bayes import BernoulliNB
import itertools
import time

costs = np.arange(1, 21, 1)
n_components = np.arange(1, 500, 25)
recalls_pca = []
precisions_pca = []

data_pca = []

#for element in itertools.product(costs, n_components):
t0 = time.time()
for element in itertools.product(costs, n_components):
    bayes = BernoulliNB(class_prior=[1, element[0]])
    pca = decomposition.PCA(n_components=element[1])
    x_new = pca.fit_transform(extractor.transform(X).toarray())
    bayes.fit(x_new, y)
    recall = recall_score(y, bayes.predict(x_new))
    precision = precision_score(y, bayes.predict(x_new))
    data_pca.append((element[0], element[1], recall))
    recalls_pca.append(recall)
    precisions_pca.append(precision)
t1 = time.time()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(18,6))

X_t, Y_t = np.meshgrid(costs, n_components)
recalls_t_pca = np.array(recalls_pca).reshape(X_t.shape)
precisions_t_pca = np.array(precisions_pca).reshape(X_t.shape)
ax = fig.add_subplot(1, 2, 1, projection='3d')
p = ax.plot_surface(costs, n_components, recalls_t_pca, cmap=cm.coolwarm)
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.view_init(azim=-0, elev = 60)
p = ax.plot_surface(costs, n_components, recalls_t_pca, cmap=cm.coolwarm)
cb = fig.colorbar(p, shrink=0.5)
plt.show()

In [None]:
plt.figure(figsize=(16, 6))
plt.subplot(1,2,1)
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(np.array(recalls_t_pca).T, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('costs')
plt.ylabel('n_components')
plt.colorbar()
plt.xticks(np.arange(len(costs)), costs, rotation=45)
plt.yticks(np.arange(len(n_components)), n_components)
plt.title('Recall')

plt.subplot(1,2,2)
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(np.array(precisions_t_pca).T, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('costs')
plt.ylabel('n_components')
plt.xticks(np.arange(len(costs)), costs, rotation=45)
plt.yticks(np.arange(len(n_components)), n_components)
plt.title('Precision')
plt.colorbar()

plt.show()

## Chaining Feature Selection (Supervised) and Naïve Bayes

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
import itertools
import time

costs = np.arange(1, 21, 1)
n_components = np.arange(1, 500, 25)
recalls_sel = []
precisions_sel = []

data_sel = []

t0 = time.time()
for element in itertools.product(costs, n_components):
    bayes = MultinomialNB(class_prior=[1, element[0]])
    sel = SelectKBest(chi2, k=element[1])
    x_new = sel.fit_transform(extractor.transform(X).toarray(),y)
    bayes.fit(x_new, y)
    recall = recall_score(y, bayes.predict(x_new))
    data_sel.append((element[0], element[1], recall))
    recalls_sel.append(recall)
    precisions_sel.append(precision)
t1 = time.time()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(18,6))

X_t, Y_t = np.meshgrid(costs, n_components)
recalls_t_sel = np.array(recalls_sel).reshape(X_t.shape)
precisions_t_sel = np.array(precisions_sel).reshape(X_t.shape)
ax = fig.add_subplot(1, 2, 1, projection='3d')
p = ax.plot_surface(costs, n_components, recalls_t_sel, cmap=cm.coolwarm)
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.view_init(azim=-0, elev = 60)
p = ax.plot_surface(costs, n_components, recalls_t_sel, cmap=cm.coolwarm)
cb = fig.colorbar(p, shrink=0.5)
plt.show()

In [None]:
plt.figure(figsize=(16, 6))

plt.subplot(1,2,1)
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(np.array(recalls_t_sel).T, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('costs')
plt.ylabel('n_components')
plt.colorbar()
plt.xticks(np.arange(len(costs)), costs, rotation=45)
plt.yticks(np.arange(len(n_components)), n_components)
plt.title('Recall')

plt.subplot(1,2,2)
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(np.array(precisions_t_sel).T, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('costs')
plt.ylabel('n_components')
plt.colorbar()
plt.xticks(np.arange(len(costs)), costs, rotation=45)
plt.yticks(np.arange(len(n_components)), n_components)
plt.title('Precision')

plt.show()

In [None]:
is_significant("Decision Tree", "Linear SVC", significance = 0.05)
print("")
is_significant("Random Forest", "Decision Tree", significance = 0.05)
print("")
is_significant("Random Forest", "Linear SVC", significance = 0.05)

## Without changing weights

### Decision Tree

In [None]:
# Classification and ROC analysis
sss = StratifiedShuffleSplit(y, n_iter = 10, test_size=0.2, random_state=0)

X, y = fito_dataset["data"], fito_dataset["target"]
X_trans = extractor.transform(X)

models = [("Decision Tree", DecisionTreeClassifier(class_weight={1:1})),
          ("Decision Tree CS", DecisionTreeClassifier(class_weight={1:100})),
          ("Decision Tree CSS", DecisionTreeClassifier(class_weight={1:1000})),
          ("Decision Tree CSSS", DecisionTreeClassifier(class_weight={1:10000}))]


plt.figure(figsize=(12,8))
plt.plot([0,1], [0,1], '--', color=(0.6,0.6,0.6), label='luck')

for model in models:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0,1,100)
    for i, (train, test) in enumerate(sss):
        probas_ = model[1].fit(X_trans[train], y[train]).predict_proba(X_trans[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1], pos_label=1)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw =1, label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(sss)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, label='%s (area = %0.2f)' % (model[0], mean_auc), lw = 2)
        
plt.ylim([0.6, 1.01])
plt.xlim([0.0, 1.01])
plt.xlabel('False Positive Rate (1-Specifity)')
plt.ylabel('True Positive Rate (Sensivity or Recall)')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### Naive Bayes

In [None]:
# Classification and ROC analysis
sss = StratifiedShuffleSplit(y, n_iter = 10, test_size=0.2, random_state=0)

X, y = fito_dataset["data"], fito_dataset["target"]
X_trans = extractor.transform(X)

models = [("Naive Bayes", MultinomialNB(class_prior=[1, 1], fit_prior=False)),
          ("Naive Bayes CS", MultinomialNB(class_prior=[1, 10], fit_prior=False)),
          ("Naive Bayes CSS", MultinomialNB(class_prior=[1, 100], fit_prior=False)),
          ("Naive Bayes CSSS", MultinomialNB(class_prior=[1, 1000], fit_prior=False)),
         ]


plt.figure(figsize=(12,8))
plt.plot([0,1], [0,1], '--', color=(0.6,0.6,0.6), label='luck')

for model in models:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0,1,100)
    for i, (train, test) in enumerate(sss):
        probas_ = model[1].fit(X_trans[train], y[train]).predict_proba(X_trans[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1], pos_label=1)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw =1, label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(sss)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, label='%s (area = %0.2f)' % (model[0], mean_auc), lw = 2)
        
plt.ylim([0.6, 1.01])
plt.xlim([0.0, 1.01])
plt.xlabel('False Positive Rate (1-Specifity)')
plt.ylabel('True Positive Rate (Sensivity or Recall)')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### Random Forest

In [1]:
# Classification and ROC analysis
sss = StratifiedShuffleSplit(y, n_iter = 10, test_size=0.2, random_state=0)

X, y = fito_dataset["data"], fito_dataset["target"]
X_trans = extractor.transform(X)

models = [("Random Forest", RandomForestClassifier(class_weight={1:1})),
          ("Random Forest CS", RandomForestClassifier(class_weight={1:10})),
          ("Random Forest CSS", RandomForestClassifier(class_weight={1:100})),
          ("Random Forest CSS", RandomForestClassifier(class_weight={1:1000})),]


plt.figure(figsize=(12,8))
plt.plot([0,1], [0,1], '--', color=(0.6,0.6,0.6), label='luck')

for model in models:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0,1,100)
    for i, (train, test) in enumerate(sss):
        probas_ = model[1].fit(X_trans[train], y[train]).predict_proba(X_trans[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1], pos_label=1)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw =1, label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(sss)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, label='%s (area = %0.2f)' % (model[0], mean_auc), lw = 2)
        
plt.ylim([0.6, 1.01])
plt.xlim([0.0, 1.01])
plt.xlabel('False Positive Rate (1-Specifity)')
plt.ylabel('True Positive Rate (Sensivity or Recall)')
plt.title('ROC Curve')
plt.legend(loc="lower right")

plt.show()

NameError: name 'StratifiedShuffleSplit' is not defined

### Linear SVM

In [None]:
# Classification and ROC analysis
sss = StratifiedShuffleSplit(y, n_iter = 10, test_size=0.2, random_state=0)

X, y = fito_dataset["data"], fito_dataset["target"]
X_trans = extractor.transform(X)

models = [("Linear SVC", SVC(kernel = "linear", class_weight={1:1}, probability=True)),
          ("Linear SVC CS", SVC(kernel = "linear", class_weight={1:10}, probability=True)),
          ("Linear SVC CSS", SVC(kernel = "linear", class_weight={1:100}, probability=True)),
          ("Linear SVC CSSS", SVC(kernel = "linear", class_weight={1:1000}, probability=True))
         ]


plt.figure(figsize=(12,8))
plt.plot([0,1], [0,1], '--', color=(0.6,0.6,0.6), label='luck')

for model in models:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0,1,100)
    for i, (train, test) in enumerate(sss):
        probas_ = model[1].fit(X_trans[train], y[train]).predict_proba(X_trans[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1], pos_label=1)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw =1, label = 'ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(sss)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, label='%s (area = %0.2f)' % (model[0], mean_auc), lw = 2)
        
plt.ylim([0.6, 1.01])
plt.xlim([0.0, 1.01])
plt.xlabel('False Positive Rate (1-Specifity)')
plt.ylabel('True Positive Rate (Sensivity or Recall)')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

### All together

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
stop_words = ['a', 'bajo', 'en', 'para','un', 'la', 'el', 'los', 'las', 'su', 'sus', 'través', 'al','con', \
             'más', 'muy', 'cual', 'poco', 'que']

print("Transforming annotated files into training datasets...")
dm = DM()
fito_dataset = dm.get_sentences(0)
X, y = fito_dataset["data"], fito_dataset["target"]

print("OK")

In [None]:
pipe_nb = Pipeline([('vect', CountVectorizer(stop_words=stop_words)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB(fit_prior=False))])

param_grid = [{ 'vect__tokenizer': (LemmaTokenizer(), None),
                'vect__ngram_range': [(1, 1), (1, 2), (2,2), (1,3)],
                'tfidf__use_idf': (True, False),
                'clf__class_prior': [[0.5, 0.5],[0.1, 0.9],[0.01, 0.99]],
              }]


cross_validation = StratifiedKFold(y, n_folds=5)

gs_nb = GridSearchCV(estimator=pipe_nb,
                 param_grid=param_grid,
                 scoring='recall',
                 cv=5)

gs_nb.fit(X, y)

In [None]:
print(gs_nb.best_params_)

## Weighting schema selection

In [1]:
def init_ws():
    recall_avg = {}
    recall_std = {}
    precision_avg = {}
    precision_std = {}
    
    for extractor in extractors:
        recall_avg[extractor[0]] = []
        recall_std[extractor[0]] = []
        precision_avg[extractor[0]] = []
        precision_std[extractor[0]] = []
        
    return recall_avg, recall_std, precision_avg, precision_std

In [None]:
extractors = [("TF-IDF", TfidfVectorizer(use_idf=False, stop_words=stop_words)),
              ("TF", TfidfVectorizer(use_idf=True, stop_words=stop_words))]

recall_avg, recall_std, precision_avg, precision_std = init_ws()

for extractor in extractors:
    extractor[1].fit(X)
    for cost in costs:
        model = MultinomialNB(class_prior=[1/cost, (cost-1)/cost])
        precisions = []
        recalls = []
        for train_index, test_index in sss:
            X_train, X_test = extractor[1].transform(X)[train_index], extractor[1].transform(X)[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            precisions.append(precision_score(y_test, model.predict(X_test), pos_label=1))
            recalls.append(recall_score(y_test, model.predict(X_test), pos_label=1))
        recall_avg[extractor[0]].append(np.average(recalls))
        recall_std[extractor[0]].append(np.std(recalls))
        precision_avg[extractor[0]].append(np.average(precisions))
        precision_std[extractor[0]].append(np.std(precisions))

In [None]:
plot_image(axis_costs, recall_avg, title="RECALL", ylim = [0.9, 1.02], 
           xlim = cxlim, colors="rg", models=extractors, name="recall_ws", labels = labels)

In [None]:
plot_image(axis_costs, precision_avg, title="PRECISION", ylim = [0.3, 0.8], 
           xlim = cxlim, colors="rg", models=extractors, name="precision_ws", labels = labels)