In [None]:
# General imports
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import collections
import os
from time import time

# Data Science
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sb

# Natural Language processing
import nltk

# Algorithms / estimators
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Process
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from scipy import interp

# Corpus
from documentModel import DocumentModel as DM
from export_results import *

from scipy import stats

def is_significant(model_1, model_2, significance = 0.05, scores = None):
    """Print whether two models' paired scores differ per a Wilcoxon signed-rank test.

    Parameters
    ----------
    model_1, model_2 : hashable
        Keys selecting the two paired score sequences to compare.
    significance : float, optional
        Threshold; a p-value strictly below it is reported as significant.
    scores : mapping, optional
        Mapping from model key to its sequence of scores. When omitted, the
        notebook-level ``recall_avg`` is used, as in the original version.

    Prints the p-value followed by a one-line verdict; returns nothing.
    """
    if scores is None:
        # Fall back to the notebook global so existing two-argument calls
        # behave exactly as before.
        scores = recall_avg

    # stats.wilcoxon returns (statistic, p-value); only the p-value is needed.
    p_value = stats.wilcoxon(scores[model_1], scores[model_2])[1]
    print(p_value)
    if p_value < significance:
        print("It is statistically significant")
    else:
        print("It is NOT statistically significant")

def save_image(image, url='../images/', name = 'default'):
    """Save ``image`` via its ``savefig`` method.

    The destination is the plain concatenation ``url + name``; no path
    separator is inserted, so ``url`` must carry its own trailing slash.

    Parameters
    ----------
    image : object exposing ``savefig(path)``
    url : str, optional
        Prefix of the destination path.
    name : str, optional
        Suffix of the destination path (typically the file name).
    """
    destination = url + name
    image.savefig(destination)

In [None]:
# Hand-picked Spanish stop words.
# NOTE(review): no consumer of this list is visible in this part of the
# notebook — presumably it is passed to a vectorizer later; confirm.
stop_words = [
    'a', 'bajo', 'en', 'para', 'un', 'la', 'el', 'los', 'las', 'su',
    'sus', 'través', 'al', 'con', 'más', 'muy', 'cual', 'poco', 'que',
]

print("Transforming annotated files into training datasets...")
dm = DM()
# get_sentences(0) returns a mapping with "data" and "target" entries;
# the meaning of the 0 argument is defined by DocumentModel (not shown here).
fito_dataset = dm.get_sentences(0)
X = fito_dataset["data"]
y = fito_dataset["target"]

print("OK")

In [None]:
import itertools

# Weights tried for the false-negative and false-positive terms of the cost.
fn_c = [1, 5, 10]
fp_c = [1, 5, 10]

colors = "rgbmy"
plt.figure(figsize=(18, 22))

# NOTE(review): `costs`, `models`, `precision_avg` and `recall_avg` must already
# exist in the kernel. In the visible part of the notebook `costs` is only
# assigned in a later cell — confirm an earlier definition so that
# Restart & Run All works.

# One panel per (fn weight, fp weight) combination on a 3x3 grid.
for i, element in enumerate(itertools.product(fn_c, fp_c), start=1):
    plt.subplot(3, 3, i)
    for model, color in zip(models, colors):
        # model[0] is the key into the score dicts and doubles as the legend label.
        # `fpr` is computed as 1 - precision and `fnr` as 1 - recall.
        fpr = 1 - np.array(precision_avg[model[0]])
        fnr = 1 - np.array(recall_avg[model[0]])
        # element = (fn weight, fp weight)
        cost = np.add(fpr * element[1], fnr * element[0])
        plt.plot(costs, cost,
                 color=color, label=model[0])

    # Per-panel decoration only needs to happen once, not once per model.
    plt.xlabel("fn: %s fp:%s" % (element[0], element[1]))
    plt.xlim([2, 30])
    plt.title("COST")
    plt.ylabel("COST")
    # 'best' is the valid automatic-placement keyword ('better' is not a legend location).
    plt.legend(loc='best')
plt.show()

In [None]:
## Chaining PCA and Naïve Bayes

In [None]:
from sklearn import decomposition
from sklearn.naive_bayes import BernoulliNB
import itertools
# NOTE(review): this rebinds `time` to the module, shadowing the function
# brought in by `from time import time` in the first cell; cells that call
# `time()` directly will fail if run after this one.
import time

# Grid: weight given to the second class in the prior x number of PCA components.
costs = np.arange(1, 21, 1)
n_components = np.arange(1, 500, 25)
recalls_pca = []
precisions_pca = []

data_pca = []

t0 = time.time()

# The dense feature matrix is the same for every grid point: build it once.
x_dense = extractor.transform(X).toarray()

# The projection depends only on the component count, so each PCA is fitted
# once and reused for every prior weight instead of being refitted per pair.
scores_pca = {}
for n_comp in n_components:
    pca = decomposition.PCA(n_components=n_comp)
    x_new = pca.fit_transform(x_dense)
    for cost in costs:
        bayes = BernoulliNB(class_prior=[1, cost])
        bayes.fit(x_new, y)
        # NOTE: scores are measured on the same data the model was fitted on.
        y_pred = bayes.predict(x_new)  # predict once, score twice
        scores_pca[(cost, n_comp)] = (recall_score(y, y_pred),
                                      precision_score(y, y_pred))

# Fill the result lists in the original (cost-major) order of
# itertools.product(costs, n_components).
for element in itertools.product(costs, n_components):
    recall, precision = scores_pca[element]
    data_pca.append((element[0], element[1], recall))
    recalls_pca.append(recall)
    precisions_pca.append(precision)
t1 = time.time()