In [2]:
# General imports
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import collections
import os
import time
import re
import itertools

# Data Science
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Natural Language processing
import nltk

## Stemming
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Learning Algorithms / estimators
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier


# Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer, confusion_matrix

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Process
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.pipeline import Pipeline


# Corpus
from documentModel import DocumentModel as DM
from export_results import *
from utils import *

In [4]:
from multiprocessing import Process
def runInParallel(fns):
  proc = []
  for fn in fns:
    p = Process(target=fn)
    p.start()
    proc.append(p)
  for p in proc:
    p.join()
    
def total_cost_ratio(ground_truth = None, predictions = None, price = 1, expensive_class=1):
    
    n_class_to_remove = len(ground_truth[ground_truth==expensive_class])
    cm = confusion_matrix(ground_truth, predictions)
    
    if expensive_class == 1:
        n_fp = cm[0][1]
        n_fn = cm[1][0]
    elif expensive_class == 0:
        n_fp = cm[1][0]
        n_fn = cm[0][1]
    
    if n_fn != 0 or n_fp != 0:
        result = n_class_to_remove/(price * n_fn + n_fp) 
    else:
        result = 100.0

    return result
    
def normalize(recall_avg, precision_avg):
    recall_avg_normalized = {}

    for key, value in recall_avg.items():
        recall_avg_normalized[key] = []

    for key, values in recall_avg.items():
        for value in values:
            recall_avg_normalized[key].append(float(value * 100))
 

    precision_avg_normalized = {}

    for key, value in precision_avg.items():
        precision_avg_normalized[key] = []

    for key, values in precision_avg.items():
        for value in values:
            precision_avg_normalized[key].append(float(value * 100)) 
            
    return recall_avg_normalized, precision_avg_normalized

def write_results(n_experiment, nlp, algorithm, cost_ratio, precision, recall):
    import pymysql.cursors

    connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='agriculture_experiments',
                             charset='utf8')

    try:
        with connection.cursor() as cursor:
            sql = "INSERT INTO test_results(n_experiment, nlp, algorithm, cost_ratio, prec, recall) \
            VALUES(%s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, (n_experiment, nlp, algorithm, cost_ratio, str(precision), str(recall)))
            connection.commit()
    
    except Exception as e:
        print(e)
    finally:
        connection.close()
        
def reset_results():
    import pymysql.cursors

    connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='agriculture_experiments',
                             charset='utf8')

    try:
        with connection.cursor() as cursor:
            sql = "TRUNCATE TABLE test_results;"
            cursor.execute(sql)
            connection.commit()
    
    except Exception as e:
        print(e)
    finally:
        connection.close()

In [5]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = SnowballStemmer(language="spanish")
    
    def __call__(self, doc):
        return [self.wnl.stem(t) for t in word_tokenize(doc)]

In [5]:
def init():
    pipelines = []

    for element in itertools.product(estimators, nlp):
        name = element[1][0] + '-' + element[0][0]
        pipeline = Pipeline([('nlp', element[1][1]), ('clf', element[0][1])])
        pipelines.append((name , pipeline))
        
    return pipelines

# Business Rule Classification

In [6]:
stop_words = ['a', 'bajo', 'en', 'para','un', 'la', 'el', 'los', 'las', 'su', 'sus', 'través', 'al','con', \
             'más', 'muy', 'cual', 'poco', 'que']

print("Transforming annotated files into training datasets...")
dm = DM()
fito_dataset = dm.get_sentences(0)
X, y = fito_dataset["data"], fito_dataset["target"]

print("OK")

Transforming annotated files into training datasets...
OK


## Experiment Variables

In [7]:
labels = ["1:1","1:2","1:4","1:6","1:10","1:25", "1:50", r"1:$10^2$", r"1:$10^3$", r"1:$10^6$"]
# like the ShuffleSplit strategy, stratified random splits do not guarantee 
#that all folds will be different, although this is still very likely for sizeable datasets

costs = np.array([2, 3, 4, 6, 10, 25, 50, 100, 1000, 1000000])
axis_costs = np.arange(1,11,1)
cxlim = [0.8, 10.15]
estimators = [("Naive Bayes", MultinomialNB(fit_prior=False)), 
              ("Random Forest", RandomForestClassifier(n_estimators=20, n_jobs=2)), 
              ("SVM", SVC(kernel='linear', C = 0.1)),
              ("Logistic", LogisticRegression()),
              ("LDA", LDA()),
              ("Perceptron", LogisticRegression()),
              ("Baseline", DummyClassifier(strategy = "constant", constant=1))]

nlp = [("None", TfidfVectorizer(use_idf = True, stop_words=stop_words)),
       ("Stemming", TfidfVectorizer(use_idf = True, stop_words=stop_words, tokenizer=LemmaTokenizer())), 
       ("Bigrams", TfidfVectorizer(use_idf = True, stop_words=stop_words, ngram_range=(2, 2))), 
       ("Combination", TfidfVectorizer(use_idf = True, stop_words=stop_words, ngram_range=(1, 2)))]

## Main Experiment Loop

In [8]:
def experiment(pattern = ".*"):
    
    import random
    
    t0 = time.time()
    pipes = init()
    
    random_state = random.randint(0,100)
    sss = StratifiedKFold(y, n_folds = 10, shuffle = True, random_state = random_state)
    for estimator in pipes:
        name = estimator[0]
        extractor = estimator[1].steps[0][1].fit(X)
        if re.match(pattern, name) is not None:
            print("Trying: " + name + " ...")
            for cost in costs:
                if "Naive Bayes" in name:
                    model = estimator[1].set_params(clf__class_prior=[1/cost, (cost-1)/cost]).steps[1][1]
                elif "Random Forest" in name:
                    model = estimator[1].set_params(clf__class_weight={1:cost-1}).steps[1][1]
                elif "SVM" in name:
                    model = estimator[1].set_params(clf__class_weight={1:cost-1}).steps[1][1]
                elif "Logistic" in name:
                    model = estimator[1].set_params(clf__class_weight={1:cost-1}).steps[1][1]
                elif "LDA" in name:
                    model = estimator[1].set_params(clf__priors=[1/cost, (cost-1)/cost]).steps[1][1]
                elif "Perceptron" in name:
                    model = estimator[1].set_params(clf__class_weight={1:cost-1}).steps[1][1]
                elif "Baseline" in name:
                    model = estimator[1].steps[1][1]
            
                precisions = []
                recalls = []
                for train_index, test_index in sss:
                    X_train, X_test = \
                            extractor.transform(X)[train_index], extractor.transform(X)[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    model.fit(X_train, y_train)
                    precisions.append(precision_score(y_test, model.predict(X_test), pos_label=1))
                    recalls.append(recall_score(y_test, model.predict(X_test), pos_label=1))
                    
                write_results(str(random_state), name.split("-")[0], name.split("-")[1], \
                                  str(cost), np.mean(np.array(precisions)), \
                                  np.mean(np.array(recalls)))
                
    t1 = time.time()
    print()
    print("Execution time: %.3f min" % ((t1 - t0)/60))

In [10]:
#reset_results()
#runInParallel([experiment] * 10)
for i in range(9):
    experiment(pattern=".*Perceptron.*")

Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Perceptron ...
Trying: Combination-Perceptron ...

Execution time: 2.282 min
Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Perceptron ...
Trying: Combination-Perceptron ...

Execution time: 2.234 min
Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Perceptron ...
Trying: Combination-Perceptron ...

Execution time: 2.241 min
Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Perceptron ...
Trying: Combination-Perceptron ...

Execution time: 2.264 min
Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Perceptron ...
Trying: Combination-Perceptron ...

Execution time: 2.248 min
Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Perceptron ...
Trying: Combination-Perceptron ...

Execution time: 2.270 min
Trying: None-Perceptron ...
Trying: Stemming-Perceptron ...
Trying: Bigrams-Percep