In [4]:
# General imports
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import collections
import os
import time
import re
import itertools

# Data Science
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Plotting
from matplotlib.markers import MarkerStyle
from PIL import Image

# Corpus
from export_results import *
from utils import *
from scipy.stats import *

In [5]:
#".*Stemming.*(Bayes|SVM)"    
def summary(recalls, precisions, pattern=".*"):
    r, p = filter_results(recalls, precisions, pipes, pattern) 
    max_recall = 0.0
    best_precision = 0.0
    best_cost = "1:1"
    best_algorithm = ''

    for (name, recalls), (name_2, precisions) in zip(r.items(), p.items()):
        print("Executing: " + name + " ...")
        recs = []
        pres = []
        recs.append(np.mean(recalls))
        pres.append(np.mean(precisions))
        for i, (recall, precision) in enumerate(zip(recalls, precisions)):
            if recall > max_recall:
                best_cost = labels[i]
                max_recall = recall
                best_precision = precisions[i]
                best_algorithm = name
            if recall == max_recall and precision > best_precision:
                best_cost = labels[i]
                max_recall = recall
                best_precision = precisions[i]
                best_algorithm = name
                
    return best_cost, max_recall, best_precision, best_algorithm, np.mean(recs), np.mean(pres) 

def load_results():
    import pymysql.cursors

    connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='agriculture_experiments',
                             charset='utf8')
    results = []

    try:
        sql = "SELECT * FROM results"
        results = pd.read_sql(sql, connection)
    except Exception as e:
        print(e)
    finally:
        connection.close()
        
    return results


def save_image(image, url='../images/', name = 'default'):
    image.savefig(url + name)
    Image.open(url + name + '.png').convert('L').save(url + name + '.png')
    
def plot_image(x, y, title="title", ylim = [0, 1.02], xlim = [2, 50.5], 
               colors="rgbmyc", models=None, name="name", labels=[], ylabel = "ylabel", 
               loc="better", markers=".,ov<>", pattern=""):
    plt.figure(figsize=(14,13))
    plt.ylim(ylim)
    plt.xlim(xlim)
    plt.xlabel("Misclassification Cost Ratio")
    plt.ylabel(ylabel)
    plt.style.use('paper.mplstyle')
    
    filled_markers = ('<', 'D', 'o', '|', 'v', '>', 'p', 'd') #' '^', ', '>', '8', 's', 'p', '*', 'h', 'H', , 'd')
    fillstyles = ('full', 'full', 'full', 'full', 'top', 'none')

    i = 0
    for model in models:
        model_name = model[0]
        if re.match(pattern, model_name) is not None:
            plt.plot(x, y[model_name])
            marker = MarkerStyle(marker=filled_markers[i], fillstyle=fillstyles[i])
            plt.scatter(x, y[model_name], marker=marker, s=300, label=model[0])
            i+=1
    
    plt.xticks(x, labels, rotation='vertical')
    plt.legend(loc=loc, prop={'size':30})

    save_image(plt,'../images/', name)
    plt.show()
    
def filter_results(recalls, precisions, models=None, pattern=".*"):

    recalls_avg = {}
    precisions_avg = {}

    for model in models:
        name = model[0]
        if re.match(pattern, name) is not None:
            recalls_avg[name] = recalls[name]
            precisions_avg[name] = precisions[name]
    
    return recalls_avg, precisions_avg

In [6]:
data = load_results()

In [10]:
bayes = data[data["algorithm"] == "Naive Bayes"]["recall"]
svm = data[data["algorithm"] == "SVM"]["recall"]

### Parametric

In [11]:
z_stat, p_val = ttest_ind(bayes, svm, equal_var=False)
if p_val < 0.05:
    print("Statistically significan different results")

### Non-parametric

In [12]:
z_stat, p_val = wilcoxon(bayes, svm, zero_method='wilcox', correction=False)
if p_val < 0.05:
    print("Statistically significan different results")

ValueError: Unequal N in wilcoxon.  Aborting.

In [None]:
dp = data[data["recall"] == data["recall"].max()]
dp[dp["prec"] == dp["prec"].max()]

In [None]:
best_cost, max_recall, best_precision, best_algorithm, mean_recall, mean_precision = \
    summary(recall_avg_normalized, precision_avg_normalized, pattern=".*")

In [None]:
print(best_cost)
print(max_recall)
print(best_precision)
print(best_algorithm)
print(mean_recall)
print(mean_precision)

## Plot best results

In [None]:
plot_image(axis_costs, recall_avg_normalized, title="RECALL", ylim = [70., 100.5], 
           xlim = cxlim, models=pipes, name="recall", labels=labels, ylabel="Recall (%)", 
           loc='lower righ', markers="<Do|", pattern="(.*Bigram.*(Bayes|SVM|.*Forest)|None_Baseline)")

In [None]:
plot_image(axis_costs, precision_avg_normalized, title="PRECISION", ylim = [0., 100], 
           xlim = cxlim, colors="rgbmyc", models=pipes, name="precision", labels = labels, ylabel="Precision (%)",
           loc ="lower right", markers="<Do|", pattern="(.*Bigram.*(Bayes|SVM|.*Forest)|None_Baseline)")