In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import collections

Dataset import
============

*Iris*
---------------------------

In [None]:
def import_iris_dataset():
    """Load the Iris dataset that ships with scikit-learn.

    Returns:
        tuple: ``(data, target)`` taken from the ``data`` and ``target``
        attributes of the object returned by ``datasets.load_iris()``.
    """
    iris = datasets.load_iris()
    return iris.data, iris.target

*Glass*
---------------------------

In [None]:
def import_glass_dataset(path='Datasets/Glass/glass.data'):
    """Load the Glass dataset from a comma-separated, all-numeric file.

    The last column of every row is used as the class label; all preceding
    columns are returned as features.

    Args:
        path: Location of the data file. Defaults to the location used by
            this notebook.

    Returns:
        tuple: ``(data, target)`` where ``data`` is a 2-D array holding every
        column except the last and ``target`` is a 1-D array holding the
        last column.
    """
    # ndmin=2 keeps a single-row file two-dimensional, so the column slicing
    # below works for any number of rows.
    raw = np.loadtxt(path, delimiter=',', ndmin=2)
    # NOTE(review): every leading column is kept as a feature; if the file
    # still carries a row-id column it will be fed to the classifier - verify.
    data = raw[:, :-1]
    target = raw[:, -1].copy()
    return data, target

*Diabetes*
---------------------------

In [None]:
def import_diabetes_dataset(path='Datasets/PrimaIndiansDiabetes/pima-indians-diabetes.csv'):
    """Load the diabetes dataset from a comma-separated, all-numeric file.

    The last column of every row is used as the class label; all preceding
    columns are returned as features.

    Args:
        path: Location of the data file. Defaults to the location used by
            this notebook.

    Returns:
        tuple: ``(data, target)`` where ``data`` is a 2-D array holding every
        column except the last and ``target`` is a 1-D array holding the
        last column.
    """
    # ndmin=2 keeps a single-row file two-dimensional, so the column slicing
    # below works for any number of rows.
    raw = np.loadtxt(path, delimiter=',', ndmin=2)
    data = raw[:, :-1]
    target = raw[:, -1].copy()
    return data, target

*Wine*
---------------------------

In [None]:
def import_wine_dataset():
    """Load the Wine dataset that ships with scikit-learn.

    Returns:
        tuple: ``(data, target)`` taken from the ``data`` and ``target``
        attributes of the object returned by ``datasets.load_wine()``.
    """
    wine = datasets.load_wine()
    return wine.data, wine.target

*User Knowledge*
---------------------------

In [None]:
def import_knowledge_dataset(path='Datasets/user-knowledge.csv'):
    """Load the User Knowledge dataset from a comma-separated, all-numeric file.

    The last column of every row is used as the class label; all preceding
    columns are returned as features.

    Args:
        path: Location of the data file. Defaults to the location used by
            this notebook.

    Returns:
        tuple: ``(data, target)`` where ``data`` is a 2-D array holding every
        column except the last and ``target`` is a 1-D array holding the
        last column.
    """
    # ndmin=2 keeps a single-row file two-dimensional, so the column slicing
    # below works for any number of rows.
    raw = np.loadtxt(path, delimiter=',', ndmin=2)
    data = raw[:, :-1]
    target = raw[:, -1].copy()
    return data, target

Discretization
============

In [None]:
def mean_of_bin(min_value, max_value, cutoffs):
    """Return the midpoint of every bin delimited by ``cutoffs``.

    The bins are ``[min_value, cutoffs[0]]``, ``[cutoffs[0], cutoffs[1]]``,
    ..., ``[cutoffs[-1], max_value]``. With no cutoffs there is a single bin
    spanning ``min_value`` to ``max_value``.

    Args:
        min_value: Lower edge of the first bin.
        max_value: Upper edge of the last bin.
        cutoffs: Sequence of interior bin boundaries (may be empty).

    Returns:
        list: One midpoint per bin, ``len(cutoffs) + 1`` entries in total.
    """
    # Treating the outer limits as ordinary edges removes the special cases
    # for the first/last bin and makes an empty ``cutoffs`` work.
    edges = [min_value] + list(cutoffs) + [max_value]
    return [(lower + upper) / 2 for lower, upper in zip(edges, edges[1:])]

In [None]:
def adjust_value(data, means):
    """Replace every bin index stored in ``data`` by that bin's value.

    ``data`` is modified in place and also returned.

    Args:
        data: Mutable sequence of bin indices (converted with ``int``).
        means: Sequence indexed by bin number giving the replacement value.

    Returns:
        The same ``data`` object, now holding values from ``means``.
    """
    for position in range(len(data)):
        bin_index = int(data[position])
        data[position] = means[bin_index]
    return data

*Discretize by Frequency*
---------------------------

In [None]:
def freqenty_discretize(data, bins):
    """Assign each value of ``data`` to one of ``bins`` equal-frequency bins.

    The sorted values are split into ``bins`` chunks with ``np.array_split``;
    the largest value of every chunk except the last becomes a cutoff.

    Args:
        data: One-dimensional sequence of numbers.
        bins: Number of chunks to split the sorted data into.

    Returns:
        tuple: ``(discrete, cutoffs, min_value, max_value)`` where
        ``discrete`` holds the bin index of every input value (a value equal
        to a cutoff falls into the lower bin) and ``cutoffs`` is the list of
        interior boundaries.
    """
    lowest = np.min(data)
    highest = np.max(data)

    chunks = np.array_split(np.sort(data), bins)
    # The upper boundary of the final chunk is the overall maximum, not a cutoff.
    cutoffs = [chunk[-1] for chunk in chunks][:-1]

    bin_index = np.digitize(data, cutoffs, right=True)
    return bin_index, cutoffs, lowest, highest

In [None]:
def freqenty_discretization(data, bins_N, color, plot):
    """Discretize every feature column into equal-frequency bins.

    Each column of ``data`` is binned with ``freqenty_discretize`` and every
    value is replaced by the midpoint of its bin (``mean_of_bin``).

    Args:
        data: Two-dimensional collection of samples (rows) by features
            (columns).
        bins_N: Number of bins per feature.
        color: Bar colour forwarded to ``plot_bar_diagram``.
        plot: When truthy, draw one bar diagram of bin populations per
            feature.

    Returns:
        numpy.ndarray: Float array with one column per feature, holding the
        bin midpoint assigned to every sample.

    Raises:
        ValueError: If ``data`` yields no feature columns.
    """
    columns = []
    for column in zip(*data):
        bin_index, cutoffs, min_value, max_value = freqenty_discretize(column, bins_N)
        means = mean_of_bin(min_value, max_value, cutoffs)
        column_values = adjust_value(np.array(bin_index, dtype=float), means)
        columns.append(column_values)

        # The bin populations are only needed for the diagram.
        if plot:
            populations = sorted(collections.Counter(column_values).items())
            labels, values = zip(*populations)
            plot_bar_diagram(labels, values, color)

    if not columns:
        raise ValueError('data must contain at least one sample and one feature')
    # Stack once at the end instead of re-allocating the result per feature.
    return np.column_stack(columns)

*Discretize by Equal Width*
---------------------------

In [None]:
def equal_discretize(data, bins):
    """Assign each value of ``data`` to one of ``bins`` equal-width bins.

    The range ``[min(data), max(data)]`` is divided into ``bins`` intervals
    of identical width; the ``bins - 1`` interior boundaries are the cutoffs.

    Args:
        data: One-dimensional sequence of numbers.
        bins: Number of bins; must be at least 1.

    Returns:
        tuple: ``(discrete, cutoffs, min_value, max_value)`` where
        ``discrete`` holds the bin index of every input value (a value equal
        to a cutoff falls into the lower bin) and ``cutoffs`` is the list of
        interior boundaries.

    Raises:
        ValueError: If ``bins`` is smaller than 1.
    """
    if bins < 1:
        raise ValueError('bins must be a positive integer')

    # Compute the extremes once; they are reused for the width, every cutoff
    # and the return value.
    min_value = np.min(data)
    max_value = np.max(data)

    width = (max_value - min_value) / bins
    cutoffs = [min_value + step * width for step in range(1, bins)]
    discrete = np.digitize(data, cutoffs, right=True)
    return discrete, cutoffs, min_value, max_value

In [None]:
def equal_discretization(data, bins_N, color, plot):
    """Discretize every feature column into equal-width bins.

    Each column of ``data`` is binned with ``equal_discretize`` and every
    value is replaced by the midpoint of its bin (``mean_of_bin``).

    Args:
        data: Two-dimensional collection of samples (rows) by features
            (columns).
        bins_N: Number of bins per feature.
        color: Bar colour forwarded to ``plot_bar_diagram``.
        plot: When truthy, draw one bar diagram of bin populations per
            feature.

    Returns:
        numpy.ndarray: Float array with one column per feature, holding the
        bin midpoint assigned to every sample.

    Raises:
        ValueError: If ``data`` yields no feature columns.
    """
    columns = []
    for column in zip(*data):
        bin_index, cutoffs, min_value, max_value = equal_discretize(column, bins_N)
        means = mean_of_bin(min_value, max_value, cutoffs)
        column_values = adjust_value(np.array(bin_index, dtype=float), means)
        columns.append(column_values)

        # The bin populations are only needed for the diagram.
        if plot:
            populations = sorted(collections.Counter(column_values).items())
            labels, values = zip(*populations)
            plot_bar_diagram(labels, values, color)

    if not columns:
        raise ValueError('data must contain at least one sample and one feature')
    # Stack once at the end instead of re-allocating the result per feature.
    return np.column_stack(columns)

*Discretize using Histogram*
---------------------------

In [None]:
def histogram_discretize(data):
    """Assign each value of ``data`` to a bin chosen by ``np.histogram``.

    The bin edges come from ``np.histogram(data, bins='auto')``. Every edge
    except the first is used as an upper bin boundary for ``np.digitize``.

    Args:
        data: One-dimensional sequence of numbers.

    Returns:
        tuple: ``(discrete, edges, min_value, max_value)`` where ``discrete``
        holds the bin index of every input value and ``edges`` is the
        complete edge array from ``np.histogram`` (first edge included).
    """
    lowest = np.min(data)
    highest = np.max(data)

    edges = np.histogram(data, bins='auto')[1]
    # Drop the left-most edge: with right=True each remaining edge closes a bin.
    upper_edges = edges[1:]
    bin_index = np.digitize(data, upper_edges, right=True)
    return bin_index, edges, lowest, highest

In [None]:
def histogram_discretization(data, color, plot):
    """Discretize every feature column using automatically chosen bins.

    Each column of ``data`` is binned with ``histogram_discretize`` and every
    value is replaced by the midpoint of the bin it falls into.

    Args:
        data: Two-dimensional collection of samples (rows) by features
            (columns).
        color: Bar colour forwarded to ``plot_bar_diagram``.
        plot: When truthy, draw one bar diagram of bin populations per
            feature.

    Returns:
        numpy.ndarray: Float array with one column per feature, holding the
        bin midpoint assigned to every sample.

    Raises:
        ValueError: If ``data`` yields no feature columns.
    """
    columns = []
    for column in zip(*data):
        bin_index, edges, _, _ = histogram_discretize(column)

        # ``edges`` holds *all* bin edges, outer ones included, so bin k spans
        # edges[k]..edges[k + 1]. Feeding them to mean_of_bin as interior
        # cutoffs would shift every midpoint down by one bin.
        edges = np.asarray(edges, dtype=float)
        means = (edges[:-1] + edges[1:]) / 2
        column_values = adjust_value(np.array(bin_index, dtype=float), means)
        columns.append(column_values)

        # The bin populations are only needed for the diagram.
        if plot:
            populations = sorted(collections.Counter(column_values).items())
            labels, values = zip(*populations)
            plot_bar_diagram(labels, values, color)

    if not columns:
        raise ValueError('data must contain at least one sample and one feature')
    # Stack once at the end instead of re-allocating the result per feature.
    return np.column_stack(columns)

*Discretization Histogram Diagram*
---------------------------

In [None]:
def plot_bar_diagram(labels, values, color):
    """Draw a bar diagram with one bar per bin.

    Args:
        labels: Numeric bin values; shown as tick labels rounded to two
            decimals.
        values: Bar heights, one per label.
        color: Colour passed to ``ax.bar``.
    """
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.set_xlabel('Bin value')
    ax.set_ylabel('Number of elements')

    # One evenly spaced slot per bin; the real bin values go on the ticks.
    positions = np.arange(len(labels))
    ax.bar(positions, values, 0.9, color=color)
    ax.set_xticks(positions)
    ax.set_xticklabels([round(label, 2) for label in labels])
    plt.show()

In [None]:
# Number of bins used by the equal-width and equal-frequency discretizers.
bins_N = 8
data, target = import_wine_dataset()
# Alternative dataset - swap in to repeat the comparison on Glass:
# data, target = import_glass_dataset()


# Draw the per-feature bin diagrams (plot=True) for each discretization method.
equal_discretization(data, bins_N, 'IndianRed', True)
freqenty_discretization(data, bins_N, 'SkyBlue', True)
# The histogram-based method chooses its own bin count, so bins_N is not passed.
histogram_discretization(data, 'mediumseagreen', True)

Classifiers (Gaussian and Multinomial)
---------------

In [None]:
def gauss_classificator(X_train, X_test, Y_train, Y_test):
    """Fit a Gaussian naive Bayes model and report its test performance.

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: True labels for ``X_test``.

    Returns:
        tuple: ``(clf, Y_pred)`` - the fitted ``GaussianNB`` instance and its
        predictions for ``X_test``. The classification report is printed.
    """
    model = GaussianNB()
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    report = classification_report(Y_test, predictions)
    print(report)
    return model, predictions

def multinomial_classificator(X_train, X_test, Y_train, Y_test):
    """Fit a multinomial naive Bayes model and report its test performance.

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: True labels for ``X_test``.

    Returns:
        tuple: ``(clf, Y_pred)`` - the fitted ``MultinomialNB`` instance and
        its predictions for ``X_test``. The classification report is printed.
    """
    model = MultinomialNB()
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    report = classification_report(Y_test, predictions)
    print(report)
    return model, predictions

Confusion Matrix Generator
---------------

In [None]:
def confusion_matrix_generator(Y_test, Y_pred):
    """Plot the confusion matrix of ``Y_pred`` against ``Y_test``.

    The matrix is drawn with ``plt.matshow`` and every cell is annotated with
    its value.

    Args:
        Y_test: True labels.
        Y_pred: Predicted labels.
    """
    cm = metrics.confusion_matrix(Y_test, Y_pred)
    img = plt.matshow(cm, cmap=plt.cm.autumn)
    plt.colorbar(img, fraction=0.045)
    # matshow draws cm[row, col] at x=col, y=row, so the text coordinates are
    # (col, row); using (row, col) would annotate the transposed matrix.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, "%0.2f" % cm[row, col],
                     size=12, color='black', ha="center", va="center")
    plt.show()

K-Fold vs Stratified K-Fold Diagram
==================

In [None]:
folds_number = [2, 5, 10, 20, 30, 50]
f1_kfold_score = []
f1_stratified_kfold_score = []

# NOTE(review): cross_validation_test and return_f1_macro_mean_score are
# defined in a cell further down the notebook; that cell has to be executed
# before this one.
# NOTE(review): the last argument (True) makes cross_validation_test use
# MultinomialNB - confirm this is the classifier meant to be compared.
for n_splits in folds_number:
    runs = (
        (KFold(n_splits=n_splits), f1_kfold_score),
        (StratifiedKFold(n_splits=n_splits), f1_stratified_kfold_score),
    )
    for splitter, collected in runs:
        scores = cross_validation_test(data, target, splitter, True)
        f1_mean = return_f1_macro_mean_score(scores)
        collected.append(f1_mean)
        print(f1_mean)
    print("")


In [None]:
# Grouped bar chart: one pair of bars (K-Fold, Stratified K-Fold) per fold count.
plt.style.use('ggplot')

fig, ax = plt.subplots(figsize=(20, 10))

ax.set_xlabel('Number of folds')
ax.set_ylabel('F1-Score')
# NOTE(review): the scores plotted here are produced with
# cross_validation_test(..., True), which selects MultinomialNB - confirm the
# classifier named in the title.
ax.set_title('K-Fold vs Stratified K-Fold for Wine Dataset (Gaussian Classifier)')

y1 = f1_kfold_score
y2 = f1_stratified_kfold_score

# bar graphs
x = np.arange(len(folds_number))

width = 0.25
ax.bar(x, y1, width, label='K-Fold')
ax.bar(x + width, y2, width,
        color=list(plt.rcParams['axes.prop_cycle'])[2]['color'], label='Stratified K-Fold')
# Centre each tick between the two bars of its group (bars are centred at
# x and x + width).
ax.set_xticks(x + width / 2)
ax.set_xticklabels(folds_number)
ax.legend()

plt.show()

In [None]:
def cross_validation_test(data, target, cv, clf):
    """Cross-validate a naive Bayes classifier on ``data``/``target``.

    Args:
        data: Feature matrix passed to ``cross_validate``.
        target: Class labels passed to ``cross_validate``.
        cv: Cross-validation splitter to use (e.g. ``KFold`` or
            ``StratifiedKFold``). ``None`` falls back to a 2-split
            ``ShuffleSplit`` with ``test_size=0.3`` and ``random_state=0``.
        clf: ``True`` selects ``MultinomialNB``; any other value selects
            ``GaussianNB``.

    Returns:
        The result of ``cross_validate`` computed with the accuracy,
        macro-precision, macro-recall and macro-F1 scorers (train scores
        excluded).
    """
    estimator = MultinomialNB() if clf == True else GaussianNB()  # noqa: E712
    # Honour the splitter supplied by the caller; overwriting it would make
    # every strategy being compared yield the same scores.
    if cv is None:
        cv = ShuffleSplit(n_splits=2, test_size=0.3, random_state=0)
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    return cross_validate(estimator, data, target,
                          cv=cv, scoring=scoring, return_train_score=False)
    
def print_cross_validation_test(scores):
    """Print the mean of every score series, one per line, with 3 decimals.

    Args:
        scores: Mapping from score name to a sequence of per-split values.
    """
    for values in scores.values():
        mean_value = np.mean(values)
        print('{:.3f}'.format(mean_value))

def return_f1_macro_mean_score(scores):
    """Return the mean of the ``'test_f1_macro'`` entry of ``scores``.

    Args:
        scores: Mapping from score name to a sequence of per-split values.

    Returns:
        The mean macro-F1 score, or ``None`` when the key is absent.
    """
    if 'test_f1_macro' not in scores:
        return None
    return np.mean(scores['test_f1_macro'])

Wine - F1-Score, KFold & Discretization ways
==================