In [None]:
import importlib
import pickle
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib qt4
import seaborn as sns

## Explore the number of NaNs

In [None]:
# Explore missing values (NaNs) in one year of the data set.
#
# Reads the CSV, optionally restricts it to a single class, prints summary
# NaN statistics and saves two histograms as EPS files:
#   * percentage of missing values per feature (four features are left out
#     of the histogram and printed separately instead)
#   * number of missing values per row (labelled "company" on the plot)
# `data`, `na_data` and `num_nans` stay defined for the cells that follow.
with sns.plotting_context('paper', font_scale=2): #, rc={"figure.figsize": [6.4,4.4]}):
    file_path = './data/Dane/1year.csv'
    class_in_focus = 1  # -1 disables the class filter below
    # Features with most nans: reported on their own and excluded from the
    # per-feature histogram. Defined once so both uses stay in sync.
    high_nan_features = ['attr11', 'attr21', 'attr27', 'attr37']

    data = pd.read_csv(file_path, na_values='?')  # '?' is parsed as NaN
    if class_in_focus != -1:
        data = data.loc[data['class'] == class_in_focus,:]
    data = data.drop(['class'],axis=1)
    na_data = pd.isnull(data)
    num_nans = na_data.sum(axis=1)  # NaN count per row
    print('In total we have {} rows and {} features'.format(*data.shape))
    nan_share_tot = num_nans.sum()/float((data.shape[0]*data.shape[1]))
    print('Total share of nans is {}'.format(nan_share_tot))
    print((na_data.sum(axis=0)/len(na_data)).loc[high_nan_features])

    # Figure 1: percentage of missing values per remaining feature.
    plt.figure(1)
    plt.hist(na_data.drop(high_nan_features, axis=1).sum(axis=0)/len(na_data)*100, range=(0,6), bins=10)
    plt.title('Missing values per feature (class {})'.format(class_in_focus))
    plt.xlabel('% missing values')
    plt.ylabel('Features')
    savepath = 'hist_nan_features_class{}.eps'.format(class_in_focus)
    plt.savefig(savepath, format='eps', bbox_inches = 'tight')

    # Figure 2: number of missing values per row.
    plt.figure(2)
    # Use explicit integer bin edges 0, 1, ..., max+1 so every count gets its
    # own unit-wide bin. Passing `bins=num_nans.max()` merged the two largest
    # counts into one bin, produced non-integer edges whenever the smallest
    # count was above 0, and failed when the maximum was 0 or no rows remained.
    max_nans = int(num_nans.max()) if len(num_nans) else 0
    plt.hist(num_nans, bins=list(range(max_nans + 2)))
    plt.xlim((0,10))
    plt.title('Missing values per company (class {})'.format(class_in_focus))
    plt.xlabel('Number of missing values')
    plt.ylabel('Companies')
    savepath = 'hist_nan_companies_class{}.eps'.format(class_in_focus)
    plt.savefig(savepath, format='eps', bbox_inches = 'tight')
    plt.show()

In [None]:
# What happens if we throw away the two features with the highest NaN frequency?
# Relies on `na_data` and `data` produced by the NaN exploration cell.
dropped_features = ['attr37', 'attr21']

na_data_dropped_some = na_data.drop(dropped_features, axis=1)
num_nan_dropped = na_data_dropped_some.sum(axis=1)
# A row counts as soon as any of its remaining features is missing.
share_with_nan = na_data_dropped_some.any(axis=1).mean()
print('Then we get {} share with at least one nan.'.format(share_with_nan))

data_dropped = data.drop(dropped_features, axis=1)
# `.size` is rows * columns, i.e. the total number of cells.
nan_share_dropped = num_nan_dropped.sum() / float(data_dropped.size)
print('The total share of nans is then {}'.format(nan_share_dropped))

## Explore saved data

In [None]:
# Load the pickled results into `results_RF`.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# result files produced by a trusted run.
data_path = 'output/s_20170518-RF/results_year1.pkl'

with open(data_path, 'rb') as results_file:
    results_RF = pickle.load(results_file)

In [None]:
# Plot the ROC curve(s) stored in the loaded results and print the AUC.
#
# The load cell stores the unpickled object in `results_RF`, while this cell
# reads `results`. Bind the name explicitly so the cell also works on a fresh
# kernel (Restart & Run All) instead of raising NameError.
results = results_RF

with sns.plotting_context('paper', font_scale=1): #, rc={"figure.figsize": [6.4,4.4]}):
    plt.title('ROC curve')
    plt.plot((0, 1), (0, 1), ls='--', c='k')  # dashed diagonal for reference
    roc = results['roc_curve']
    if isinstance(roc['fpr'], list):
        # A CV run with multiple arrays: one curve per (fpr, tpr) pair
        for fpr, tpr in zip(roc['fpr'], roc['tpr']):
            plt.plot(fpr, tpr)
    else:
        # Not a CV run: a single pair of arrays
        plt.plot(roc['fpr'], roc['tpr'])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    savepath = 'roc_curve.eps'
    plt.savefig(savepath, format='eps', bbox_inches = 'tight')
    plt.show()
print('AUC value: {}'.format(results['roc_auc']))

In [None]:
# Overlay the ROC curves of the three models for entry `i` of each result set.
# NOTE(review): results_GB and results_NN are not loaded in the visible part
# of this notebook — confirm they are defined before this cell runs.
i = 7
with sns.plotting_context('paper', font_scale=1): #, rc={"figure.figsize": [6.4,4.4]}):
    plt.title('ROC curve')
    plt.plot((0, 1), (0, 1), ls='--', c='k')
    # Same plotting order as before, so each model keeps its line colour.
    for model_results, model_label in (
        (results_GB, 'Gradient boosting algorithm'),
        (results_RF, 'Random forest algorithm'),
        (results_NN, 'Multilayer perceptron'),
    ):
        model_roc = model_results['roc_curve']
        plt.plot(model_roc['fpr'][i], model_roc['tpr'][i], label=model_label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    savepath = 'roc_curve.eps'
    # The save call is commented out in this cell; the figure is only shown.
    #plt.savefig(savepath, format='eps', bbox_inches = 'tight')
    plt.show()

In [None]:
i = 7
%matplotlib qt4
with sns.plotting_context('paper', font_scale=1): #, rc={"figure.figsize": [6.4,4.4]}):
    plt.title('ROC curve with probability threshold')
    plt.plot((0, 1), (0, 1), ls='--', c='k')
    plt.plot(results_NN['roc_curve']['fpr'][i], results_NN['roc_curve']['tpr'][i], label='ROC curve MLP')
    plt.plot(results_NN['roc_curve']['fpr'][i], results_NN['roc_curve']['thresholds'][i], label='Probability threshold MLP')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate & probability threshold')
    plt.legend(loc='center right')
    savepath = 'roc_curve.eps'
    plt.savefig(savepath, format='eps', bbox_inches = 'tight')
    plt.show()