# Plotting the ROC curve for Zahra's model
### March 11, 2019

In [3]:
import sys
import os

import matplotlib.pyplot as plt
import numpy as np
import glob
import pickle
import time

import pandas as pd

In [4]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [5]:
%matplotlib widget

In [23]:
# Make numpy array of actual data and predictions
def f_extract_zahra_data(fname='zahra_cnn_3d_preds.csv'):
    '''
    Extract data from Zahra's predictions to plot the roc curve.

    Parameters
    ----------
    fname : path of the CSV file holding the predictions (it is handed
        straight to pandas.read_csv). The file must provide the columns
        `event_id`, `pred` and `weight`. Defaults to the original
        hard-coded file name, resolved against the working directory.

    Returns
    -------
    ydata  : float64 array, 1.0 for rows whose event_id equals 'nugen'
             and 0.0 for every other row
    y_pred : values of the `pred` column
    wts    : values of the `weight` column
    '''

    df1=pd.read_csv(fname)

    ### Labels: nugen -> signal (y=1), anything else (corsika) -> background (y=0).
    ### A single vectorised comparison replaces the per-row Python loop.
    ydata=(df1.event_id=='nugen').values.astype(np.float64)

    ### Get predictions and weights from dataframe
    y_pred,wts=df1.pred.values,df1.weight.values

    return ydata,y_pred,wts


def f_plot_roc_curve(fpr,tpr,label=''):
    '''
    Plot one ROC curve with pyplot and print its AUC.

    fpr   : false positive rates (x values of the curve)
    tpr   : true positive rates (y values of the curve)
    label : legend label attached to the plotted line

    The x axis is log-scaled and limited to [1e-7, 1]; the y axis is
    limited to [0, 1]. plt.show() is called before the AUC is printed.
    '''

    plt.plot(fpr,tpr,label=label)

    # Log x axis so the very low false-positive-rate region is visible
    plt.xscale('log')
    plt.xlim(10**-7,1.0)
    plt.ylim(0,1.0)
    plt.show()

    # Area under the (fpr, tpr) curve
    print("AUC: ",auc(fpr,tpr))


In [25]:
# param_arr=[14789,14789,503401,2116881,3499441,]
# tpr_arr=[]

## Plot my models after testing on full data

In [26]:
# Directory holding the saved per-model result files; f_model_plot builds its
# prediction / label / weight file paths by prefixing this string, so the
# trailing '/' is required.
model_save_dir='/global/project/projectdirs/dasrepo/vpa/ice_cube/data_for_cnn/results_data/final_2models_test_on_full_dataset/'

# Model identifier (kept as a string). NOTE(review): the comparison loop
# further down rebinds model_name, so this value looks like a leftover default.
model_name='4'

In [29]:
def f_model_plot(model_name,label):
    '''
    Load the saved test results of one model and plot its weighted ROC curve.

    model_name : identifier used in the saved file names
    label      : legend label forwarded to f_plot_roc_curve

    Reads three text files from model_save_dir (predictions, true labels,
    sample weights) with np.loadtxt. Only the predictions file is checked
    for existence up front.
    '''
    tag=str(model_name)
    pred_path=model_save_dir+'y-predict_model-'+tag+'.pred'
    truth_path=model_save_dir+'y-test_model-'+tag+'.test'
    weights_path=model_save_dir+'wts-test_model-'+tag+'.test'

    assert os.path.exists(pred_path),"y-predictions not saved"
    scores=np.loadtxt(pred_path)
    truth=np.loadtxt(truth_path)
    weights=np.loadtxt(weights_path)

    # Predictions saved as a 2-D array: keep only the column at index 1
    if scores.ndim==2:
        scores=scores[:,1]

    # The thresholds returned by roc_curve are not needed here
    fpr,tpr,_=roc_curve(truth,scores,sample_weight=weights)
    f_plot_roc_curve(fpr,tpr,label)
    
    


In [34]:
plt.figure()

# Legend label for each curve, keyed by model identifier
label_dict={'old_paper':'resnet_paper','4':'layered CNN','7':'resnet 18'}

## Reference curve: Zahra's old predictions
ydata,y_pred,wts=f_extract_zahra_data()
fpr,tpr,threshold=roc_curve(ydata,y_pred,sample_weight=wts)
f_plot_roc_curve(fpr,tpr,label=label_dict['old_paper'])

## Curves for my models, tested on the full dataset
for model_name in ('4','7'):
    f_model_plot(model_name,label_dict[model_name])

plt.legend(loc='best')
plt.savefig('full_test_data_roc_curve_comparison.pdf')

FigureCanvasNbAgg()

AUC:  0.954750151921803
(737715,) (737715,)
AUC:  0.9601436584457085
(737715,) (737715,)
AUC:  0.9349805347726424
