In [5]:
# import
import numpy as np
import pandas as pd
import math
import os
import random
import scipy.stats as stats
from scipy.stats import ks_2samp
from scipy.stats import entropy
import copy
import pickle
import itertools
from scipy.optimize import minimize
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score
from datetime import timedelta, datetime
np.set_printoptions(suppress=True)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"                # show multi variables without print

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")
from scipy.stats import gamma
from scipy.optimize import brentq

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Functions
def get_gamma_pdfs(x, params):
    """Density of a gamma mixture evaluated at every point of ``x``.

    Parameters
    ----------
    x : sequence of sample points, for example ``[1, 2, 3, 4]``.
    params : one ``[mixing, shape, scale]`` row per mixture component,
        for example ``[[1, 0.1, 0.2]]``.

    Returns
    -------
    numpy.ndarray of length ``len(x)``: the mixing-weighted sum of the
    component gamma pdfs (all zeros when ``params`` is empty).
    """
    density = np.zeros(len(x))
    for idx in range(len(params)):
        weight, shape, scale = params[idx]
        # Each component contributes its pdf scaled by its mixing weight.
        density = density + weight * stats.gamma.pdf(x, shape, loc=0, scale=scale)
    return density
 
def get_gamma_cdfs(x, params):
    """Cumulative distribution of a gamma mixture at every point of ``x``.

    Parameters
    ----------
    x : sequence of evaluation points.
    params : one ``[mixing, shape, scale]`` row per mixture component.

    Returns
    -------
    numpy.ndarray of length ``len(x)``: the mixing-weighted sum of the
    component gamma cdfs (all zeros when ``params`` is empty).
    """
    cumulative = np.zeros(len(x))
    for idx in range(len(params)):
        weight, shape, scale = params[idx]
        cumulative = cumulative + weight * stats.gamma.cdf(x, shape, loc=0, scale=scale)
    return cumulative

def get_gamma_cdfs_single(x, params):
    """Gamma-mixture cdf for a single evaluation point ``x``.

    Unlike ``get_gamma_cdfs`` the accumulator starts from the scalar ``0``
    rather than an array sized by ``len(x)``, so ``x`` may be a plain number
    (this is the form the root finder in ``get_gamma_ppfs`` calls).

    Parameters
    ----------
    x : evaluation point.
    params : one ``[mixing, shape, scale]`` row per mixture component.

    Returns
    -------
    The mixing-weighted sum of the component gamma cdfs at ``x``
    (``0`` when ``params`` is empty).
    """
    def weighted_cdf(idx):
        weight, shape, scale = params[idx]
        return weight * stats.gamma.cdf(x, shape, loc=0, scale=scale)

    # sum() starts from the integer 0, matching a manual accumulator seeded with 0.
    return sum(weighted_cdf(idx) for idx in range(len(params)))

def get_gamma_ppfs(q, params):    # ppf of mixture model
    """Numerically invert the gamma-mixture cdf at each probability in ``q``.

    For every ``p`` the root of ``cdf(x) - p`` is bracketed on ``[0, x_max]``
    (``x_max`` starts at 6 and is doubled until the cdf reaches ``p``) and
    then located with ``brentq``.

    Parameters
    ----------
    q : iterable of probabilities, each within ``[0, 1]``.
    params : one ``[mixing, shape, scale]`` row per mixture component, as
        consumed by ``get_gamma_cdfs_single``.

    Returns
    -------
    list with one quantile per entry of ``q``, in the same order.

    Raises
    ------
    ValueError
        If a probability is NaN or outside ``[0, 1]``, or if the mixture cdf
        never reaches it (e.g. the mixing weights sum to less than ``p`` or
        ``params`` is empty). Previously these cases kept doubling the
        bracket without ever terminating cleanly.
    """
    quantiles = []

    for p in q:
        # Written as `not (lo <= p <= hi)` so that NaN is rejected as well.
        if not 0.0 <= p <= 1.0:
            raise ValueError("probability must lie in [0, 1], got %r" % (p,))

        def cdf_minus_p(x, p=p):
            return get_gamma_cdfs_single(x, params) - p

        # Float bounds so that repeated doubling overflows to inf (detectable)
        # instead of growing an unbounded Python integer.
        x_min, x_max = 0.0, 6.0
        while get_gamma_cdfs_single(x_max, params) < p:
            x_max *= 2
            if not np.isfinite(x_max):
                raise ValueError(
                    "mixture cdf never reaches p=%r; check that the mixing "
                    "weights sum to 1" % (p,)
                )
        quantiles.append(brentq(cdf_minus_p, x_min, x_max))
    return quantiles

In [7]:
def save_fig(x, y, data_hist, tag, path_save):
    """Draw a diagnostic plot and write it to ``path_save``.

    Parameters
    ----------
    x : x-coordinates of the curve(s).
    y : curve values. For ``tag == 'CP'`` it must be 2-D; each column is
        drawn as its own line.
    data_hist : samples for the histogram overlay (used only for ``'PDF'``).
    tag : ``'PDF'`` plots ``y`` against ``x`` on top of a density-normalised
        50-bin histogram of ``data_hist``; ``'CP'`` plots every column of
        ``y`` against ``x`` labelled ``gamma1``, ``gamma2``, ... with a
        legend. Any other value saves the bare axes.
    path_save : destination passed to ``savefig``.

    Notes
    -----
    The x-axis is fixed to ``[0, 100]``. Line colours cycle through a
    five-entry palette, so more than five columns no longer raise
    ``IndexError``.
    """
    font = {'weight': "bold", 'size': 12}
    font_legend = {'size': 10}
    DPI = 300
    nbins = 50
    colour_names = ['b', 'r', 'c', 'k', 'm']

    fig, ax = plt.subplots(figsize=(6, 3.6))
    ax.set_xlim([0, 100])
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    if tag == 'PDF':
        ax.plot(x, y)
        ax.hist(data_hist, bins=nbins, density=True)
    if tag == 'CP':
        for k in range(y.shape[1]):
            ax.plot(
                x,
                y[:, k],
                # Wrap around the palette instead of indexing past its end.
                color=colour_names[k % len(colour_names)],
                linestyle='-',
                label='gamma' + str(k + 1),
            )
        ax.legend(prop=font_legend)

    # Re-apply the automatically chosen ticks, rounded to 3 decimals, so the
    # tick labels can be rendered with the bold font.
    my_x_ticks = np.round(ax.get_xticks(), decimals=3)
    my_y_ticks = np.round(ax.get_yticks(), decimals=3)
    ax.set_xticks(my_x_ticks)
    ax.set_xticklabels(my_x_ticks, fontdict=font)
    ax.set_yticks(my_y_ticks)
    ax.set_yticklabels(my_y_ticks, fontdict=font)

    fig.savefig(path_save, dpi=DPI, bbox_inches='tight')

# two comparing windows

In [8]:
# Drift detection using KS
#
# For every pair of consecutive model IDs this cell
#   1. loads both fitted gamma mixtures,
#   2. draws Nsamples values from each via inverse-cdf sampling (the same
#      uniform draws are fed to both models),
#   3. runs a two-sample KS test on the draws, and
#   4. assigns a drift code in the Dtype column:
#        0       KS p-value above ALPHA
#        1       KS p-value not above ALPHA and the component counts differ
#        '2_…'   same component count; lists the components (1-based, in
#                ascending-mean order) whose conditional probabilities differ
#                according to a paired t-test, e.g. '2_1, 3'
#        3       same component count and no component flagged
# The '2_' prefix keeps a component list such as '1' from being confused with
# the numeric codes 0/1/3 once everything is written to the CSV.
Path_model = '../Sec3 Model fitting/out3Years2/long run/'
path_out = 'OfflineDrift/'
# IDs = ['2013', '2014', '2015']
IDs = [1,2,3]
Nsamples = 200
ALPHA = 0.05    # significance level for the KS test and the per-component t-tests
SEED = 0        # fixed seed so the sampled KS decision is reproducible


def _load_sorted_params(model_id):
    """Read ``<Path_model>/<model_id>/W1.csv``; return ``(model, parameters)``.

    ``model`` is the DataFrame with an added ``mean = shape * scale`` column.
    ``parameters`` holds the first three columns of the rows sorted by that
    mean; the rest of this cell treats them as (mixing, shape, scale) --
    assumes the CSV stores its columns in that order.
    """
    model = pd.read_csv(Path_model + str(model_id) + '/W1.csv')
    model['mean'] = model['shape'] * model['scale']
    parameters = model.sort_values(by=['mean'], ascending=True).values[:, 0:3]
    return model, parameters


def _format_pvalue(p_value):
    """Round a p-value to two decimals, keeping a scientific-notation exponent."""
    text = str(p_value)
    if 'e' in text:
        mantissa, _sep, exponent = text.partition('e')
        return str(np.round(float(mantissa), decimals=2)) + 'e' + exponent
    return str(np.round(p_value, decimals=2))


# Private generator: reproducible draws without touching NumPy's global RNG state.
rng = np.random.RandomState(SEED)

out = []
for ID1, ID2 in zip(IDs[0:-1], IDs[1:]):
    model1, parameters1 = _load_sorted_params(ID1)
    model2, parameters2 = _load_sorted_params(ID2)
    K1, K2 = len(parameters1), len(parameters2)

    # sampling data from mixture models
    uniform_samples = rng.uniform(0, 1, Nsamples)
    samples1 = get_gamma_ppfs(uniform_samples, parameters1)
    samples2 = get_gamma_ppfs(uniform_samples, parameters2)

    # quantiles on an evenly spaced probability grid (endpoints 0 and 1 dropped)
    t2 = np.linspace(0, 1, Nsamples)[1:-1]
    samples1_new = get_gamma_ppfs(t2, parameters1)
    samples2_new = get_gamma_ppfs(t2, parameters2)

    # drift detection (computed once; the raw p-value drives the decision,
    # the formatted strings are only for the report)
    ks_statistic, ks_pvalue = stats.ks_2samp(samples1, samples2)
    test_statistic = str(np.round(ks_statistic, decimals=2))
    p_KS = _format_pvalue(ks_pvalue)

    # drift rationalization
    if ks_pvalue > ALPHA:
        Dtype = 0
    elif K1 != K2:
        Dtype = 1
    else:
        weights1 = parameters1[:, 0].reshape(-1, 1)
        weights2 = parameters2[:, 0].reshape(-1, 1)
        # Component densities, one row per component, at each model's own quantiles.
        ft1 = np.array([
            gamma.pdf(samples1_new, parameters1[k][1], loc=0, scale=parameters1[k][2])
            for k in range(K1)
        ])
        ft2 = np.array([
            gamma.pdf(samples2_new, parameters2[k][1], loc=0, scale=parameters2[k][2])
            for k in range(K1)
        ])
        mixft1 = np.sum(ft1 * weights1, axis=0)
        mixft2 = np.sum(ft2 * weights2, axis=0)

        # Conditional probability of each component given the point:
        # weight_k * f_k(x) / mixture density, shape (K, len(t2)).
        Cprob1 = ft1 * weights1 / mixft1
        Cprob2 = ft2 * weights2 / mixft2

        # Paired t-test per component (KS and Wilcoxon were tried as alternatives).
        changed_components = [
            k for k in range(K1)
            if stats.ttest_rel(Cprob1[k], Cprob2[k])[1] < ALPHA
        ]
        if changed_components:
            Dtype = '2_' + ', '.join(str(k + 1) for k in changed_components)
        else:
            Dtype = 3

        # Column-per-component layout, the shape save_fig(..., 'CP', ...) expects.
        Cprob1 = Cprob1.T
        Cprob2 = Cprob2.T

    out.append([ID1, ID2, test_statistic, p_KS, Dtype])

# Build the frame straight from the rows: keeps the ID columns numeric and
# also works when there are no ID pairs.
df = pd.DataFrame(out, columns=['ID1', 'ID2', 'statis', 'pvalue', 'Dtype'])
os.makedirs(path_out, exist_ok=True)    # to_csv does not create missing directories
df.to_csv(path_out + 'RC_3years.csv', index=False)
df

FileNotFoundError: [Errno 2] No such file or directory: '../Sec3 Model fitting/out3Years2/long run/2.csv'