In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from collections import defaultdict
import os
import pickle
import scipy.stats as st

from itertools import combinations
from scipy.stats import chi2

from code.DataGeneration_class import DataGenerationBernoulli
from code.DataGeneration_class import DataGenerationPoisson

# DATA GENERATION

## Bernoulli

In [2]:
# BERNOULLI GENERATION
# Simulate 100 Bernoulli data sets and store each one as a long-format CSV with
# the response `y`, the covariate `x1` and a 1-based group label `g`.
# NOTE(review): no seed is set in this cell, so re-running it may overwrite the
# files with different draws -- confirm how DataGenerationBernoulli draws its
# random numbers before relying on reproducibility.

bernoulli_dg_dir = 'output/comparison_state_of_art/Bernoulli_DG_output/'
# Create the target folder up front so a fresh checkout does not fail on the first write.
os.makedirs(bernoulli_dg_dir, exist_ok=True)

for w in range(0, 100):

    # Positional arguments are presumably (ran_var, ran_int, n_fix), mirroring the
    # settings used when fitting -- TODO confirm against the class signature.
    DG = DataGenerationBernoulli(False, True, 1, sim=True)
    DG.set_parameters()
    eta, y = DG.compute_eta_and_y()

    x1 = DG.fix[0]

    # `y` is iterated group by group: every observation of the k-th entry gets label k (starting at 1).
    # This must run before `y` is flattened below.
    g = np.hstack([np.repeat(label, len(obs)) for label, obs in enumerate(y, start=1)]).squeeze()

    y = np.hstack(y).squeeze()
    x1 = np.hstack(x1).squeeze()

    data = {'y': y, 'x1': x1, 'g': g}
    df = pd.DataFrame(data)

    filename = 'Bernoulli_' + str(w) + '.csv'

    # The DataFrame index is written as the first CSV column.
    df.to_csv(bernoulli_dg_dir + filename)

## Poisson

In [3]:
# POISSON GENERATION
# Simulate 100 Poisson data sets and store each one as a long-format CSV with
# the response `y`, the covariate `x1` and a 1-based group label `g`.
# NOTE(review): no seed is set in this cell, so re-running it may overwrite the
# files with different draws -- confirm how DataGenerationPoisson draws its
# random numbers before relying on reproducibility.

poisson_dg_dir = 'output/comparison_state_of_art/Poisson_DG_output/'
# Create the target folder up front so a fresh checkout does not fail on the first write.
os.makedirs(poisson_dg_dir, exist_ok=True)

for w in range(0, 100):

    # Positional arguments are presumably (ran_var, ran_int, n_fix), mirroring the
    # settings used when fitting -- TODO confirm against the class signature.
    DG = DataGenerationPoisson(False, True, 1, sim=True)
    DG.set_parameters()
    eta, y = DG.compute_eta_and_y()

    x1 = DG.fix[0]

    # `y` is iterated group by group: every observation of the k-th entry gets label k (starting at 1).
    # This must run before `y` is flattened below.
    g = np.hstack([np.repeat(label, len(obs)) for label, obs in enumerate(y, start=1)]).squeeze()

    y = np.hstack(y).squeeze()
    x1 = np.hstack(x1).squeeze()

    data = {'y': y, 'x1': x1, 'g': g}
    df = pd.DataFrame(data)

    filename = 'Poisson_' + str(w) + '.csv'

    # The DataFrame index is written as the first CSV column.
    df.to_csv(poisson_dg_dir + filename)

# GLMMDRE

In [4]:
from code.algorithm_alpha import algorithm_alpha

def compute_order(knots):
    """Return the indices that arrange `knots` from largest to smallest.

    A 1-D array is ranked directly; for an array with more than one
    dimension only its first column is used as the sort key.
    """
    sort_key = knots[:, 0] if knots.ndim > 1 else knots
    # Ascending argsort read backwards gives the descending order.
    return np.argsort(sort_key)[::-1]

## Bernoulli

In [11]:
# Bernoulli
# For every simulated Bernoulli data set: run `algorithm_alpha` until it returns
# exactly three knots, then record (a) the ordered knots and the fixed-effect
# coefficient and (b) the in-sample sensitivity / specificity / accuracy.

BERNOULLI_DG_DIR = 'output/comparison_state_of_art/Bernoulli_DG_output/'
# Upper bound on fits per file. The original retried forever; a persistent
# failure now ends in a reported skip instead of a hang. Raise it if needed.
MAX_FIT_ATTEMPTS = 200

# Model settings (identical for every file, so defined once).
ran_var = False
ran_int = True
n_fix = 1
tol = 0.05
sim = True

metric_rows = []       # one dict of metrics per successfully fitted file
estimate_frames = []   # one 4-row frame (c1,1 / c1,2 / c1,3 / beta1) per fitted file
skipped_files = []     # files for which no 3-knot fit was obtained

# Sorted so that the row order of the result tables does not depend on the file system.
for file in sorted(os.listdir(BERNOULLI_DG_DIR)):
    if not file.endswith(".csv"):
        continue

    # Loaded once per file, outside the retry loop, so that a broken file raises
    # immediately instead of being retried. The first CSV column is the saved index.
    df = pd.read_csv(BERNOULLI_DG_DIR + file).iloc[:, 1:]

    N = df.g.nunique()
    lengths = np.array(df.groupby('g').count().reset_index().iloc[:, 1])
    y = list(df.groupby('g')['y'].apply(np.array).values)

    fix = defaultdict(list)
    fix[0] = df.groupby('g')['x1'].apply(np.array).values.tolist()

    fitted = False
    for attempt in range(1, MAX_FIT_ATTEMPTS + 1):
        try:
            # NOTE(review): `fix`, `lengths`, `y` and `N` extracted above are NOT
            # passed here (all None, sim=True), exactly as in the original cell.
            # Confirm that `algorithm_alpha` is really meant to ignore the loaded file.
            # Argument order: ran_var, ran_int, n_fix, sim, tol, model, fix, lengths, y, N
            knots, par, W, hess_ran, hess_fix, modeleval = algorithm_alpha(
                ran_var, ran_int, n_fix, sim, tol, model='B',
                fix=None, lengths=None, y=None, N=None, t=None)

            # Put the knots (and the matching columns of W) in descending order.
            order = compute_order(knots)
            knots = knots[order]
            W = W[:, order]

            # Index of the largest weight per group (NaN when a row of W sums to zero).
            # Kept for inspection; the scoring below does not use it.
            group = np.array([np.nan if np.sum(W[i, :]) == 0 else np.argmax(W[i, :]) for i in range(N)])
        except Exception as err:
            # `Exception` (not a bare except) so the kernel can still be interrupted.
            print(f'{file}: fit attempt {attempt} failed with {err!r}; retrying')
            continue

        if len(knots) == 3:
            fitted = True
            break

    if not fitted:
        print(f'{file}: no 3-knot fit after {MAX_FIT_ATTEMPTS} attempts; file skipped')
        skipped_files.append(file)
        continue

    # NOTE(review): the group -> knot assignment is hard-coded (groups 1-2 -> largest knot,
    # 8-10 -> smallest knot, all others -> middle knot) rather than taken from `group`.
    df['ran_int'] = np.select(
        [df['g'].isin([1, 2]), df['g'].isin([8, 9, 10])],
        [knots[0], knots[2]],
        default=knots[1])

    beta1 = par[0]
    temp = beta1 * df['x1'] + df['ran_int']
    # sigmoid(temp) > 0.5 is equivalent to temp > 0; comparing the linear predictor
    # directly avoids the inf/inf = NaN that exp() produces for large values.
    df['y_pred'] = (temp > 0).astype(int)

    estimate_frames.append(pd.DataFrame([
        {'variable': 'c1,1', 'value': knots[0]},
        {'variable': 'c1,2', 'value': knots[1]},
        {'variable': 'c1,3', 'value': knots[2]},
        {'variable': 'beta1', 'value': beta1},
    ], index=[0, 1, 2, 3]))

    # Rows = actual class, columns = predicted class. Reindexing guarantees a full
    # 2x2 table even when one class is never observed or never predicted.
    confusion_matrix = pd.crosstab(df['y'], df['y_pred'], rownames=['Actual'], colnames=['Predicted'])
    confusion_matrix = confusion_matrix.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    TN = confusion_matrix.loc[0, 0]
    TP = confusion_matrix.loc[1, 1]
    FN = confusion_matrix.loc[1, 0]
    FP = confusion_matrix.loc[0, 1]

    # A metric is NaN (with a NumPy warning) when its denominator is zero.
    Sensitivity = TP / (TP + FN)
    Specificity = TN / (TN + FP)
    Accuracy = (TN + TP) / (TN + TP + FN + FP)

    metric_rows.append({'Sensitivity': Sensitivity, 'Specificity': Specificity, 'Accuracy': Accuracy})

# Build the result tables once instead of re-concatenating inside the loop.
results3 = pd.DataFrame(metric_rows, columns=['Sensitivity', 'Specificity', 'Accuracy'])
# `melt` keeps the repeating 0..3 index of the per-file frames, as before.
melt = (pd.concat(estimate_frames) if estimate_frames
        else pd.DataFrame(columns=['variable', 'value']))

print(f'Bernoulli: fitted {len(results3)} data sets, skipped {len(skipped_files)}')

[-8.41526 -8.39208  1.92882 -8.51033  4.35375  4.35373 -8.50752 -8.5135
 -8.47438 -8.49411]
D_triu
[[        nan 2.31804e-02 1.03441e+01 9.50659e-02 1.27690e+01 1.27690e+01
  9.22615e-02 9.82424e-02 5.91151e-02 7.88462e-02]
 [        nan         nan 1.03209e+01 1.18246e-01 1.27458e+01 1.27458e+01
  1.15442e-01 1.21423e-01 8.22955e-02 1.02027e-01]
 [        nan         nan         nan 1.04391e+01 2.42494e+00 2.42491e+00
  1.04363e+01 1.04423e+01 1.04032e+01 1.04229e+01]
 [        nan         nan         nan         nan 1.28641e+01 1.28641e+01
  2.80440e-03 3.17648e-03 3.59508e-02 1.62197e-02]
 [        nan         nan         nan         nan         nan 2.69609e-05
  1.28613e+01 1.28673e+01 1.28281e+01 1.28479e+01]
 [        nan         nan         nan         nan         nan         nan
  1.28612e+01 1.28672e+01 1.28281e+01 1.28478e+01]
 [        nan         nan         nan         nan         nan         nan
          nan 5.98089e-03 3.31464e-02 1.34153e-02]
 [        nan         nan 

In [12]:
# Persist the Bernoulli GLMMDRE metrics (`results3`) and parameter estimates (`melt`) as CSV.
bernoulli_glmmdre_dir = 'output/comparison_state_of_art/Bernoulli_GLMMDRE_output/'
# Make sure the folder exists so the results of the long fitting cell are not lost on write.
os.makedirs(bernoulli_glmmdre_dir, exist_ok=True)

filename = 'results3' + '.csv'
results3.to_csv(bernoulli_glmmdre_dir + filename)

filename = 'melt' + '.csv'
melt.to_csv(bernoulli_glmmdre_dir + filename)

## Poisson

In [14]:
# POISSON
# For every simulated Poisson data set: run `algorithm_alpha` until it returns
# exactly three knots, then record (a) the ordered knots and the fixed-effect
# coefficient and (b) the in-sample MSE, log-scale MSE and chi-squared error.

POISSON_DG_DIR = 'output/comparison_state_of_art/Poisson_DG_output/'
# Upper bound on fits per file. The original retried forever; a persistent
# failure now ends in a reported skip instead of a hang. Raise it if needed.
MAX_FIT_ATTEMPTS = 200

# Model settings (identical for every file, so defined once).
ran_var = False
ran_int = True
n_fix = 1
tol = 0.05
sim = True

metric_rows = []       # one dict of metrics per successfully fitted file
estimate_frames = []   # one 4-row frame (c1,1 / c1,2 / c1,3 / beta1) per fitted file
skipped_files = []     # files for which no 3-knot fit was obtained

# Sorted so that the row order of the result tables does not depend on the file system.
for file in sorted(os.listdir(POISSON_DG_DIR)):
    if not file.endswith(".csv"):
        continue

    # Loaded once per file, outside the retry loop, so that a broken file raises
    # immediately instead of being retried. The first CSV column is the saved index.
    df = pd.read_csv(POISSON_DG_DIR + file).iloc[:, 1:]

    N = df.g.nunique()
    lengths = np.array(df.groupby('g').count().reset_index().iloc[:, 1])
    y = list(df.groupby('g')['y'].apply(np.array).values)

    fix = defaultdict(list)
    fix[0] = df.groupby('g')['x1'].apply(np.array).values.tolist()

    fitted = False
    for attempt in range(1, MAX_FIT_ATTEMPTS + 1):
        try:
            # NOTE(review): `fix`, `lengths`, `y` and `N` extracted above are NOT
            # passed here (all None, sim=True), exactly as in the original cell.
            # Confirm that `algorithm_alpha` is really meant to ignore the loaded file.
            knots, par, W, hess_ran, hess_fix, modeleval = algorithm_alpha(
                ran_var, ran_int, n_fix, sim, tol, model='P',
                fix=None, lengths=None, y=None, N=None, t=None)

            # Put the knots (and the matching columns of W) in descending order.
            order = compute_order(knots)
            knots = knots[order]
            W = W[:, order]

            # Index of the largest weight per group (NaN when a row of W sums to zero).
            # Kept for inspection; the scoring below does not use it.
            group = np.array([np.nan if np.sum(W[i, :]) == 0 else np.argmax(W[i, :]) for i in range(N)])
        except Exception as err:
            # `Exception` (not a bare except) so the kernel can still be interrupted.
            print(f'{file}: fit attempt {attempt} failed with {err!r}; retrying')
            continue

        if len(knots) == 3:
            fitted = True
            break

    if not fitted:
        print(f'{file}: no 3-knot fit after {MAX_FIT_ATTEMPTS} attempts; file skipped')
        skipped_files.append(file)
        continue

    # NOTE(review): the group -> knot assignment is hard-coded (groups 1-2 -> largest knot,
    # 8-10 -> smallest knot, all others -> middle knot) rather than taken from `group`.
    df['ran_int'] = np.select(
        [df['g'].isin([1, 2]), df['g'].isin([8, 9, 10])],
        [knots[0], knots[2]],
        default=knots[1])

    beta1 = par[0]
    temp = beta1 * df['x1'] + df['ran_int']
    # Predicted count = exp(linear predictor) rounded to the nearest integer
    # (ties to even, the same rule as Python's round()).
    df['y_pred'] = np.rint(np.exp(temp)).astype('int64')

    estimate_frames.append(pd.DataFrame([
        {'variable': 'c1,1', 'value': knots[0]},
        {'variable': 'c1,2', 'value': knots[1]},
        {'variable': 'c1,3', 'value': knots[2]},
        {'variable': 'beta1', 'value': beta1},
    ], index=[0, 1, 2, 3]))

    # The +1 offsets keep the logarithm and the chi-squared denominator defined at zero counts.
    MSE = np.mean((df['y'] - df['y_pred'])**2)
    MSE_log = np.mean((np.log(df['y']+1) - np.log(df['y_pred']+1))**2)
    CSE = np.mean((df['y'] - df['y_pred'])**2 / (df['y_pred']+1))

    metric_rows.append({'MSE': MSE, 'MSE_log': MSE_log, 'Chi-Squared-Error': CSE})

# Build the result tables once instead of re-concatenating inside the loop.
results3_POI = pd.DataFrame(metric_rows, columns=['MSE', 'MSE_log', 'Chi-Squared-Error'])
# `melt_POI` keeps the repeating 0..3 index of the per-file frames, as before.
melt_POI = (pd.concat(estimate_frames) if estimate_frames
            else pd.DataFrame(columns=['variable', 'value']))

print(f'Poisson: fitted {len(results3_POI)} data sets, skipped {len(skipped_files)}')

[ 2.49338 -0.9913  -0.99061  1.04451  0.99651 -0.99115  1.04377 -0.99137
  2.49337  2.49336]
D_triu
[[        nan 3.48468e+00 3.48398e+00 1.44886e+00 1.49687e+00 3.48452e+00
  1.44961e+00 3.48474e+00 2.52959e-06 1.71087e-05]
 [        nan         nan 6.93717e-04 2.03581e+00 1.98781e+00 1.51714e-04
  2.03507e+00 6.71199e-05 3.48467e+00 3.48466e+00]
 [        nan         nan         nan 2.03512e+00 1.98711e+00 5.42003e-04
  2.03437e+00 7.60837e-04 3.48398e+00 3.48396e+00]
 [        nan         nan         nan         nan 4.80057e-02 2.03566e+00
  7.42674e-04 2.03588e+00 1.44886e+00 1.44885e+00]
 [        nan         nan         nan         nan         nan 1.98765e+00
  4.72630e-02 1.98787e+00 1.49687e+00 1.49685e+00]
 [        nan         nan         nan         nan         nan         nan
  2.03492e+00 2.18834e-04 3.48452e+00 3.48451e+00]
 [        nan         nan         nan         nan         nan         nan
          nan 2.03514e+00 1.44960e+00 1.44959e+00]
 [        nan         nan

In [None]:
# Persist the Poisson GLMMDRE metrics (`results3_POI`) and parameter estimates (`melt_POI`) as CSV.
poisson_glmmdre_dir = 'output/comparison_state_of_art/Poisson_GLMMDRE_output/'
# Make sure the folder exists so the results of the long fitting cell are not lost on write.
os.makedirs(poisson_glmmdre_dir, exist_ok=True)

filename = 'results3_POI' + '.csv'
results3_POI.to_csv(poisson_glmmdre_dir + filename)

filename = 'melt_POI' + '.csv'
melt_POI.to_csv(poisson_glmmdre_dir + filename)