In [1]:
# import simbipartiteTest as simTest
import matplotlib.pyplot as plt
import matplotlib.gridspec # To plot clustermap and heatmap side by side
import seaborn as sns
# import CostVisitSimTest as CostSim
import pandas as pd
import pytwoway as tw
import bipartitepandas as bpd
import numpy as np
# import PyChest
import ruptures as rpt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
# import scipy
import time
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Ignore warnings below
simplefilter("ignore", category=ConvergenceWarning) # Useful for logistic regression
pd.options.mode.chained_assignment = None  # default='warn' # Remove copy on slice warning

In [50]:
def temporal_simulation_v2(nb_of_periods,
                           n_patients,
                           n_doctors,
                           z=np.sqrt(2),
                           alpha_law_graph=(0, 0.5),
                           psi_law_graph=(0, 0.5),
                           alpha_law_cost=(0, 0.5),
                           psi_law_cost=(0, 0.5),
                           preconditioner = 'ichol',
                           beta_age_p_graph=0.01,
                           beta_age_d_graph=0.01,
                           beta_sex_p_graph=0.5,
                           beta_sex_d_graph=0.5,
                           beta_distance_graph=0.5,
                           beta_age_p_cost=0.5,
                           beta_age_d_cost=0.5,
                           beta_sex_p_cost=0.5,
                           beta_sex_d_cost=0.5,
                           beta_distance_cost=0.5):
    """
    dataframe has to be the dataframe of connections between patients and doctors.
    """

    rng = np.random.default_rng(None)
    alpha_graph = []
    psi_graph = []
    alpha_cost = {} # These are dicts to use the function map later
    psi_cost = {}
    changepoint_patient = np.zeros(n_patients)
    changepoint_doctor = np.zeros(n_doctors + 1)
    coor_patients = []
    coor_doctors = []
    D = np.zeros([n_patients, n_doctors + 1], dtype = np.ndarray)
    log = LogisticRegression()
    lin = LinearRegression()

    for i in range(n_patients):
        
        # We generate the FE for the graph formation model
        alpha_graph.append( np.random.uniform(alpha_law_graph[0], alpha_law_graph[1]) )
        
        # We generate the FE for the cost model
        alpha_cost[i] = np.random.uniform(alpha_law_cost[0], alpha_law_cost[1])

        # We generate the periods when there's a changepoint for each patient
        changepoint_patient[i] = np.random.randint(0, nb_of_periods)

        # Generate the coordinates of the patients
        coor_patients.append( np.random.uniform(0, 1, 2) )
                               
    for j in range(n_doctors + 1):

        # We generate the FE for the graph formation model
        psi_graph.append( np.random.uniform(psi_law_graph[0], psi_law_graph[1]) )

        # We generate the FE for the cost model
        psi_cost[j] = np.random.uniform(psi_law_cost[0], psi_law_cost[1])

        # We generate the periods when there's a changepoint for each doctor
        changepoint_doctor[j] = np.random.randint(0, nb_of_periods)
        
        if j != 0:
            
            # Generate the coordinates of the doctors
            coor_doctors.append( np.random.uniform(0, 1, 2) )

    # Generate distance matrix
    for i in range(n_patients):
        for j in range(0, n_doctors + 1):
            if j == 0: # We associate the indice 0 to the "ghost doctor"
                D[i][0] = 0
            else: # we take the j-1 index of coor_doctors as we added the ghost doctor, j = 1 corresponds to j = 0 in coord_doctors
                d = np.sqrt(np.power((coor_patients[i][0] - coor_doctors[j-1][0]), 2) + np.power((coor_patients[i][1] - coor_doctors[j-1][1]), 2))
                D[i][j] = d

    # Random draws of ages for patients and doctors
    sim_patient_age = rng.integers(low = 1, high = 99, size = n_patients)
    sim_doctor_age = rng.integers(low = 26, high = 99, size = n_doctors + 1)

    # Random draws of genders of patients and doctors
    sim_patient_gender = rng.integers(low = 0, high = 2, size = n_patients)
    sim_doctor_gender = rng.integers(low = 0, high = 2, size = n_doctors + 1)

    # Compile ids
    id_p = np.repeat(range(n_patients), n_doctors + 1)
    id_d = np.tile(range(n_doctors + 1), n_patients)

    # Compile observed features
    age_p_data = np.repeat(sim_patient_age, n_doctors + 1)
    age_d_data = np.tile(sim_doctor_age, n_patients)
    sex_p_data = np.repeat(sim_patient_gender, n_doctors + 1)
    sex_d_data = np.tile(sim_doctor_gender, n_patients)

    estimates = []
                               
    # At each period, determine connections                           
    for t in range(nb_of_periods):
    
        # Generate the identifier matrix A based on the distance
        A = np.zeros([n_patients, n_doctors + 1], dtype = np.ndarray)
        for i in range(0, n_patients):
            for j in range(0, n_doctors + 1):
                if j == 0:
                    A[i][0] = 1
                elif D[i][j] > z: # if patient i and doctor j are too far away, there is no relation
                    continue
                else:
                    T = alpha_graph[i] + psi_graph[j] + beta_age_p_graph * sim_patient_age[i] + beta_age_d_graph * sim_doctor_age[j] + beta_sex_p_graph * sim_patient_gender[i] + beta_sex_d_graph * sim_doctor_gender[j] + beta_distance_graph * D[i][j]
                    p = 1 / (1 + np.exp(-T))
                    A[i][j] = np.random.binomial(1, p)

        # Compile relations between doctors and patients
        relation = A.flatten()

        # Merge all columns into a dataframe
        dataframe = pd.DataFrame(data={'i': id_p, 'j': id_d, 'y' : relation, 'age_p': age_p_data, 'age_d': age_d_data, 
                               'sex_p': sex_p_data, 'sex_d': sex_d_data
                                })
        dataframe['distance'] = D[dataframe['i'], dataframe['j']].astype(float)

        # Logistic regression for graph formation

        # Add dummy variables
        e_i = pd.DataFrame(np.zeros((n_patients*(n_doctors + 1), n_patients), dtype=int))
        for col in e_i.columns:
            e_i.rename(columns = {col :f'p_{col}'}, inplace = True)
            
        e_j = pd.DataFrame(np.zeros((n_patients*(n_doctors + 1), n_doctors + 1), dtype=int))
        for col in e_j.columns:
            e_j.rename(columns = {col :f'd_{col}'}, inplace = True)
        
        df = pd.concat([dataframe, e_i, e_j], axis = 1)
        
        for i in range(n_patients):
            indexes = df[df['i'] == i].index
            df[f'p_{i}'][indexes] = [1 for i in range(len(indexes))]
        
        for j in range(n_doctors + 1):
            indexes = df[df['j'] == j].index
            df[f'd_{j}'][indexes] = [1 for i in range(len(indexes))]
        
        y = df['y'].astype(int)
        X = df.drop(['i', 'j', 'y'], axis = 1)
        
        reg = log.fit(X, y)
        coeffs = reg.coef_[0]

        # drop the rows if there is no relation between patient_i and doctor_j
        dataframe = dataframe.drop(dataframe[dataframe['y'] == 0].index)
        dataframe = dataframe.drop('y', axis = 1)
        dataframe = dataframe.reset_index().drop(['index'], axis = 1)

        # We update the laws (if needed) of the patients/doctors
        list_of_indexes_patient = np.where(changepoint_patient == t)[0]
        list_of_indexes_doctor = np.where(changepoint_doctor == t)[0]
        for index_patient in list_of_indexes_patient: 
            
            alpha_cost[index_patient] = np.random.uniform( np.random.uniform(alpha_law_graph[0] + 5, alpha_law_graph[1] + 5) )
    
        for index_doctor in list_of_indexes_doctor:
            
            psi_cost[index_doctor] = np.random.uniform( np.random.uniform(psi_law_graph[0] + 5, psi_law_graph[1] + 5) )

        dataframe['alpha'] = dataframe['i'].map(alpha_cost).astype(float)
        dataframe['psi'] = dataframe['j'].map(psi_cost).astype(float)
        # dataframe['distance'] = D[dataframe['i'], dataframe['j']].astype(float)

        # Compute the cost
        dataframe['y'] = dataframe['alpha'] + dataframe['psi'] + beta_age_p_cost * dataframe['age_p'] + beta_age_d_cost * dataframe['age_d'] + beta_sex_p_cost * dataframe['sex_p'] + beta_sex_d_cost * dataframe['sex_d'] + beta_distance_cost * dataframe['distance']

        # Change dtype of categorical variables
        # dataframe['sex_p'] = dataframe['sex_p'].astype("category")
        # dataframe['sex_d'] = dataframe['sex_d'].astype("category")
        
        # We estimate the FE for the cost model using LinearRegression

        # Add dummy variables
        e_i_cost = pd.DataFrame(np.zeros((len(dataframe), n_patients), dtype=int))
        for col in e_i_cost.columns:
            e_i_cost.rename(columns = {col :f'p_{col}'}, inplace = True)
            
        e_j_cost = pd.DataFrame(np.zeros((len(dataframe), n_doctors + 1), dtype=int))
        for col in e_j_cost.columns:
            e_j_cost.rename(columns = {col :f'd_{col}'}, inplace = True)
        
        df2 = pd.concat([dataframe, e_i_cost, e_j_cost], axis = 1)
        
        for i in range(n_patients):
            indexes = df2[df2['i'] == i].index
            df2[f'p_{i}'][indexes] = [1 for i in range(len(indexes))]
        
        for j in range(n_doctors + 1):
            indexes = df2[df2['j'] == j].index
            df2[f'd_{j}'][indexes] = [1 for i in range(len(indexes))]

        # Scale specific columns
        df2['age_p'] =( df['age_p'] - df['age_p'].mean() ) / df['age_p'].std()
        df2['age_d'] =( df['age_d'] - df['age_d'].mean() ) / df['age_d'].std()
        
        y_2 = df2['y'].astype(int)
        X_2 = df2.drop(['i', 'j', 'y', 'alpha', 'psi'], axis = 1)
        
        reg_2 = lin.fit(X_2, y_2)
        coeffs_2 = reg_2.coef_

        d = {}
        d['estimates'] = coeffs_2 # Estimates of the EF, Beta for the cost model
        # d['estimates'] = [fe_estimator.alpha_hat, fe_estimator.psi_hat]
        d['true_value'] = dataframe # True values of the features, the initial dataframe.
        d['graph'] = {}
        d['graph']['coeffs'] = coeffs
        d['graph']['alpha'] = alpha_graph
        d['graph']['psi'] = psi_graph
        estimates.append(d)

    return estimates

In [51]:
simulation = temporal_simulation_v2(nb_of_periods=30,
                                  n_patients=100,
                                  n_doctors=50,
                                  z=1.1,
                                  alpha_law_graph=(0, 0.5),
                                  psi_law_graph=(0, 0.5),
                                  alpha_law_cost=(0, 0.5),
                                  psi_law_cost=(0, 0.5),
                                  preconditioner = 'ichol',
                                  beta_age_p_graph=0.01,
                                  beta_age_d_graph=0.01,
                                  beta_sex_p_graph=0.5,
                                  beta_sex_d_graph=0.5,
                                  beta_distance_graph=-0.5,
                                  beta_age_p_cost=0.01,
                                  beta_age_d_cost=0.01,
                                  beta_sex_p_cost=0.5,
                                  beta_sex_d_cost=0.5,
                                  beta_distance_cost=0.5)

In [52]:
simulation[0]['true_value']

Unnamed: 0,i,j,age_p,age_d,sex_p,sex_d,distance,alpha,psi,y
0,0,0,9,85,1,0,0.000000,0.00144,0.135341,1.576782
1,0,2,9,96,1,1,0.262108,0.00144,0.179622,2.362116
2,0,3,9,39,1,1,0.674835,0.00144,3.734746,5.553604
3,0,4,9,33,1,1,0.566027,0.00144,0.376139,2.080592
4,0,5,9,78,1,0,0.404427,0.00144,4.917124,6.490777
...,...,...,...,...,...,...,...,...,...,...
4319,99,45,92,45,0,0,0.602528,0.39500,0.442700,2.508964
4320,99,46,92,36,0,1,0.288076,0.39500,0.462524,2.781562
4321,99,47,92,48,0,0,0.624409,0.39500,0.294874,2.402078
4322,99,49,92,27,0,1,0.825499,0.39500,0.451272,2.949022


In [63]:
simulation[2]['estimates']

array([-1.24077512e-02, -6.02103534e-04, -7.87951418e+10,  1.85604279e+11,
        5.17199329e-01, -3.41787527e+10, -3.41787527e+10, -1.12973894e+11,
       -3.41787527e+10, -3.41787527e+10, -3.41787527e+10, -3.41787527e+10,
       -3.41787527e+10, -1.12973894e+11, -3.41787527e+10, -3.41787527e+10,
       -1.12973894e+11, -1.12973894e+11, -1.12973894e+11, -3.41787527e+10,
       -3.41787527e+10, -3.41787527e+10, -3.41787527e+10, -1.12973894e+11,
       -1.12973894e+11, -3.41787527e+10, -1.12973894e+11, -3.41787527e+10,
       -3.41787527e+10, -1.12973894e+11, -1.12973894e+11, -1.12973894e+11,
       -1.12973894e+11, -1.12973894e+11, -3.41787527e+10, -1.12973894e+11,
       -3.41787527e+10, -1.12973894e+11, -1.12973894e+11, -1.12973894e+11,
       -3.41787527e+10, -1.12973894e+11, -3.41787527e+10, -1.12973894e+11,
       -3.41787527e+10, -1.12973894e+11, -3.41787527e+10, -1.12973894e+11,
       -3.41787527e+10, -3.41787527e+10, -3.41787527e+10, -3.41787527e+10,
       -3.41787527e+10, -

In [66]:
simulation[0]['estimates'][10]

124878827547.599