In [3]:
# Import Necessary Libraries
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from patsy import dmatrix
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the two input tables.
#   df   : longitudinal check-up data. Columns: MRN, Date of check-up, Tac Dose,
#          Tac trough Level, Bili(rubin), Creat(inine), Meld, Tac dose-to-trough
#          Ratio, Age at the time of the check-up, and Time = number of days since
#          that patient's first check-up (in the first patient shown below, the
#          2007-04-09 check-up has Time 0 and the 2007-06-20 check-up has Time 72).
#   mrns : one row per patient; only DATE_CIRRHOSIS, MRN, Gender and Age are kept.
# Date columns are parsed day-first with mixed formats allowed; rows of df that
# contain any missing value are dropped and the index is renumbered from 0.
df = (
    pd.read_csv("processed_df.csv", index_col=0)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='mixed', dayfirst=True))
    .dropna()
    .reset_index(drop=True)
)
mrns = (
    pd.read_csv("mrns.csv")[['DATE_CIRRHOSIS', 'MRN', 'Gender', 'Age']]
    .assign(DATE_CIRRHOSIS=lambda frame: pd.to_datetime(frame['DATE_CIRRHOSIS'], format='mixed', dayfirst=True))
)

# Display copy without the patient identifier column.
df_show = df.drop(columns='MRN')
df_show

Unnamed: 0,Date,Dose,Level,Bili,Creat,Meld,Ratio,Age,Time
0,2007-04-09,10.0,13.3,5.0,93.0,7.0,1.330,58.633128,0
1,2007-06-20,10.0,17.0,8.0,85.0,7.0,1.700,58.830253,72
2,2007-07-26,6.0,19.2,10.0,96.0,8.0,3.200,58.928816,108
3,2005-05-26,4.0,10.5,17.0,101.0,10.0,2.625,46.932923,0
4,2005-06-02,4.0,11.6,23.0,120.0,13.0,2.900,46.952088,7
...,...,...,...,...,...,...,...,...,...
6254,2009-11-02,4.0,5.7,25.0,76.0,10.0,1.425,61.726215,287
6255,2009-11-11,4.0,8.1,19.0,68.0,9.0,2.025,61.750856,296
6256,2009-12-11,4.0,7.3,40.0,66.0,12.0,1.825,61.832991,326
6257,2010-01-14,4.0,9.0,31.0,70.0,11.0,2.250,61.926078,360


In [5]:
# The mrns dataframe holds, for each individual patient (MRN), the date of
# cirrhosis, the gender and the age at the diagnosis of cirrhosis.
# The MRN column is left out of the displayed copy.
mrns_show = mrns.drop(columns='MRN')
mrns_show

Unnamed: 0,DATE_CIRRHOSIS,Gender,Age
0,2020-02-24,Female,60.0
1,2020-01-25,,
2,2020-01-24,Male,75.0
3,2020-01-21,Male,68.0
4,2020-01-17,,
...,...,...,...
448,1999-06-03,,
449,1999-01-27,,
450,1999-01-19,,
451,1999-01-12,,


## Defining Spline Terms
Since `mixedlm` from the statsmodels library fits a linear mixed-effects model, I used cubic splines of Time to introduce nonlinearity. Rather than simply adding polynomial terms of Time, I expected spline terms to capture a more complex relationship.

In [6]:
# Build a cubic B-spline basis of Time (degree 3, one interior knot at Time = 100)
# and append it to df as columns spline_0, spline_1, ...
# NOTE: the formula does not remove patsy's implicit intercept, so the design
# matrix also contains a constant Intercept column; it is the first column and
# therefore the one that gets renamed to spline_0.
spline_terms = dmatrix("bs(Time, degree=3, knots=(100,), include_intercept=False)", df, return_type='dataframe')
spline_terms.columns = [f'spline_{i}' for i in range(spline_terms.shape[1])]
# Remove spline columns left over from an earlier run of this cell before
# concatenating, so re-running the cell does not add duplicate spline_* columns.
df = pd.concat([df.drop(columns=spline_terms.columns, errors='ignore'), spline_terms], axis=1)

In [7]:
# Build the modelling table: drop the identifier, Date (Time is used instead),
# Tac Dose and Tac trough Level, then standardize every remaining column
# (Bili, Creat, Meld, Ratio, Age, Time and the spline columns).
# The columns are dropped by name rather than by position, so a change in the
# column order of df raises a KeyError instead of silently scaling the wrong data.
NON_MODEL_COLUMNS = ['MRN', 'Date', 'Dose', 'Level']
model_inputs = df.drop(columns=NON_MODEL_COLUMNS)
model_df_columns = model_inputs.columns

scaler = StandardScaler()
model_df = pd.DataFrame(
    scaler.fit_transform(model_inputs),
    columns=model_df_columns,
)
# Re-attach the unscaled MRN as the last column; it is the grouping variable
# for the mixed-effects models.
model_df = pd.concat([model_df, df['MRN'].reset_index(drop=True)], axis=1)
model_df

Unnamed: 0,Bili,Creat,Meld,Ratio,Age,Time,spline_0,spline_1,spline_2,spline_3,spline_4,MRN
0,-0.509093,-0.211609,-0.741982,-0.741173,0.484659,-0.997308,0.0,-2.262861,-1.563418,-0.723816,-0.281814,141619
1,-0.421610,-0.349984,-0.741982,-0.587974,0.501022,-0.923111,0.0,1.249791,-1.437186,-0.722916,-0.281814,141619
2,-0.363287,-0.159718,-0.498965,0.033101,0.509203,-0.886013,0.0,1.268199,-1.326119,-0.720779,-0.281814,141619
3,-0.159159,-0.073234,-0.012929,-0.204978,-0.486560,-0.997308,0.0,-2.262861,-1.563418,-0.723816,-0.281814,498916
4,0.015808,0.255407,0.716124,-0.091114,-0.484969,-0.990094,0.0,-1.547090,-1.561870,-0.723815,-0.281814,498916
...,...,...,...,...,...,...,...,...,...,...,...,...
6254,0.074131,-0.505656,-0.012929,-0.701838,0.741412,-0.701551,0.0,0.964352,-0.801549,-0.683365,-0.281421,6381914
6255,-0.100837,-0.644031,-0.255947,-0.453408,0.743457,-0.692277,0.0,0.949554,-0.776893,-0.680340,-0.281361,6381914
6256,0.511549,-0.678625,0.473106,-0.536218,0.750275,-0.661361,0.0,0.900552,-0.695872,-0.669495,-0.281119,6381914
6257,0.249098,-0.609437,0.230088,-0.360246,0.758002,-0.626324,0.0,0.845621,-0.606205,-0.655811,-0.280756,6381914


# Attempt 1: Models with both Men / Women
**Input**: Bili, Creat, Meld, Age, Spline 1, Spline 2, Spline 3, Spline 4
**Output**: Ratio
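- I removed spline 0 because it turned out to be all 0, which caused an error when I ran the mixed-effects model. (Spline 0 is the constant intercept column that patsy's `dmatrix` adds by default; standardizing a constant column turns it into all zeros.)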

In [8]:
# Pooled model (men and women together): Ratio regressed on spline_1..spline_4,
# Bili, Creat, Meld and Age, with observations grouped by MRN.
# reml=False requests an ML fit so that AIC and BIC can be calculated.
model_a1_1 = smf.mixedlm(
    formula="Ratio ~ spline_1 + spline_2 + spline_3 + spline_4 + Bili + Creat + Meld + Age",
    data=model_df,
    groups=model_df["MRN"],
)
result_a1_1 = model_a1_1.fit(reml=False)
print(result_a1_1.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Ratio     
No. Observations: 6259    Method:             ML        
No. Groups:       143     Scale:              0.4577    
Min. group size:  3       Log-Likelihood:     -6694.2472
Max. group size:  160     Converged:          Yes       
Mean group size:  43.8                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.020    0.060  0.342 0.732 -0.097  0.138
spline_1      -0.005    0.015 -0.325 0.745 -0.033  0.024
spline_2       0.105    0.015  6.980 0.000  0.076  0.135
spline_3       0.033    0.020  1.601 0.109 -0.007  0.073
spline_4       0.040    0.015  2.699 0.007  0.011  0.069
Bili          -0.010    0.014 -0.735 0.462 -0.038  0.017
Creat         -0.026    0.017 -1.472 0.141 -0.060  0.008
Meld           0.136    0.016  8.575 0.00

In [10]:
# Reduced pooled model: of the spline columns, keep only spline_2 (the one with
# the lowest p-value, 0.000, in the full pooled model) alongside Bili, Creat,
# Meld and Age. Observations are grouped by MRN; the fit uses ML (reml=False).
model_a1_2 = smf.mixedlm(
    formula="Ratio ~  spline_2 + Bili + Creat + Meld + Age",
    data=model_df,
    groups=model_df["MRN"],
)
result_a1_2 = model_a1_2.fit(reml=False)

# Show the fitted model's summary table.
print(result_a1_2.summary())


         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Ratio     
No. Observations: 6259    Method:             ML        
No. Groups:       143     Scale:              0.4585    
Min. group size:  3       Log-Likelihood:     -6704.0068
Max. group size:  160     Converged:          Yes       
Mean group size:  43.8                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.021    0.062  0.342 0.732 -0.100  0.142
spline_2       0.105    0.012  8.909 0.000  0.082  0.129
Bili          -0.006    0.014 -0.424 0.671 -0.034  0.022
Creat         -0.031    0.017 -1.776 0.076 -0.064  0.003
Meld           0.135    0.016  8.504 0.000  0.104  0.166
Age            0.229    0.045  5.111 0.000  0.141  0.317
Group Var      0.523    0.099                           



# Attempt 2: Men and Women were separated into Two Groups

In [11]:
# Split the standardized modelling table by the gender recorded in mrns.
# A patient whose Gender is neither 'Male' nor 'Female' (e.g. missing) ends up
# in neither subset.
men_mrns = mrns.loc[mrns['Gender'] == 'Male', 'MRN'].values
women_mrns = mrns.loc[mrns['Gender'] == 'Female', 'MRN'].values

# Keep the rows of model_df belonging to each group and renumber them from 0.
men = model_df.loc[model_df['MRN'].isin(men_mrns)].reset_index(drop=True)
women = model_df.loc[model_df['MRN'].isin(women_mrns)].reset_index(drop=True)

In [12]:
# Men only, all four spline columns: same formula as the full pooled model,
# fitted on the `men` subset and grouped by MRN. ML fit (reml=False).
model_men_1 = smf.mixedlm(
    formula="Ratio ~ spline_1 + spline_2 + spline_3 + spline_4 + Bili + Creat + Meld + Age",
    data=men,
    groups=men["MRN"],
)
result_men_1 = model_men_1.fit(reml=False)
print(result_men_1.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Ratio     
No. Observations: 3436    Method:             ML        
No. Groups:       80      Scale:              0.4152    
Min. group size:  3       Log-Likelihood:     -3511.5455
Max. group size:  131     Converged:          Yes       
Mean group size:  43.0                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.006    0.079  0.077 0.938 -0.149  0.161
spline_1      -0.001    0.019 -0.075 0.940 -0.038  0.036
spline_2       0.114    0.024  4.767 0.000  0.067  0.161
spline_3      -0.030    0.045 -0.669 0.504 -0.118  0.058
spline_4       0.190    0.052  3.690 0.000  0.089  0.292
Bili          -0.038    0.019 -2.017 0.044 -0.075 -0.001
Creat         -0.077    0.029 -2.661 0.008 -0.133 -0.020
Meld           0.107    0.026  4.107 0.00

In [13]:
# Women only, all four spline columns: same formula as the full pooled model,
# fitted on the `women` subset and grouped by MRN. ML fit (reml=False).
model_women_1 = smf.mixedlm(
    formula="Ratio ~ spline_1 + spline_2 + spline_3 + spline_4 + Bili + Creat + Meld + Age",
    data=women,
    groups=women["MRN"],
)
result_women_1 = model_women_1.fit(reml=False)
print(result_women_1.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Ratio     
No. Observations: 2823    Method:             ML        
No. Groups:       63      Scale:              0.4977    
Min. group size:  5       Log-Likelihood:     -3132.0070
Max. group size:  160     Converged:          Yes       
Mean group size:  44.8                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept      0.089    0.091  0.976 0.329 -0.090  0.267
spline_1      -0.024    0.023 -1.049 0.294 -0.069  0.021
spline_2       0.137    0.022  6.127 0.000  0.093  0.181
spline_3      -0.009    0.028 -0.305 0.760 -0.064  0.047
spline_4       0.028    0.017  1.638 0.101 -0.005  0.061
Bili           0.077    0.023  3.360 0.001  0.032  0.122
Creat          0.009    0.023  0.383 0.702 -0.036  0.053
Meld           0.155    0.021  7.533 0.00

In [14]:
# Men only, reduced model: spline_2 is the only spline column kept, together
# with Bili, Creat, Meld and Age. Grouped by MRN; ML fit (reml=False).
model_men_2 = smf.mixedlm(
    formula="Ratio ~ spline_2 + Bili + Creat + Meld + Age",
    data=men,
    groups=men["MRN"],
)
result_men_2 = model_men_2.fit(reml=False)
print(result_men_2.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Ratio     
No. Observations: 3436    Method:             ML        
No. Groups:       80      Scale:              0.4172    
Min. group size:  3       Log-Likelihood:     -3525.3126
Max. group size:  131     Converged:          Yes       
Mean group size:  43.0                                  
--------------------------------------------------------
              Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept     -0.058    0.084 -0.687 0.492 -0.222  0.107
spline_2       0.074    0.017  4.242 0.000  0.040  0.109
Bili          -0.034    0.019 -1.826 0.068 -0.071  0.003
Creat         -0.091    0.029 -3.154 0.002 -0.147 -0.034
Meld           0.110    0.026  4.229 0.000  0.059  0.161
Age            0.261    0.078  3.336 0.001  0.108  0.415
Group Var      0.535    0.146                           



In [15]:
# Women only, reduced model: spline_2 is the only spline column kept, together
# with Bili, Creat, Meld and Age. Grouped by MRN; ML fit (reml=False).
model_women_2 = smf.mixedlm(
    formula="Ratio ~ spline_2 + Bili + Creat + Meld + Age",
    data=women,
    groups=women["MRN"],
)
result_women_2 = model_women_2.fit(reml=False)
print(result_women_2.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Ratio     
No. Observations: 2823    Method:             ML        
No. Groups:       63      Scale:              0.4985    
Min. group size:  5       Log-Likelihood:     -3134.8486
Max. group size:  160     Converged:          Yes       
Mean group size:  44.8                                  
---------------------------------------------------------
            Coef.  Std.Err.    z    P>|z|  [0.025  0.975]
---------------------------------------------------------
Intercept   0.111     0.091  1.212  0.226  -0.068   0.290
spline_2    0.130     0.017  7.586  0.000   0.096   0.164
Bili        0.080     0.023  3.475  0.001   0.035   0.125
Creat       0.006     0.022  0.259  0.796  -0.038   0.049
Meld        0.154     0.021  7.493  0.000   0.114   0.194
Age         0.232     0.055  4.191  0.000   0.124   0.341
Group Var   0.495     0.133                              



# AIC and BIC Comparison

In [16]:
# Collect AIC and BIC of the six fitted models into one table, one row per model.
# NOTE(review): the pooled, men-only and women-only models are fitted to
# different sets of observations, so their AIC/BIC values are presumably only
# meaningful when compared within the same data subset.
fitted_by_label = {
    "Both Men / Women, All Splines": result_a1_1,
    "Both Men / Women, Spline 2": result_a1_2,
    "Only Men, All Splines": result_men_1,
    "Only Men, Spline 2": result_men_2,
    "Only Women, All Splines": result_women_1,
    "Only Women, Spline 2": result_women_2,
}
labels = list(fitted_by_label)
aics = [fit.aic for fit in fitted_by_label.values()]
bics = [fit.bic for fit in fitted_by_label.values()]
result_df = pd.DataFrame({"AIC": aics, "BIC": bics}, index=labels)
result_df

Unnamed: 0,AIC,BIC
"Both Men / Women, All Splines",13410.494439,13484.653972
"Both Men / Women, Spline 2",13424.013632,13477.947838
"Only Men, All Splines",7045.090941,7112.653637
"Only Men, Spline 2",7066.625101,7115.761607
"Only Women, All Splines",6286.014099,6351.415209
"Only Women, Spline 2",6285.697198,6333.261641
