# Analysis 0: Preprocessing
- Drop variables with > 60% missingness
- Demographics table 

## Import libraries

In [1]:
%load_ext rpy2.ipython 
# Load the R magic extension

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import os
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import patsy
from lifelines import CoxPHFitter 
import statsmodels as sm
from pathlib import Path

In [3]:
# Add the project's code directory to sys.path so the `import utils` below can resolve.
# The path is relative, so it is interpreted against the kernel's working directory.
import sys
module_path = Path('./../code')
sys.path.append(str(module_path))
import utils

In [4]:
# Import libraries to allow data to be passed between Python and R env
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import r

pandas2ri.activate()

In [5]:
%%R
# Run this cell to install packages the first time: uncomment the lines below,
# execute once, then comment them out again.

# install.packages("bshazard")
# install.packages("survival")
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("magrittr")
# install.packages("tableone")
# install.packages("pROC")
# install.packages("PRROC")
# install.packages("caret")
# install.packages("survivalROC")
# install.packages("survminer")
# install.packages("scales")
# install.packages("finalfit")
# install.packages("broom")
# install.packages("broom.helpers")
# install.packages("purrr")


# Presumably a placeholder so the cell still evaluates an expression while
# every install line is commented out.
NULL


In [6]:
%%R
# Attach every R package used by the analysis. The order is unchanged so that
# function masking between packages stays the same.
analysis_packages <- c(
    "bshazard", "survival", "tidyr", "dplyr",
    "magrittr", "tableone", "pROC", "PRROC",
    "caret", "survivalROC", "survminer", "scales",
    "finalfit", "broom", "broom.helpers", "purrr"
)
for (pkg in analysis_packages) {
    library(pkg, character.only = TRUE)
}

R[write to console]: Loading required package: splines

R[write to console]: Loading required package: survival

R[write to console]: Loading required package: Epi

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


R[write to console]: 
Attaching package: ‘magrittr’


R[write to console]: The following object is masked from ‘package:tidyr’:

    extract


R[write to console]: Type 'citation("pROC")' for a citation.

R[write to console]: 
Attaching package: ‘pROC’


R[write to console]: The following objects are masked from ‘package:stats’:

    cov, smooth, var


R[write to console]: Loading required package: rlang

R[write to console]: 
Attaching package: ‘rlang’


R[write to console]: The following object is masked from ‘package:magrittr’:

    set_names


R[write to consol

In [7]:
import warnings
# Suppress pandas' DtypeWarning for the rest of the session.
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

# Display all columns when rendering a DataFrame (None removes the column limit)
pd.set_option('display.max_columns', None)

## Import data

In [8]:
# Output CSV paths, relative to the notebook's working directory.
# They are exported to the R environment further down; the demographics path is
# the one written by the R cell that builds the demographics table.
demographics_table_filename = './../results/demographics_table.csv'
univariate_filename = './../results/univariate_analysis.csv'
multivariate_filename = './../results/multivariate_analysis.csv'

In [9]:
data_filename = './../data/cleaned_cohort_20250429.csv'
# Parse only the analysis columns instead of loading every column of the
# ~1.7M-row file and discarding the rest afterwards. `usecols` returns columns
# in file order, so re-index to keep the order given by utils.VARS_TO_ANALYZE.
df = pd.read_csv(data_filename, usecols=list(utils.VARS_TO_ANALYZE))[utils.VARS_TO_ANALYZE]
df.shape

(1745288, 79)

In [10]:
# Class balance of the primary outcome
df['ugica'].value_counts()

ugica
0.0    1744975
1.0        313
Name: count, dtype: int64

In [11]:
# Frequency of each value of the binary H. pylori indicator
df['hpylori_active_chronic_binary'].value_counts()

hpylori_active_chronic_binary
0    1743688
1       1600
Name: count, dtype: int64

In [12]:
# Pass the DataFrame into the R environment
def pass_df(df: pd.DataFrame, r_df_name: str) -> None:
    """Bind `df` to the name `r_df_name` in R's global environment.

    Args:
        df: Object to expose to R (a pandas DataFrame in this notebook).
        r_df_name: Variable name to assign in the R global environment.

    NOTE(review): the pandas-to-R conversion presumably relies on the earlier
    `pandas2ri.activate()` call — confirm before reusing this elsewhere.
    """
    ro.globalenv[r_df_name] = df

# Variable lists and output paths that the R cells look up by name.
r_exports = {
    'numerical_vars': utils.NUMERICAL_VARS,
    'categorical_vars': utils.CATEGORICAL_VARS,  # + ['sex_clean']
    'demographics_table_filename': str(demographics_table_filename),
    'univariate_filename': str(univariate_filename),
    'multivariate_filename': str(multivariate_filename),
}
for r_name, r_value in r_exports.items():
    ro.globalenv[r_name] = r_value

##### Add variables to compare with current guidelines for risk-factor triggered screening for EAC

In [13]:
def _flag_is_set(value):
    """Return True when a 0/1 indicator is positive, whatever its storage type.

    Accepts ints, floats, bools and numeric strings (1, 1.0, True, '1', '1.0').
    Missing values (None, NaN, pd.NA) and text that does not parse as a number
    count as "not set".
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return False
    # NaN is the only float that is not equal to itself.
    return number == number and number != 0


def num_risk_factors(row):
    """Count how many of the seven screening risk factors a patient row has.

    One point each for: age over 50, sex 'MALE', race 'White', tobacco use,
    GERD, baseline BMI of at least 30, and a family history of esophageal
    cancer or Barrett's esophagus.

    Args:
        row: Object exposing the columns as attributes (e.g. a DataFrame row
            passed by ``df.apply(..., axis=1)``): ``age``, ``sex``,
            ``race_clean``, ``tobacco_binary``, ``gerd``, ``BMI_baseline``,
            ``famhx_esophagealca`` and ``famhx_barretts``.

    Returns:
        int: Number of risk factors present, from 0 to 7.
    """
    score = 0

    # Comparisons against NaN are False, so a missing age/BMI adds no point.
    if row.age > 50:
        score += 1
    if row.sex == 'MALE':
        score += 1
    if row.race_clean == 'White':
        score += 1
    # Indicator columns read from CSV are numeric, so comparing against the
    # string '1' never matched; accept either representation.
    if _flag_is_set(row.tobacco_binary):
        score += 1
    if _flag_is_set(row.gerd):
        score += 1
    if row.BMI_baseline >= 30:
        score += 1
    # NaN is truthy in Python, so a bare `or` counted missing family history
    # as a positive history.
    if _flag_is_set(row.famhx_esophagealca) or _flag_is_set(row.famhx_barretts):
        score += 1

    return score

# Score every patient row, then flag rows with three or more risk factors.
df['eac_risk_factors_screening'] = df.apply(num_risk_factors, axis=1)
df['meets_screening'] = df['eac_risk_factors_screening'].ge(3).astype(int)

##### Clean subtype cancer outcomes


In [14]:
# Replace missing subtype outcome indicators with 0.
subtype_outcome_cols = ['ugica_ESCC', 'ugica_EAC', 'ugica_CGC', 'ugica_NCGC']
df[subtype_outcome_cols] = df[subtype_outcome_cols].fillna(0)

In [15]:
# Event counts for the overall outcome and each subtype
outcome_cols = ['ugica', 'ugica_ESCC', 'ugica_EAC', 'ugica_CGC', 'ugica_NCGC']
df[outcome_cols].sum()

ugica         313.0
ugica_ESCC     62.0
ugica_EAC      68.0
ugica_CGC      63.0
ugica_NCGC    120.0
dtype: float64

In [16]:
# Expose the prepared cohort to the R cells under the name `r_df`.
pass_df(df, r_df_name="r_df")



## Demographics table

In [17]:
%%R 
# Variables summarised in the table: categorical first, then numerical.
vars_to_analyze <- unlist(c(categorical_vars, numerical_vars))
factor_vars <- unlist(categorical_vars)

# Build the table stratified by the `ugica` outcome, with an overall column.
demtable <- CreateTableOne(
    data = r_df,
    vars = vars_to_analyze,
    factorVars = factor_vars,
    strata = "ugica",
    includeNA = TRUE,
    addOverall = TRUE
)

# Capture the formatted table as an object instead of printing it to the console.
demtable_df <- print(
    demtable,
    printToggle = FALSE,
    noSpaces = TRUE,
    quote = FALSE,
    missing = TRUE
)

write.csv(demtable_df, file = demographics_table_filename)

## Preprocessing

In [18]:
%%R 
# Columns excluded from preprocessing: they are kept even when more than 60%
# missing, and they are skipped when normalizing numerical variables and when
# factorizing categorical variables.
cols_to_ignore <- c(
    'months_to_event', 'ugica', 'ugica_ESCC', 'ugica_EAC', 'ugica_CGC', 'ugica_NCGC', 
    'death', 'subtype', 'visit_year', 'diagnosis_year', 'encounter_type', 'social_language', 
    'days_to_event', 'days_to_dx', 'days_to_death',
    "eac_risk_factors_screening", "meets_screening"
)

### Remove variables that have >60% missing

In [19]:
%%R 
# Fraction of missing values per column
missing_fraction <- sapply(r_df, function(column) mean(is.na(column)))

# Columns above the 60% threshold, minus the ones we always keep
high_missing <- names(which(missing_fraction > 0.60))
missing_vars <- high_missing[!(high_missing %in% cols_to_ignore)]
missing_vars

 [1] "alcohol_all_missing"            "alcohol_binary_missing"        
 [3] "hpylori_active_missing"         "hpylori_active_chronic_missing"
 [5] "hgball_baseline"                "hgb_baseline"                  
 [7] "mcv_baseline"                   "wbc_baseline"                  
 [9] "plt_baseline"                   "sodium_baseline"               
[11] "potassium_baseline"             "chloride_baseline"             
[13] "bicarbonate_baseline"           "bun_baseline"                  
[15] "scr_baseline"                   "magnesium_baseline"            
[17] "calcium_baseline"               "phosphate_baseline"            
[19] "ast_baseline"                   "alt_baseline"                  
[21] "alp_baseline"                   "tbili_baseline"                
[23] "tprotein_baseline"              "albumin_baseline"              
[25] "tsh_baseline"                   "vitD_baseline"                 
[27] "triglycerides_baseline"         "LDL_baseline"                  
[29] "

In [20]:
%%R 
# Drop the high-missingness columns, reporting dimensions before and after.
print(dim(r_df))
keep_column <- !(names(r_df) %in% missing_vars)
r_df_nonmissing <- r_df[, keep_column]
print(dim(r_df_nonmissing))

[1] 1745288      81
[1] 1745288      52


### Impute the mean for selected continuous variables (height, weight, BMI) with <= 60% missing

In [21]:
%%R 
# Remaining columns that still contain at least one missing value
has_missing <- sapply(r_df_nonmissing, anyNA)
missing_less_60_vars <- names(which(has_missing))
missing_less_60_vars

 [1] "subtype"                "diagnosis_year"         "sex_missing"           
 [4] "race_clean_missing"     "ethnicity_missing"      "social_language"       
 [7] "tobacco_all_missing"    "tobacco_binary_missing" "age"                   
[10] "days_to_event"          "months_to_event"        "days_to_dx"            
[13] "days_to_death"          "height_baseline"        "weight_baseline"       
[16] "BMI_baseline_all"       "BMI_baseline"          


In [22]:
%%R 
cols_to_impute <- c('BMI_baseline_all', 'BMI_baseline', 'height_baseline', 'weight_baseline')

# Fill a column's missing entries with the mean of its observed values.
impute_with_mean <- function(values) {
    replace(values, is.na(values), mean(values, na.rm = TRUE))
}

r_df_imputed <- r_df_nonmissing %>%
  mutate(across(all_of(cols_to_impute), impute_with_mean))

In [23]:
%%R 
# Distribution of the imputation targets before imputation
summary(r_df_nonmissing[cols_to_impute])

 [1] "Min.   :    0.0  " "1st Qu.:   23.8  " "Median :   27.1  "
 [4] "Mean   :   31.5  " "3rd Qu.:   31.2  " "Max.   :48330.8  "
 [7] "NA's   :820984  "  "Min.   :    0.0  " "1st Qu.:   23.8  "
[10] "Median :   27.2  " "Mean   :   32.6  " "3rd Qu.:   31.2  "
[13] "Max.   :48307.3  " "NA's   :836779  "  "Min.   :  1.6  "  
[16] "1st Qu.: 63.0  "   "Median : 66.0  "   "Mean   : 66.0  "  
[19] "3rd Qu.: 69.0  "   "Max.   :115.0  "   "NA's   :766584  " 
[22] "Min.   :    4  "   "1st Qu.: 2304  "   "Median : 2720  "  
[25] "Mean   : 2794  "   "3rd Qu.: 3184  "   "Max.   :23648  "  
[28] "NA's   :714568  " 


In [24]:
%%R 
# Distribution of the same columns after imputation
summary(r_df_imputed[cols_to_impute])

 [1] "Min.   :    0.04  " "1st Qu.:   26.64  " "Median :   31.53  "
 [4] "Mean   :   31.53  " "3rd Qu.:   31.53  " "Max.   :48330.77  "
 [7] "Min.   :    0.00  " "1st Qu.:   26.80  " "Median :   32.63  "
[10] "Mean   :   32.63  " "3rd Qu.:   32.63  " "Max.   :48307.30  "
[13] "Min.   :  1.61  "   "1st Qu.: 65.00  "   "Median : 65.99  "  
[16] "Mean   : 65.99  "   "3rd Qu.: 66.15  "   "Max.   :115.00  "  
[19] "Min.   :    3.98  " "1st Qu.: 2576.00  " "Median : 2793.47  "
[22] "Mean   : 2793.47  " "3rd Qu.: 2836.80  " "Max.   :23648.00  "


### Normalize continuous variables

In [25]:
%%R 
# Numerical analysis variables present in the frame, skipping the ignored columns
column_names <- names(r_df_imputed)
is_numeric_predictor <- (column_names %in% numerical_vars) & !(column_names %in% cols_to_ignore)
vars_to_normalize <- column_names[is_numeric_predictor]

# Fit centering/scaling on those columns, then apply it to the imputed frame
preproc <- preProcess(r_df_imputed[, vars_to_normalize], method = c("center", "scale"))
r_df_normal <- predict(preproc, r_df_imputed)
dim(r_df_normal)


[1] 1745288      52


### Factorize categorical variables

In [26]:
%%R
# Categorical variables to run univariate analysis on: those present in the
# frame, excluding the ignored columns.
frame_columns <- names(r_df_normal)
vars_categorical_for_univariate <- frame_columns[
    (frame_columns %in% categorical_vars) & !(frame_columns %in% cols_to_ignore)
]

# Convert each of them to a factor
r_df_normal[vars_categorical_for_univariate] <- lapply(
    r_df_normal[vars_categorical_for_univariate], factor
)

# Reference level per column, applied in the order listed.
# The *_missing variants of alcohol_all, alcohol_binary, hpylori_active and
# hpylori_active_chronic are deliberately absent: they are among the columns
# removed for >60% missingness.
reference_levels <- c(
    race_clean = 'White',
    race_clean_missing = 'White',
    ethnicity = 'Not Hispanic or Latino',
    ethnicity_missing = 'Not Hispanic or Latino',
    alcohol_all = '0.0',
    alcohol_binary = '0',
    tobacco_all = '0.0',
    tobacco_all_missing = '0',
    tobacco_binary = '0',
    tobacco_binary_missing = '0',
    hpylori_active = '0.0',
    hpylori_active_chronic = '0.0',
    hpylori_active_chronic_binary = '0'
)
for (column in names(reference_levels)) {
    r_df_normal[[column]] <- relevel(r_df_normal[[column]], ref = reference_levels[[column]])
}

In [27]:
%%R 
# Save the preprocessed frame (imputed, normalized, factorized) as CSV.
# The bare filename is relative, so the file is written to R's current working
# directory rather than next to the other outputs under ./../results.
# NOTE(review): CSV stores factor columns as plain values, so the reference
# levels set above will need to be re-applied by whatever reads this file.
write.csv(r_df_normal, "df_analysis0_imputed.csv", row.names = FALSE)