In [None]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from itertools import combinations
from scipy.stats import fisher_exact
import statsmodels.stats.contingency_tables as ct

In [None]:
!pip install plotnine

In [None]:
from plotnine import ggplot, aes, geom_tile, geom_text, scale_fill_gradient, ggtitle

In [None]:
# Read 'tblICD10.txt' into the `icd` data frame, requesting string dtype for the
# columns keyed 0-4 so that codes and identifiers are kept as text.
# NOTE(review): the original comment named the file "TableObe65plus", but the code
# reads 'tblICD10.txt' — confirm which file is intended.
# NOTE(review): the dtype mapping lists five keys while only four column names are
# assigned later — presumably the extra key is ignored; verify against the file.
icd = pd.read_csv('tblICD10.txt', dtype={0: str, 1: str, 2: str, 3: str, 4: str})

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
icd.info()

In [None]:
# Widen the text display so previewed rows are not wrapped, then show the first rows.
pd.options.display.width = 1000
print(icd.head(n=5))

In [None]:
# Assign the four column labels used throughout the rest of the notebook.
icd.columns = [
    'patient',
    'ICD',
    'Hispanic',
    'year',
]

In [None]:
# Preview the first rows with the new column labels.
print(icd.head(n=5))

In [None]:
#format the data
# Raw strings are used for the patterns: '\.' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# 1. drop everything up to and including the last ':' in the ICD value
icd['ICD'] = icd['ICD'].str.replace(r'.*:', '', regex=True)
# 2. drop the first '.' and everything after it
icd['ICD'] = icd['ICD'].str.replace(r'\..*', '', regex=True)
# 3. drop everything up to and including the last ':' in the Hispanic flag
icd['Hispanic'] = icd['Hispanic'].str.replace(r'.*:', '', regex=True)

In [None]:
# Preview the first rows after the ICD / Hispanic values were cleaned up.
print(icd.head(n=5))

## Hispanics network analysis

In [None]:
#estimate number of patients with each code and with each pair of disorders to later build the network

#1. enumerate every unordered pair of distinct codes present in icd['ICD']
#   (`combinations` is already imported in the notebook's import cell)
fCodes = icd['ICD'].unique()

# One row per pair: column 0 holds the first code, column 1 the second.
# The previous `.T` followed by `.transpose()` cancelled each other out, so the
# frame is built directly in its final orientation.
fCodesPairs = pd.DataFrame(list(combinations(fCodes, 2)))

print(fCodesPairs.head())

In [None]:
# Keep only the rows whose Hispanic flag is 'Y' and preview the first ten of them.
icd_hispanics = icd.loc[icd['Hispanic'] == 'Y']
print(icd_hispanics.head(n=10))

In [None]:
# Prepare an empty table with one column per statistic stored for each disease pair.
column_names = [
    "disAcode", "disBcode",
    "disA", "disB",
    "AB", "AnotB", "BnotA", "notAnotB",
    "fisher", "oddsRatio", "95%confidenceInterval", "relativeRisk",
]
results = pd.DataFrame(columns=column_names)

print(results.head(n=5))

In [None]:
## create a data frame with pairs of diseases and their co-occurrence counts
# columns: "disAcode", "disBcode", "disA", "disB", "AB", "AnotB", "BnotA",
#          "notAnotB", "fisher", "oddsRatio", "95%confidenceInterval", "relativeRisk"
# (`fisher_exact` and `ct` are already imported in the notebook's import cell)

hispanic_result_columns = ["disAcode", "disBcode", "disA", "disB", "AB",
                           "AnotB", "BnotA", "notAnotB", "fisher", "oddsRatio",
                           "95%confidenceInterval", "relativeRisk"]

# Number of distinct patients in the Hispanic subset; identical for every pair,
# so it is computed once instead of on every iteration.
n_hispanic_patients = len(icd_hispanics['patient'].unique())

# Rows are collected in a list and turned into a frame once at the end, which
# avoids growing the data frame one `.loc[i]` assignment at a time.
hispanic_rows = []

for i in range(fCodesPairs.shape[0]):
    code1 = fCodesPairs.iloc[i, 0]
    code2 = fCodesPairs.iloc[i, 1]

    dis1 = icd_hispanics[icd_hispanics['ICD'] == code1]
    dis2 = icd_hispanics[icd_hispanics['ICD'] == code2]

    # records of code2 that belong to patients who also have code1
    dis12 = dis2[dis2['patient'].isin(dis1['patient'])]

    disA = len(dis1['patient'].unique())
    disB = len(dis2['patient'].unique())
    AB = len(dis12['patient'].unique())
    AnotB = disA - AB
    BnotA = disB - AB
    notAB = n_hispanic_patients - AB - AnotB - BnotA

    # 2x2 contingency table: rows = A present/absent, columns = B present/absent
    mm = [[AB, AnotB], [BnotA, notAB]]

    # Reset per pair: if a test fails, the row must carry NaN rather than the
    # values left over from the previous pair (or a NameError on the first pair).
    p_value = ci_lower = ci_upper = np.nan
    try:
        _, p_value = fisher_exact(mm)
        ci_lower, ci_upper = ct.Table2x2(mm).oddsratio_confint()
    except Exception as e:
        print(str(e))
        print("code1:", code1, "- code2:", code2)

    conf_interval = f"({round(ci_lower, 3)}, {round(ci_upper, 3)})"

    # NumPy scalars yield inf / nan on a zero denominator instead of raising
    # ZeroDivisionError (a code absent from this subset, or an empty AnotB/BnotA cell).
    with np.errstate(divide='ignore', invalid='ignore'):
        relativeRisk = float((np.float64(AB) * n_hispanic_patients)
                             / (np.float64(disA) * np.float64(disB)))
        oddsRatio = float((np.float64(AB) * np.float64(notAB))
                          / (np.float64(AnotB) * np.float64(BnotA)))

    hispanic_rows.append([code1, code2, disA, disB, AB, AnotB, BnotA,
                          notAB, p_value, oddsRatio, conf_interval, relativeRisk])

# Row labels are 0..n-1, matching the pair order in fCodesPairs.
results = pd.DataFrame(hispanic_rows, columns=hispanic_result_columns)


In [None]:
# Preview the first disease pairs and their statistics.
print(results.head(n=5))

In [None]:
# correct for multiple testing and add the comorbidity score estimation

# expect = disA * disB / number of distinct Hispanic patients
results['expect'] = (
    results['disA'].astype(float)
    .mul(results['disB'].astype(float))
    .div(len(icd_hispanics['patient'].unique()))
)
# score = log2((AB + 1) / (expect + 1))
results['score'] = np.log2(
    results['AB'].astype(float).add(1).div(results['expect'].add(1))
)
# order the pairs by their Fisher p-value (ascending)
results = results.sort_values('fisher')

def p_adjust(p_values, method='bonferroni', n=1):
    """Adjust p-values for multiple testing.

    Parameters
    ----------
    p_values : scalar, array or Series of p-values.
    method : correction to apply; only 'bonferroni' is implemented.
    n : number of tests the p-values are multiplied by.

    Returns
    -------
    For 'bonferroni', ``p_values * n`` capped at 1 (element-wise).
    For any other method the input is returned unchanged.
    """
    if method != 'bonferroni':
        return p_values
    return np.minimum(p_values * n, 1)

# Bonferroni correction: multiply each Fisher p-value by the number of pairs tested.
results['correctedPvalue'] = p_adjust(
    results['fisher'].astype(float),
    method='bonferroni',
    n=len(results),
)

pd.options.display.width = 100

print(results.head(n=5))

In [None]:
# Write the Hispanic comorbidity table to CSV (row labels and header are included by default).
results.to_csv('comorbidityNetworkTable_Hispanic.csv')

In [None]:
!pip install python-igraph

In [None]:
!pip install cairocffi

In [None]:
import igraph
#comorbidity network analysis visualization

# One undirected edge per disease pair listed in `results`.
edges = pd.DataFrame({'disAcode': results['disAcode'], 'disBcode': results['disBcode']})

netw = igraph.Graph.TupleList(edges.itertuples(index=False), directed=False)
netw = netw.simplify()
lay = netw.layout_circle()

# Stack the (code, patient count) columns of both pair members into one table
# and keep the first occurrence of every code.
disPrev1 = results[["disAcode", "disA"]].rename(columns={"disAcode": "dis", "disA": "patients"})
disPrev2 = results[["disBcode", "disB"]].rename(columns={"disBcode": "dis", "disB": "patients"})

disPrev = pd.concat([disPrev1, disPrev2]).drop_duplicates(subset="dis")

# prevalence = share of Hispanic patients carrying the code, in percent
disPrev = disPrev.assign(
    prevalence=(disPrev["patients"].astype(float) / len(icd_hispanics["patient"].unique())) * 100
)

disPrev

In [None]:
# Build the comorbidity graph: one undirected edge per disease pair in `results`.
edges = pd.DataFrame({'disAcode': results['disAcode'], 'disBcode': results['disBcode']})
netw = igraph.Graph.TupleList(edges.itertuples(index=False), directed=False)
netw = netw.simplify()

# Circular layout (one coordinate pair per vertex, in vertex order)
lay = netw.layout_circle()

# Prevalence table: one row per code with its patient count and prevalence in percent.
disPrev1 = results[['disAcode', 'disA']].rename(columns={'disAcode': 'dis', 'disA': 'patients'})
disPrev2 = results[['disBcode', 'disB']].rename(columns={'disBcode': 'dis', 'disB': 'patients'})
disPrev = pd.concat([disPrev1, disPrev2]).drop_duplicates(subset='dis')
disPrev = disPrev.assign(
    prevalence=(disPrev['patients'].astype(float) / len(icd_hispanics['patient'].unique())) * 100
)

# TupleList already stores each code in the vertex attribute "name". The vertex
# order differs from the row order of disPrev (results is sorted by p-value), so
# sizes are aligned to the vertices by code instead of overwriting the names.
vertex_names = netw.vs['name']
sizes = (
    disPrev.set_index('dis')['prevalence']
    .astype(float)
    .reindex(vertex_names)
    .to_numpy()
)

# Relative risk keyed by the unordered code pair. A graph edge id is not a row
# label of `results`, so edge widths must be looked up through the endpoint codes.
risk_by_pair = {
    frozenset((code_a, code_b)): float(risk)
    for code_a, code_b, risk in zip(results['disAcode'], results['disBcode'], results['relativeRisk'])
}

# Printing information about the network
print("The network contains", netw.vcount(), "nodes and", netw.ecount(), "edges.")

# Plotting the network
plt.figure(figsize=(8, 8))
plt.axis('off')
plt.title('Comorbidity Network: Hispanics')

# Convert lay to a NumPy array of shape (number of vertices, 2)
lay = np.array([list(coord) for coord in lay])

# Nodes: marker area proportional to prevalence
plt.scatter(lay[:, 0], lay[:, 1], c='lightblue', edgecolors='blue', alpha=0.7, s=sizes * 25)

# Node labels
for i, name in enumerate(vertex_names):
    plt.text(lay[i, 0], lay[i, 1], name, color='black', fontsize=10, ha='center', va='center')

# Edges: line width equals the relative risk of the pair
for edge in netw.es:
    start = edge.source
    end = edge.target
    width = risk_by_pair.get(frozenset((vertex_names[start], vertex_names[end])), np.nan)
    # Pairs without a usable relative risk (NaN, inf or 0) have no drawable width.
    if not np.isfinite(width) or width <= 0:
        continue
    plt.plot([lay[start, 0], lay[end, 0]], [lay[start, 1], lay[end, 1]], color='darkgrey', linewidth=width)

plt.show()

## Non-Hispanics network analysis

In [None]:
# Keep only the rows whose Hispanic flag is 'N' and preview the first ten of them.
icd_non_hispanics = icd.loc[icd['Hispanic'] == 'N']
print(icd_non_hispanics.head(n=10))

In [None]:
# Prepare an empty table with one column per statistic stored for each disease pair.
column_names = [
    "disAcode", "disBcode",
    "disA", "disB",
    "AB", "AnotB", "BnotA", "notAnotB",
    "fisher", "oddsRatio", "95%confidenceInterval", "relativeRisk",
]
results2 = pd.DataFrame(columns=column_names)

print(results2.head(n=5))

In [None]:
## create a data frame with pairs of diseases and their co-occurrence counts
# columns: "disAcode", "disBcode", "disA", "disB", "AB", "AnotB", "BnotA",
#          "notAnotB", "fisher", "oddsRatio", "95%confidenceInterval", "relativeRisk"

non_hispanic_result_columns = ["disAcode", "disBcode", "disA", "disB", "AB",
                               "AnotB", "BnotA", "notAnotB", "fisher", "oddsRatio",
                               "95%confidenceInterval", "relativeRisk"]

# Number of distinct patients in the non-Hispanic subset; identical for every
# pair, so it is computed once instead of on every iteration.
n_non_hispanic_patients = len(icd_non_hispanics['patient'].unique())

# Rows are collected in a list and turned into a frame once at the end, which
# avoids growing the data frame one `.loc[i]` assignment at a time.
non_hispanic_rows = []

for i in range(fCodesPairs.shape[0]):
    code1 = fCodesPairs.iloc[i, 0]
    code2 = fCodesPairs.iloc[i, 1]

    dis1 = icd_non_hispanics[icd_non_hispanics['ICD'] == code1]
    dis2 = icd_non_hispanics[icd_non_hispanics['ICD'] == code2]

    # records of code2 that belong to patients who also have code1
    dis12 = dis2[dis2['patient'].isin(dis1['patient'])]

    disA = len(dis1['patient'].unique())
    disB = len(dis2['patient'].unique())
    AB = len(dis12['patient'].unique())
    AnotB = disA - AB
    BnotA = disB - AB
    notAB = n_non_hispanic_patients - AB - AnotB - BnotA

    # 2x2 contingency table: rows = A present/absent, columns = B present/absent
    mm = [[AB, AnotB], [BnotA, notAB]]

    # Reset per pair: if a test fails, the row must carry NaN rather than the
    # values left over from the previous pair (or a NameError on the first pair).
    p_value = ci_lower = ci_upper = np.nan
    try:
        _, p_value = fisher_exact(mm)
        ci_lower, ci_upper = ct.Table2x2(mm).oddsratio_confint()
    except Exception as e:
        print(str(e))
        print("code1:", code1, "- code2:", code2)

    conf_interval = f"({round(ci_lower, 3)}, {round(ci_upper, 3)})"

    # NumPy scalars yield inf / nan on a zero denominator instead of raising
    # ZeroDivisionError (a code absent from this subset, or an empty AnotB/BnotA cell).
    with np.errstate(divide='ignore', invalid='ignore'):
        relativeRisk = float((np.float64(AB) * n_non_hispanic_patients)
                             / (np.float64(disA) * np.float64(disB)))
        oddsRatio = float((np.float64(AB) * np.float64(notAB))
                          / (np.float64(AnotB) * np.float64(BnotA)))

    non_hispanic_rows.append([code1, code2, disA, disB, AB, AnotB, BnotA,
                              notAB, p_value, oddsRatio, conf_interval, relativeRisk])

# Row labels are 0..n-1, matching the pair order in fCodesPairs.
results2 = pd.DataFrame(non_hispanic_rows, columns=non_hispanic_result_columns)


In [None]:
# Preview the first disease pairs and their statistics.
print(results2.head(n=5))

In [None]:
# correct for multiple testing and add the comorbidity score estimation

# expect = disA * disB / number of distinct non-Hispanic patients.
# The counts in results2 come from icd_non_hispanics, so the denominator must be
# the non-Hispanic patient total (it previously used the Hispanic total).
results2['expect'] = (results2['disA'].astype(float) * results2['disB'].astype(float)) / len(icd_non_hispanics['patient'].unique())
# score = log2((AB + 1) / (expect + 1))
results2['score'] = np.log2((results2['AB'].astype(float) + 1) / (results2['expect'] + 1))
# order the pairs by their Fisher p-value (ascending)
results2 = results2.sort_values(by='fisher')

def p_adjust(p_values, method='bonferroni', n=1):
    """Adjust p-values for multiple testing.

    With ``method='bonferroni'`` every p-value is multiplied by ``n`` and capped
    at 1; with any other method the input is handed back unchanged.
    """
    corrected = p_values
    if method == 'bonferroni':
        corrected = np.minimum(p_values * n, 1)
    return corrected

# Bonferroni correction: multiply each Fisher p-value by the number of pairs tested.
results2['correctedPvalue'] = p_adjust(
    results2['fisher'].astype(float),
    method='bonferroni',
    n=len(results2),
)

pd.options.display.width = 100

print(results2.head(n=5))


In [None]:
# Write the non-Hispanic comorbidity table to CSV (row labels and header are included by default).
results2.to_csv('comorbidityNetworkTable_NonHispanic.csv')

In [None]:
#comorbidity network analysis visualization

# One undirected edge per disease pair listed in `results2`.
edges = pd.DataFrame({'disAcode': results2['disAcode'], 'disBcode': results2['disBcode']})
netw = igraph.Graph.TupleList(edges.itertuples(index=False), directed=False)
netw = netw.simplify()
lay = netw.layout_circle()

# Stack the (code, patient count) columns of both pair members into one table
# and keep the first occurrence of every code.
disPrev1 = results2[["disAcode", "disA"]].rename(columns={"disAcode": "dis", "disA": "patients"})
disPrev2 = results2[["disBcode", "disB"]].rename(columns={"disBcode": "dis", "disB": "patients"})

disPrev = pd.concat([disPrev1, disPrev2]).drop_duplicates(subset="dis")

# prevalence = share of non-Hispanic patients carrying the code, in percent
disPrev = disPrev.assign(
    prevalence=(disPrev["patients"].astype(float) / len(icd_non_hispanics["patient"].unique())) * 100
)

disPrev

In [None]:
# Build the comorbidity graph from the NON-Hispanic table (`results2`).
# The edges were previously taken from `results`, i.e. the Hispanic table.
edges = pd.DataFrame({'disAcode': results2['disAcode'], 'disBcode': results2['disBcode']})
netw = igraph.Graph.TupleList(edges.itertuples(index=False), directed=False)
netw = netw.simplify()

# Circular layout (one coordinate pair per vertex, in vertex order)
lay = netw.layout_circle()

# Prevalence table: one row per code with its patient count and prevalence in percent.
disPrev1 = results2[['disAcode', 'disA']].rename(columns={'disAcode': 'dis', 'disA': 'patients'})
disPrev2 = results2[['disBcode', 'disB']].rename(columns={'disBcode': 'dis', 'disB': 'patients'})
disPrev = pd.concat([disPrev1, disPrev2]).drop_duplicates(subset='dis')
disPrev = disPrev.assign(
    prevalence=(disPrev['patients'].astype(float) / len(icd_non_hispanics['patient'].unique())) * 100
)

# TupleList already stores each code in the vertex attribute "name". The vertex
# order differs from the row order of disPrev (results2 is sorted by p-value), so
# sizes are aligned to the vertices by code instead of overwriting the names.
vertex_names = netw.vs['name']
sizes2 = (
    disPrev.set_index('dis')['prevalence']
    .astype(float)
    .reindex(vertex_names)
    .to_numpy()
)

# Relative risk keyed by the unordered code pair. A graph edge id is not a row
# label of `results2`, so edge widths must be looked up through the endpoint codes.
risk_by_pair = {
    frozenset((code_a, code_b)): float(risk)
    for code_a, code_b, risk in zip(results2['disAcode'], results2['disBcode'], results2['relativeRisk'])
}

# Printing information about the network
print("The network contains", netw.vcount(), "nodes and", netw.ecount(), "edges.")

# Plotting the network
plt.figure(figsize=(8, 8))
plt.axis('off')
plt.title('Comorbidity Network: Non Hispanics')

# Convert lay to a NumPy array of shape (number of vertices, 2)
lay = np.array([list(coord) for coord in lay])

# Nodes: marker area proportional to prevalence
plt.scatter(lay[:, 0], lay[:, 1], c='lightblue', edgecolors='blue', alpha=0.7, s=sizes2 * 25)

# Node labels
for i, name in enumerate(vertex_names):
    plt.text(lay[i, 0], lay[i, 1], name, color='black', fontsize=10, ha='center', va='center')

# Edges: line width equals the relative risk of the pair
for edge in netw.es:
    start = edge.source
    end = edge.target
    width = risk_by_pair.get(frozenset((vertex_names[start], vertex_names[end])), np.nan)
    # Pairs without a usable relative risk (NaN, inf or 0) have no drawable width.
    if not np.isfinite(width) or width <= 0:
        continue
    plt.plot([lay[start, 0], lay[end, 0]], [lay[start, 1], lay[end, 1]], color='darkgrey', linewidth=width)

plt.show()