In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### HIV

In [None]:
def _load_significant_dmps(xlsx_path):
    """Read one supplementary DMP spreadsheet and keep only the significant CpGs.

    Rows whose 'Adjusted_p-value' is above 0.05 are discarded, then the
    statistics/annotation columns that are not used afterwards are removed.
    """
    dmp_table = pd.read_excel(xlsx_path)
    is_significant = dmp_table['Adjusted_p-value'] <= 0.05
    unused_columns = ['p-value', 'Adjusted_p-value', 'Relation to Island',
                      'Gen_Name', 'Gene_Region', 'Chromosome']
    return dmp_table.loc[is_significant].drop(columns=unused_columns)


# Group 1: HIV+ pre-ART / HIV-negative
df_group1 = _load_significant_dmps('Data/Raw/Suppl Data 1_Diff_methyl_CpG HIV+ pre-ART and HI.xlsx')

# Group 2: HIV+ pre-ART / HIV+ post-ART
df_group2 = _load_significant_dmps('Data/Raw/Suppl Data 2_Diff_methyl_CpG HIV+ pre-ART vs HIV+ post-ART.xlsx')

In [None]:
## Probe annotation downloaded from GEO Accession Platform GPL21145 (11MAR23):
## genomic locations of the CpGs on the Illumina BeadChip platform used for analysis.
df_CpG_pos_orig = pd.read_csv(
    'Data/Raw/CpGPositions_GPL21145-48548.txt',
    sep='\t',
    low_memory=False,
)

# the only annotation columns carried forward
headers = ['ID', 'Genome_Build', 'CHR', 'MAPINFO']

# keep just the probes named like the experimental data (IDs beginning with "cg")
is_cg_probe = df_CpG_pos_orig['ID'].str.startswith('cg')

# Work on a derived frame (the raw table stays untouched), indexed by probe ID
# with an unnamed index and no separate 'ID' column.
df_CpG_pos = (
    df_CpG_pos_orig.loc[is_cg_probe, headers]
    .set_index('ID')
    .rename_axis(index=None)
)

# persist the filtered annotation table
df_CpG_pos.to_csv('Data/CpGPositions_FILTERED.csv')

In [None]:
### WRITE THE LIFTOVER INPUT -- THEN GO DO LIFTOVER TO GET THE DF_CPG_POS INTO GENOME BUILD 38 ###

# column order of the table handed to the liftover step
cols = ['CHR_lift', 'start', 'end', 'CpG_ID']

# NOTE(review): start and end are both set to MAPINFO, i.e. zero-length
# intervals -- confirm this is what the liftover tool expects.
df_CpG_pos = df_CpG_pos.assign(
    CHR_lift=lambda loci: 'chr' + loci['CHR'],        # chromosome with a "chr" prefix
    start=lambda loci: loci['MAPINFO'].astype(int),
    end=lambda loci: loci['MAPINFO'].astype(int),
    CpG_ID=lambda loci: loci.index,                   # probe ID taken from the index
)

# tab-separated, no index column
df_CpG_pos_for_LO = df_CpG_pos[cols]
df_CpG_pos_for_LO.to_csv('Data/CpGPositions_FILTERED_GHCR37.csv', sep='\t', index=False)

In [None]:
### IMPORT LIFTED CPG POSITIONS ###

# The lifted file is read as a headerless, tab-separated table whose fourth
# field is the CpG ID. All four fields are named explicitly and the index is
# selected by name, rather than passing three names plus index_col=3 and
# relying on pandas' implicit-index handling of the surplus field.
# rename_axis keeps the index unnamed.
df_CpG_pos_lifted = pd.read_csv(
    'Data/CpGPositions_FILTERED_hg38_LIFTED.txt',
    sep='\t',
    header=None,
    names=['CHR', 'start', 'end', 'CpG_ID'],
    index_col='CpG_ID',
).rename_axis(index=None)

# every position in this table is tagged as genome build 38
df_CpG_pos_lifted['Genome_Build'] = 38

# drop the first three characters (the "chr" prefix added before liftover)
df_CpG_pos_lifted['CHR'] = df_CpG_pos_lifted['CHR'].astype(str).str[3:]

In [None]:
# CpG ID -> value lookups (genome build, chromosome, position) from the lifted table
dict_CpG_pos_GB, dict_CpG_pos_CHR, dict_CpG_pos_MAPINFO = (
    df_CpG_pos_lifted[column].to_dict()
    for column in ('Genome_Build', 'CHR', 'start')
)

In [None]:
# align genomic loci from illumina beadchip assay with the DMPs provided in the paper

def _add_lifted_loci(dmp_table):
    """Return ``dmp_table`` with 'GB', 'CHR' and 'MAPINFO' columns appended.

    The CpG ID is read from the first column (as a string) and looked up in
    the dict_CpG_pos_GB / dict_CpG_pos_CHR / dict_CpG_pos_MAPINFO dictionaries.

    CpGs with no entry in those dictionaries are dropped and the number dropped
    is printed, matching how the HPV and SARS2 tables treat unmapped CpGs;
    a plain dictionary lookup would abort the cell with a KeyError instead.
    The input frame is not modified and its row labels are preserved.
    """
    cpg_ids = dmp_table.iloc[:, 0].astype(str)

    # vectorised lookup; IDs missing from a dictionary come back as NaN
    annotated = dmp_table.assign(
        GB=cpg_ids.map(dict_CpG_pos_GB),
        CHR=cpg_ids.map(dict_CpG_pos_CHR),
        MAPINFO=cpg_ids.map(dict_CpG_pos_MAPINFO),
    )

    has_locus = annotated[['GB', 'CHR', 'MAPINFO']].notna().all(axis=1)
    n_missing = int((~has_locus).sum())
    if n_missing:
        print(f'{n_missing} CpG(s) without a lifted position were dropped')

    # NaN turns the integer columns into floats; cast back so the CSV holds
    # 38 and whole-number positions rather than 38.0 etc.
    return annotated.loc[has_locus].astype({'GB': int, 'CHR': str, 'MAPINFO': int})


df_group1 = _add_lifted_loci(df_group1)
df_group1.to_csv('Data/Group1_pos_LIFTED.csv') # save group1 data as csv file

df_group2 = _add_lifted_loci(df_group2)
df_group2.to_csv('Data/Group2_pos_LIFTED.csv') # save group2 data as csv file

#### HPV

In [None]:
# HPV methylation data (tab-separated series matrix), one column per sample:
#   GSM593021 : SCC4   HPV(-)
#   GSM593022 : SCC74A HPV(-)
#   GSM593023 : SCC47  HPV(+)
#   GSM593024 : CaSki  HPV(+)
df_HPV_full = pd.read_csv(
    'Data/Raw/GSE24087_series_matrix_manipulated.txt',
    sep='\t',
)

In [None]:
# Compare one HPV(-) sample (GSM593021) with one HPV(+) sample (GSM593023):
# the other two samples are discarded, columns get descriptive names, and only
# CpGs whose |delta| is larger than 5% of the HPV(-) value are kept.
df_HPV = (
    df_HPV_full
    .drop(columns=['GSM593022', 'GSM593024'])
    .rename(columns={'ID_REF': 'CpG_ID', 'GSM593021': 'HPV_neg', 'GSM593023': 'HPV_pos'})
    .assign(delta=lambda hpv: hpv['HPV_pos'] - hpv['HPV_neg'])
    .loc[lambda hpv: hpv['delta'].abs() > (0.05 * hpv['HPV_neg'])]
    .reset_index(drop=True)  # renumber rows 0..n-1 after filtering
)

In [None]:
# Attach the lifted genome build / chromosome / position to each HPV CpG.
#
# The lookup is a vectorised Series.map on the CpG_ID column; IDs absent from
# the lifted-position dictionaries come back as NaN and are dropped directly.
# This avoids a '' placeholder followed by `df[col].replace(..., inplace=True)`,
# a chained inplace call that does not modify the frame under pandas
# Copy-on-Write. It also avoids addressing the new columns by hard-coded
# position.
hpv_cpg_ids = df_HPV['CpG_ID'].astype(str)

df_HPV = (
    df_HPV
    .assign(
        GB=hpv_cpg_ids.map(dict_CpG_pos_GB),
        CHR=hpv_cpg_ids.map(dict_CpG_pos_CHR),
        MAPINFO=hpv_cpg_ids.map(dict_CpG_pos_MAPINFO),
    )
    .dropna(subset=['GB', 'CHR', 'MAPINFO'])   # CpGs without a lifted position
    .reset_index(drop=True)
    # NaN makes the integer columns float; cast back so the CSV holds whole numbers
    .astype({'GB': int, 'CHR': str, 'MAPINFO': int})
)

df_HPV.to_csv('Data/HPV_loci.csv') # save HPV data as csv file

#### SARS2

In [None]:
# SARS2 methylation table; only the probe ID, the two group means and their
# difference are carried forward, with the probe ID column renamed to CpG_ID.
df_SARS2_full = pd.read_csv('Data/Raw/43856_2021_42_MOESM2_ESM_edited.csv')

col_of_interest = ['probeID', 'meanbeta_covid', 'meanbeta_control', 'deltabeta']
df_SARS2 = df_SARS2_full[col_of_interest].rename(columns={'probeID': 'CpG_ID'})

In [None]:
# keep CpGs whose |deltabeta| is larger than 5% of the control mean, then renumber rows
df_SARS2 = (
    df_SARS2
    .loc[lambda sars: sars['deltabeta'].abs() > (0.05 * sars['meanbeta_control'])]
    .reset_index(drop=True)
)

In [None]:
# Attach the lifted genome build / chromosome / position to each SARS2 CpG.
#
# The lookup is a vectorised Series.map on the CpG_ID column; IDs absent from
# the lifted-position dictionaries come back as NaN and are dropped directly.
# This avoids a '' placeholder followed by `df[col].replace(..., inplace=True)`,
# a chained inplace call that does not modify the frame under pandas
# Copy-on-Write. It also avoids addressing the new columns by hard-coded
# position.
sars2_cpg_ids = df_SARS2['CpG_ID'].astype(str)

df_SARS2 = (
    df_SARS2
    .assign(
        GB=sars2_cpg_ids.map(dict_CpG_pos_GB),
        CHR=sars2_cpg_ids.map(dict_CpG_pos_CHR),
        MAPINFO=sars2_cpg_ids.map(dict_CpG_pos_MAPINFO),
    )
    .dropna(subset=['GB', 'CHR', 'MAPINFO'])   # CpGs without a lifted position
    .reset_index(drop=True)
    # NaN makes the integer columns float; cast back so the CSV holds whole numbers
    .astype({'GB': int, 'CHR': str, 'MAPINFO': int})
)

df_SARS2.to_csv('Data/SARS2_loci.csv') # save SARS2 data as csv file