In [None]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pylab
import scipy.stats as sp

#### Import Data

In [None]:
# read in all regulatory landmark (RL) positions
# Every table is read with its first CSV column as the index, rows containing NaN are
# dropped, and an 'RL' column records which landmark type each row belongs to.

# DNase I hypersensitive sites
df_DHS = pd.read_csv('Data/dhs_ENCFF503GCK_GRCh38_clean_2.txt', index_col=0) # read in DHS positions as df
df_DHS.dropna(inplace=True)
df_DHS['RL'] = 'DHS' # label data

# chromHMM states: the 'final_state' column already carries the landmark label
# (later cells select 'enhancer' and 'promoter' rows from it), so it is renamed to 'RL'
df_chromhmm = pd.read_csv('Data/chromhmm_epilogos15statehg38_clean.txt', index_col=0) # read in chromhmm data as df
df_chromhmm.dropna(inplace=True)
df_chromhmm.rename(columns={'final_state':'RL'}, inplace=True)
df_chromhmm.drop(columns=['max_val','state'], inplace=True) # drop unnecessary columns

# CTCF binding sites
df_ctcf = pd.read_csv('Data/ctcf_loci_GRCh38_clean_2.txt',index_col=0) # read in ctcf positions as df
df_ctcf.dropna(inplace=True)
df_ctcf['RL'] = 'ctcf' # label data
chroms = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22']
# .copy() detaches the filtered rows from the original frame, so the dtype cast below is a
# plain column assignment instead of a write into a slice (avoids SettingWithCopyWarning)
df_ctcf = df_ctcf[df_ctcf['chr'].isin(chroms)].copy() # select for autosomal chromosomes
# integer chromosome ids are what the analysis functions compare df_RL['chr'] against
df_ctcf['chr'] = df_ctcf['chr'].astype(int)

# insulation-filtered ("strong") CTCF binding sites
df_ctcf_strong = pd.read_csv('Data/ctcf__STRONG_loci_GRCh38_clean_2.csv',index_col=0) # read in STRONG ctcf pos as df (insulation filtered)
df_ctcf_strong.dropna(inplace=True)
df_ctcf_strong['RL'] = 'ctcf_strong' # label data

df_RL = pd.concat([df_DHS, df_chromhmm, df_ctcf, df_ctcf_strong], axis=0) # concatenate all RLs into a single dataframe

In [None]:
# read in all CpG positions
# The file is tab-separated with no header row; the three named columns are CHR/start/end.
# NOTE(review): index_col=3 points at a fourth, unnamed column -- presumably the CpG ID; confirm
# against the file, since only three names are supplied.

df_CpG_all = pd.read_csv('Data/CpGPositions_FILTERED_hg38_LIFTED.txt', sep='\t', names = ['CHR','start','end'], index_col=3, header=None)
df_CpG_all['Genome_Build'] = 38
# drop the first three characters of every chromosome label (presumably a 'chr' prefix -- TODO confirm)
df_CpG_all['CHR'] = df_CpG_all['CHR'].map(lambda x: x[3:])
# keep CHR as strings: the analysis functions select control CpGs with CHR == str(chr_id)
df_CpG_all['CHR'] = df_CpG_all['CHR'].astype(str)
df_CpG_all.drop(['end'], axis=1, inplace=True)
# 'start' becomes 'MAPINFO', the position column name the analysis functions read
df_CpG_all.rename({'start':'MAPINFO'}, axis=1, inplace=True)

# load the experimental CpG position tables from CSV (group 1 = infection, group 2 = treatment)
df_group1 = pd.read_csv('Data/Group1_pos_LIFTED.csv', index_col=0)
df_group2 = pd.read_csv('Data/Group2_pos_LIFTED.csv', index_col=0)

#### Function Definition

In [None]:
# define number of simulations for random sampling
# (passed as `num_sims` to RL_analysis: the number of random CpG draws per chromosome
# that are averaged into the randomized control)
num_sims = 10

In [None]:
### FUNC: DISTANCE_TO_CLOSEST
## from a CpG position, calculate distance to closest RL of a particular type
# inputs: CpG loci, RL loci

def distance_to_closest(cpg_position, RL_position):
    """Return, for every CpG position, the absolute distance to the nearest landmark.

    Parameters
    ----------
    cpg_position : 1-D array-like of numbers
        Query positions.
    RL_position : array-like of numbers
        Landmark positions on the same coordinate axis; need not be sorted.
        Assumed to be NaN-free (the loading cells drop NaN rows).

    Returns
    -------
    numpy.ndarray
        One distance per entry of ``cpg_position`` (empty if ``cpg_position`` is empty).

    Raises
    ------
    ValueError
        If ``RL_position`` is empty, because no nearest landmark exists.
    """
    cpg = np.asarray(cpg_position)
    landmarks = np.sort(np.asarray(RL_position).ravel())
    if landmarks.size == 0:
        raise ValueError("RL_position is empty: cannot compute the distance to the closest landmark")

    # In a sorted landmark array the nearest landmark is one of the two neighbours of the
    # insertion point, so only n + m values are held in memory rather than the n x m
    # matrix of all pairwise differences.
    insert_at = np.searchsorted(landmarks, cpg)
    last = landmarks.size - 1
    left_neighbour = landmarks[np.clip(insert_at - 1, 0, last)]
    right_neighbour = landmarks[np.clip(insert_at, 0, last)]
    return np.minimum(np.abs(cpg - left_neighbour), np.abs(cpg - right_neighbour))

In [None]:
### FUNC: OVERLAP_WITH_CLOSEST
## calculate the count of how many CpGs fall within their nearest RL
# inputs: CpG loci, loci defining boundaries of RLs

def overlap_with_closest(cpg_position, df_RL_chr):
    """Count the CpGs that lie strictly inside the landmark whose midpoint is nearest.

    Parameters
    ----------
    cpg_position : 1-D array-like of numbers
        CpG positions.
    df_RL_chr : pandas.DataFrame
        Landmarks with columns 'start', 'end' and 'mid'. Rows are addressed by
        position, so the frame may carry any index (it need not be reset to 0..n-1).

    Returns
    -------
    numpy integer
        Number of CpGs with ``start < position < end`` for their nearest landmark.
        Both bounds are exclusive: a CpG sitting exactly on a boundary is not counted.

    Raises
    ------
    ValueError
        From numpy's ``argmin`` when ``df_RL_chr`` has no rows.
    """
    cpg = np.asarray(cpg_position)
    mids = df_RL_chr['mid'].to_numpy()
    starts = df_RL_chr['start'].to_numpy()
    ends = df_RL_chr['end'].to_numpy()

    # nearest landmark (by midpoint) for each CpG; ties resolve to the first row, as argmin does
    nearest = np.abs(np.subtract.outer(cpg, mids)).argmin(axis=1)

    # positional indexing keeps start/end paired with the landmark chosen above
    # regardless of the DataFrame's index labels
    inside = (cpg > starts[nearest]) & (cpg < ends[nearest])
    return inside.sum()

In [None]:
### FUNC: RL_ANALYSIS
## for a set of CpG sites and a set of RL positions, perform positional enrichment analysis and compare to random expectation
# inputs: experimental CpG loci, RL loci, all CpG loci along genome, number of simulations for random expectation calculations

def RL_analysis(df_cpg_exp, df_RL, df_cpg_all, num_sims, random_state=None):
    """Per-chromosome distance-to-nearest-landmark analysis against a randomized control.

    Parameters
    ----------
    df_cpg_exp : pandas.DataFrame
        Experimental CpGs with columns 'CHR' (compared against integer chromosome
        ids 1..22) and 'MAPINFO' (position).
    df_RL : pandas.DataFrame
        Landmarks with columns 'chr' (compared against integer ids), 'mid',
        'start' and 'end'.
    df_cpg_all : pandas.DataFrame
        Background CpGs with columns 'CHR' (compared against the *string* form of
        the chromosome id) and 'MAPINFO'.
    num_sims : int
        Number of random draws per chromosome.
    random_state : int, numpy.random.Generator or None, optional
        Seed (or generator) for the random draws. ``None`` (default) keeps the
        previous behaviour of sampling from numpy's global random state; any other
        value makes the whole run repeatable.

    Returns
    -------
    dict
        ``{chr_id: {'random_ctrl', 'experimental_dist', 'overlap'}}`` for chr_id 1..22:
        'random_ctrl' holds, per experimental CpG slot, the distance averaged over the
        ``num_sims`` draws; 'experimental_dist' the distance of each experimental CpG
        to its nearest landmark midpoint; 'overlap' the count returned by
        ``overlap_with_closest``.

    Notes
    -----
    Each draw samples as many background CpGs as there are experimental CpGs on the
    chromosome, without replacement, so pandas raises if the background has fewer rows.
    NOTE(review): 'random_ctrl' is a mean over simulations, not the pooled draws, so it
    is presumably less dispersed than a single random draw -- confirm this is intended.
    """
    # A single generator is shared by every draw so that successive simulations differ
    # while the run as a whole is reproducible for a given seed.
    rng = None if random_state is None else np.random.default_rng(random_state)

    # create storage location for analysis
    results = {}

    for chr_id in range(1,23): # cycle through autosomal chromosomes

        # get CpG pos for this chromosome -- experimental group
        df_cpgpos_chr = df_cpg_exp[df_cpg_exp['CHR'] == chr_id].reset_index(drop=True)
        num_exp_cpg = df_cpgpos_chr.shape[0]
        exp_positions = np.asarray(df_cpgpos_chr['MAPINFO'])

        # get CpG pos for this chromosome -- control group (CHR stored as strings)
        df_cpgpos_chr_ctrl = df_cpg_all[df_cpg_all['CHR'] == str(chr_id)].reset_index(drop=True)

        # find RL positions on this chromosome
        df_RL_chr = df_RL[df_RL['chr'] == chr_id].reset_index(drop=True)
        rl_mids = np.asarray(df_RL_chr['mid'])

        # random simulation control: one column per simulation, averaged across simulations
        store_random = np.zeros([num_exp_cpg, num_sims])
        for j in range(num_sims):
            random_positions = df_cpgpos_chr_ctrl['MAPINFO'].sample(
                n=num_exp_cpg, axis=0, replace=False, random_state=rng
            ).to_numpy()
            store_random[:,j] = distance_to_closest(random_positions, rl_mids)
        store_random = store_random.mean(axis=1)

        # calculate distances to closest regulatory landmark for the experimental CpGs
        dist_to_closest = distance_to_closest(exp_positions, rl_mids)

        # count how many times CpGs overlap with nearest RL
        overlap_count = overlap_with_closest(exp_positions, df_RL_chr)

        results[chr_id] = {'random_ctrl': store_random, 'experimental_dist': dist_to_closest, 'overlap': overlap_count}

    return results
    

In [None]:
### FUNC: DECODE
## decode results from RL_analysis or similar; print overlap with nearest RL count, and p-values from Mann-Whitney U and Welch's t tests
# input: dictionary of distances as calculated by RL_analysis

def decode(distance_dict):
    """Flatten per-chromosome results into one long table and print summary statistics.

    Parameters
    ----------
    distance_dict : dict
        Mapping whose values are dicts with keys 'experimental_dist' and
        'random_ctrl' (arrays of distances in bp) and 'overlap' (a count).

    Returns
    -------
    pandas.DataFrame
        Columns 'distance_(kb)' and 'group'. Experimental rows come first
        ('Experimental Group'), followed by the control rows ('Randomized Control'),
        each in the iteration order of ``distance_dict``. Distances are in kb with
        1e-3 kb added, so zero distances stay finite on the log-scaled plots that
        consume this table.

    Side effects
    ------------
    Prints the summed overlap count and the two-sided p-values of a Mann-Whitney U
    test and of Welch's t-test, both computed on the kb distances before the offset
    is added.
    """
    data = []
    rand_ctrl = []
    overlap_count = []

    for curr_dict in distance_dict.values():
        data.append(curr_dict['experimental_dist'])
        rand_ctrl.append(curr_dict['random_ctrl'])
        overlap_count.append(curr_dict['overlap'])

    data = np.concatenate(data)/1000 # convert to kb
    rand_ctrl = np.concatenate(rand_ctrl)/1000 # convert to kb

    df = pd.DataFrame({
        'distance_(kb)': np.concatenate([data,rand_ctrl]) + 1e-3,
        'group': len(data)*['Experimental Group'] + len(rand_ctrl)*['Randomized Control'],
    })

    print('Number of CpGs that overlap with nearest regulatory landmark:', sum(overlap_count))

    _, p = sp.mannwhitneyu(x = data, y = rand_ctrl, alternative='two-sided')
    print('P-value from two sided mannwhitneyu test: {}'.format(p))

    # equal_var=False selects Welch's t-test; scipy's default (equal_var=True) is
    # Student's t-test, which assumes both groups share the same variance
    _, p = sp.ttest_ind(data, rand_ctrl, equal_var=False, alternative='two-sided')
    print('P-value from two sided welches t-test: {}'.format(p))

    return df

In [None]:
### FUNC: RL_ANALYSIS_COMPARATIVE
## perform RL analysis between two CpG distributions, as opposed to RL_ANALYSIS which compares to random expectation
# inputs: CpG loci experiment set 1, CpG loci experiment set 2, RL loci

def RL_analysis_comparative(df_cpg_exp1, df_cpg_exp2, df_RL):
    """Per-chromosome distance-to-nearest-landmark analysis for two CpG sets.

    Both CpG frames need 'CHR' (compared against integer chromosome ids 1..22) and
    'MAPINFO'; ``df_RL`` needs 'chr' (integer ids), 'mid', 'start' and 'end'.

    Returns a dict keyed by chromosome id whose values hold, for each set, the
    distances to the nearest landmark midpoint ('experimental_dist_1',
    'experimental_dist_2') and the count of CpGs inside their nearest landmark
    ('overlap_1', 'overlap_2').
    """
    per_chromosome = {}

    for chromosome in range(1, 23):  # autosomes only
        # landmarks on this chromosome, re-indexed from 0 for overlap_with_closest
        landmarks = df_RL[df_RL['chr'] == chromosome].reset_index(drop=True)
        landmark_mids = np.asarray(landmarks['mid'])

        # positions of each experimental set on this chromosome
        on_chromosome_1 = df_cpg_exp1['CHR'] == chromosome
        on_chromosome_2 = df_cpg_exp2['CHR'] == chromosome
        positions_1 = np.asarray(df_cpg_exp1[on_chromosome_1]['MAPINFO'])
        positions_2 = np.asarray(df_cpg_exp2[on_chromosome_2]['MAPINFO'])

        per_chromosome[chromosome] = {
            'experimental_dist_1': distance_to_closest(positions_1, landmark_mids),
            'experimental_dist_2': distance_to_closest(positions_2, landmark_mids),
            'overlap_1': 0 + overlap_with_closest(positions_1, landmarks),
            'overlap_2': 0 + overlap_with_closest(positions_2, landmarks),
        }

    return per_chromosome
    

In [None]:
### FUNC: DECODE_COMPARATIVE
## decode results from RL_analysis_comparative or similar; print overlap with nearest RL count, and p-values from Mann-Whitney U and Welch's t tests
# input: dictionary of distances as calculated by RL_analysis_comparative

def decode_comparative(distance_dict):
    """Flatten two-group per-chromosome results into one long table and print statistics.

    Parameters
    ----------
    distance_dict : dict
        Mapping whose values are dicts with keys 'experimental_dist_1' and
        'experimental_dist_2' (arrays of distances in bp) and 'overlap_1' and
        'overlap_2' (counts).

    Returns
    -------
    pandas.DataFrame
        Columns 'distance_(kb)' and 'group'. Rows of the first set come first
        ('Experimental Group 1'), followed by the second set ('Experimental Group 2'),
        each in the iteration order of ``distance_dict``. Distances are in kb with
        1e-3 kb added, so zero distances stay finite on the log-scaled plots that
        consume this table.

    Side effects
    ------------
    Prints the summed overlap count of each set and the two-sided p-values of a
    Mann-Whitney U test and of Welch's t-test between the two sets, both computed
    on the kb distances before the offset is added.
    """
    data_1 = []
    data_2 = []
    overlap_count_1 = []
    overlap_count_2 = []

    for curr_dict in distance_dict.values():
        data_1.append(curr_dict['experimental_dist_1'])
        data_2.append(curr_dict['experimental_dist_2'])
        overlap_count_1.append(curr_dict['overlap_1'])
        overlap_count_2.append(curr_dict['overlap_2'])

    data_1 = np.concatenate(data_1)/1000 # convert to kb
    data_2 = np.concatenate(data_2)/1000 # convert to kb

    df = pd.DataFrame({
        'distance_(kb)': np.concatenate([data_1,data_2]) + 1e-3,
        'group': len(data_1)*['Experimental Group 1'] + len(data_2)*['Experimental Group 2'],
    })

    print('Number of CpGs that overlap with nearest regulatory landmark for FIRST group:', sum(overlap_count_1))
    print('Number of CpGs that overlap with nearest regulatory landmark for SECOND group:', sum(overlap_count_2))

    _, p = sp.mannwhitneyu(x = data_1, y = data_2, alternative="two-sided")
    print('P-value from two sided mannwhitneyu test: {}'.format(p))

    # equal_var=False selects Welch's t-test; scipy's default (equal_var=True) is
    # Student's t-test, which assumes both groups share the same variance
    _, p = sp.ttest_ind(data_1, data_2, equal_var=False, alternative="two-sided")
    print('P-value from two sided welches t-test: {}'.format(p))

    return df

In [None]:
### FUNC: RL_ANALYSIS_OTHERVIR
## perform RL analysis for non-HIV viruses, including HPV and SARS2
# inputs: CpG loci, RL loci, CpG loci along entire genome, number of simulations for random expectation generation

def RL_analysis_otherVIR(df_cpg_exp, df_RL, df_cpg_all, num_sims, random_state=None):
    """Per-chromosome distance-to-nearest-landmark analysis against a randomized control.

    Same computation as ``RL_analysis`` except that the experimental frame's 'CHR'
    column is compared against the *string* form of the chromosome id, and progress
    is reported with prints and a tqdm bar.

    Parameters
    ----------
    df_cpg_exp : pandas.DataFrame
        Experimental CpGs with columns 'CHR' (strings '1'..'22') and 'MAPINFO'.
    df_RL : pandas.DataFrame
        Landmarks with columns 'chr' (compared against integer ids), 'mid',
        'start' and 'end'.
    df_cpg_all : pandas.DataFrame
        Background CpGs with columns 'CHR' (strings) and 'MAPINFO'.
    num_sims : int
        Number of random draws per chromosome.
    random_state : int, numpy.random.Generator or None, optional
        Seed (or generator) for the random draws. ``None`` (default) keeps the
        previous behaviour of sampling from numpy's global random state; any other
        value makes the whole run repeatable.

    Returns
    -------
    dict
        ``{chr_id: {'random_ctrl', 'experimental_dist', 'overlap'}}`` for chr_id 1..22:
        'random_ctrl' is the per-slot distance averaged over the ``num_sims`` draws,
        'experimental_dist' the distance of each experimental CpG to its nearest
        landmark midpoint, 'overlap' the count returned by ``overlap_with_closest``.
    """
    # A single generator is shared by every draw so that successive simulations differ
    # while the run as a whole is reproducible for a given seed.
    rng = None if random_state is None else np.random.default_rng(random_state)

    # create storage location for analysis
    results = {}

    for chr_id in range(1,23): # cycle through autosomal chromosomes

        print('Beginning chromosome:',chr_id)

        # get CpG pos for this chromosome -- experimental group (CHR stored as strings)
        df_cpgpos_chr = df_cpg_exp[df_cpg_exp['CHR'] == str(chr_id)].reset_index(drop=True)
        num_exp_cpg = df_cpgpos_chr.shape[0]
        exp_positions = np.asarray(df_cpgpos_chr['MAPINFO'])

        # get CpG pos for this chromosome -- control group (CHR stored as strings)
        df_cpgpos_chr_ctrl = df_cpg_all[df_cpg_all['CHR'] == str(chr_id)].reset_index(drop=True)

        # find RL positions on this chromosome
        df_RL_chr = df_RL[df_RL['chr'] == chr_id].reset_index(drop=True)
        rl_mids = np.asarray(df_RL_chr['mid'])

        # random simulation control: one column per simulation, averaged across simulations
        store_random = np.zeros([num_exp_cpg, num_sims])
        for j in tqdm(range(num_sims)):
            random_positions = df_cpgpos_chr_ctrl['MAPINFO'].sample(
                n=num_exp_cpg, axis=0, replace=False, random_state=rng
            ).to_numpy()
            store_random[:,j] = distance_to_closest(random_positions, rl_mids)
        store_random = store_random.mean(axis=1)

        # calculate distances to closest regulatory landmark for the experimental CpGs
        dist_to_closest = distance_to_closest(exp_positions, rl_mids)

        # count how many times CpGs overlap with nearest RL
        overlap_count = overlap_with_closest(exp_positions, df_RL_chr)

        results[chr_id] = {'random_ctrl': store_random, 'experimental_dist': dist_to_closest, 'overlap': overlap_count}
        print('Done with chromosome:',chr_id)

    return results

In [None]:
### FUNC: RL_ANALYSIS_COMPARATIVE OTHERVIR
## perform RL analysis for non-HIV viruses, including HPV and SARS2, comparing two different experimental distributions instead of comparison to random
# inputs: CpG loci experimental group 1, CpG loci experimental group 2, RL loci

def RL_analysis_comparative_otherVIR(df_cpg_exp1, df_cpg_exp2, df_RL):
    """Per-chromosome distance-to-nearest-landmark analysis for two CpG sets.

    Both CpG frames need 'CHR' (compared against the *string* form of chromosome
    ids 1..22) and 'MAPINFO'; ``df_RL`` needs 'chr' (compared against integer ids),
    'mid', 'start' and 'end'.

    Returns a dict keyed by integer chromosome id whose values hold, for each set,
    the distances to the nearest landmark midpoint ('experimental_dist_1',
    'experimental_dist_2') and the count of CpGs inside their nearest landmark
    ('overlap_1', 'overlap_2').
    """
    per_chromosome = {}

    for chromosome in range(1, 23):  # autosomes only
        chromosome_label = str(chromosome)  # the CpG frames store CHR as text

        # landmarks on this chromosome, re-indexed from 0 for overlap_with_closest
        landmarks = df_RL[df_RL['chr'] == chromosome].reset_index(drop=True)
        landmark_mids = np.asarray(landmarks['mid'])

        # positions of each experimental set on this chromosome
        on_chromosome_1 = df_cpg_exp1['CHR'] == chromosome_label
        on_chromosome_2 = df_cpg_exp2['CHR'] == chromosome_label
        positions_1 = np.asarray(df_cpg_exp1[on_chromosome_1]['MAPINFO'])
        positions_2 = np.asarray(df_cpg_exp2[on_chromosome_2]['MAPINFO'])

        per_chromosome[chromosome] = {
            'experimental_dist_1': distance_to_closest(positions_1, landmark_mids),
            'experimental_dist_2': distance_to_closest(positions_2, landmark_mids),
            'overlap_1': 0 + overlap_with_closest(positions_1, landmarks),
            'overlap_2': 0 + overlap_with_closest(positions_2, landmarks),
        }

    return per_chromosome

In [None]:
### FUNC: GRAPH_RL
## series of functions to graph results for each RL
# inputs: dataframes containing results from RL_analysis or similar

def _graph_RL(df_plot, xlabel):
    """Draw the shared two-panel figure: histogram on top, empirical CDF below.

    ``df_plot`` needs the columns 'distance_(kb)' and 'group'; both panels use a
    log-scaled x axis and colour by 'group'. ``xlabel`` labels the x axis of the
    lower panel. The figure is drawn for its side effect; nothing is returned.
    """
    sns.set_theme(font="arial", font_scale=3.2, style="white")
    fig, axes = plt.subplots(2,1, figsize=(11,11), gridspec_kw={'height_ratios': [2,8.5]})
    plt.subplots_adjust(hspace = 0.03)

    # top panel: histogram with labels and tick labels blanked out
    sns.histplot(ax=axes[0], data=df_plot, x="distance_(kb)", hue="group", common_norm= True, log_scale=True, legend=False).set(
        xlabel=None,
        ylabel=None,
        xticklabels=[],
        yticklabels=[]
    )

    # bottom panel: empirical cumulative distribution, carrying the axis labels
    sns.ecdfplot(ax=axes[1],data=df_plot, x="distance_(kb)", hue="group", lw=3, log_scale=True, legend=False).set(
        xlabel=xlabel,
        ylabel='Proportion'
    )


def graph_DHS(df_plot):
    """Plot the distance distribution to the closest DHS."""
    _graph_RL(df_plot, 'Distance to Closest DHS (kb)')


def graph_enhancer(df_plot):
    """Plot the distance distribution to the closest enhancer."""
    _graph_RL(df_plot, 'Distance to Closest Enhancer (kb)')


def graph_promoter(df_plot):
    """Plot the distance distribution to the closest promoter."""
    _graph_RL(df_plot, 'Distance to Closest Promoter (kb)')


def graph_ctcf_strong(df_plot):
    """Plot the distance distribution to the closest strong CTCF binding site."""
    _graph_RL(df_plot, 'Distance to Closest Strong CTCFbs (kb)')


#### Figure 2: Initial HIV RL Analysis

In [None]:
# create empty dfs to store data throughout analysis
# One row per CpG_ID of each group; the four RL columns start empty and are filled by
# the per-landmark cells below with that CpG's distance (kb) to the closest landmark.

df_store_group1 = pd.DataFrame(columns=['CpG_ID','DHS','enhancer','promoter','CTCFbs_strong'])
df_store_group1['CpG_ID'] = df_group1['CpG_ID']

df_store_group2 = pd.DataFrame(columns=['CpG_ID','DHS','enhancer','promoter','CTCFbs_strong'])
df_store_group2['CpG_ID'] = df_group2['CpG_ID']

DHS

In [None]:
# perform RL analysis for INFECTION to DHS
# (group-1 CpGs vs. a randomized control drawn from all CpGs; decode prints the
# overlap count and p-values and returns the distances in kb in long format)

group1_DHS = RL_analysis(df_group1,df_DHS,df_CpG_all,num_sims)
df_group1_DHS = decode(group1_DHS) 
graph_DHS(df_group1_DHS)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group1_DHS[df_group1_DHS['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group1 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group1['DHS'] = df_intermediate['distance_(kb)']

In [None]:
# perform RL analysis for TREATMENT to DHS
# (group-2 CpGs vs. a randomized control drawn from all CpGs; decode prints the
# overlap count and p-values and returns the distances in kb in long format)

group2_DHS = RL_analysis(df_group2,df_DHS,df_CpG_all,num_sims)
df_group2_DHS = decode(group2_DHS)
graph_DHS(df_group2_DHS)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group2_DHS[df_group2_DHS['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group2 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group2['DHS'] = df_intermediate['distance_(kb)']

ENHANCER

In [None]:
# perform RL analysis for INFECTION to ENHANCER
# (landmarks are the chromHMM rows labelled 'enhancer'; the control is drawn from all CpGs)

group1_enhancer = RL_analysis(df_group1,df_chromhmm[df_chromhmm['RL'] == 'enhancer'],df_CpG_all,num_sims)
df_group1_enhancer = decode(group1_enhancer)
graph_enhancer(df_group1_enhancer)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group1_enhancer[df_group1_enhancer['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group1 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group1['enhancer'] = df_intermediate['distance_(kb)']

In [None]:
# perform RL analysis for TREATMENT to ENHANCER
# (landmarks are the chromHMM rows labelled 'enhancer'; the control is drawn from all CpGs)

group2_enhancer = RL_analysis(df_group2,df_chromhmm[df_chromhmm['RL'] == 'enhancer'],df_CpG_all,num_sims)
df_group2_enhancer = decode(group2_enhancer)
graph_enhancer(df_group2_enhancer)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group2_enhancer[df_group2_enhancer['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group2 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group2['enhancer'] = df_intermediate['distance_(kb)']

PROMOTER

In [None]:
# perform RL analysis for INFECTION to PROMOTER
# (landmarks are the chromHMM rows labelled 'promoter'; the control is drawn from all CpGs)

group1_promoter = RL_analysis(df_group1,df_chromhmm[df_chromhmm['RL'] == 'promoter'],df_CpG_all,num_sims)
df_group1_promoter = decode(group1_promoter)
graph_promoter(df_group1_promoter)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group1_promoter[df_group1_promoter['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group1 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group1['promoter'] = df_intermediate['distance_(kb)']

In [None]:
# perform RL analysis for TREATMENT to PROMOTER
# (landmarks are the chromHMM rows labelled 'promoter'; the control is drawn from all CpGs)

group2_promoter = RL_analysis(df_group2,df_chromhmm[df_chromhmm['RL'] == 'promoter'],df_CpG_all,num_sims)
df_group2_promoter = decode(group2_promoter)
graph_promoter(df_group2_promoter)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group2_promoter[df_group2_promoter['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group2 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group2['promoter'] = df_intermediate['distance_(kb)']

STRONG CTCFbs

In [None]:
# perform RL analysis for INFECTION to STRONG CTCFBS
# (landmarks are the insulation-filtered CTCF sites; the control is drawn from all CpGs)

group1_ctcf_strong = RL_analysis(df_group1,df_ctcf_strong,df_CpG_all,num_sims)
df_group1_ctcf_strong = decode(group1_ctcf_strong)
graph_ctcf_strong(df_group1_ctcf_strong)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group1_ctcf_strong[df_group1_ctcf_strong['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group1 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group1['CTCFbs_strong'] = df_intermediate['distance_(kb)']

In [None]:
# perform RL analysis for TREATMENT to STRONG CTCFBS
# (landmarks are the insulation-filtered CTCF sites; the control is drawn from all CpGs)

group2_ctcf_strong = RL_analysis(df_group2,df_ctcf_strong,df_CpG_all,num_sims)
df_group2_ctcf_strong = decode(group2_ctcf_strong)
graph_ctcf_strong(df_group2_ctcf_strong)

# fill storage DFs for use in overlap analysis (experimental rows only)
df_intermediate = df_group2_ctcf_strong[df_group2_ctcf_strong['group']=="Experimental Group"]
# NOTE(review): this assignment aligns on index labels, so it assumes df_store_group2 carries
# the same 0..n-1 labels, in chromosome order 1..22, as decode's output -- TODO confirm
df_store_group2['CTCFbs_strong'] = df_intermediate['distance_(kb)']

In [None]:
# process INFECTION results to get closest RL type to each CpG --> will be used in OVERLAP analysis
# The four distance columns are selected by name rather than by dropping 'CpG_ID', so the
# cell can be re-run after 'closest_RL' has been added (that string column would otherwise
# be pulled into the float conversion below and raise).
rl_columns = ['DHS','enhancer','promoter','CTCFbs_strong']
df_intermediate = df_store_group1[rl_columns].copy()
df_array = np.array(df_intermediate.values,dtype=float)
# column position of the smallest distance in each row
# NOTE(review): argmin does not skip NaN, so a row with a missing distance resolves to the NaN column
indices_of_minima = df_array.argmin(axis=1)
column_names_min = df_intermediate.columns[indices_of_minima]
df_store_group1['closest_RL'] = column_names_min
df_store_group1

In [None]:
# process TREATMENT results to get closest RL type to each CpG --> will be used in OVERLAP analysis
# The four distance columns are selected by name rather than by dropping 'CpG_ID', so the
# cell can be re-run after 'closest_RL' has been added (that string column would otherwise
# be pulled into the float conversion below and raise).
rl_columns = ['DHS','enhancer','promoter','CTCFbs_strong']
df_intermediate = df_store_group2[rl_columns].copy()
df_array = np.array(df_intermediate.values,dtype=float)
# column position of the smallest distance in each row
# NOTE(review): argmin does not skip NaN, so a row with a missing distance resolves to the NaN column
indices_of_minima = df_array.argmin(axis=1)
column_names_min = df_intermediate.columns[indices_of_minima]
df_store_group2['closest_RL'] = column_names_min
df_store_group2

In [None]:
# store CLOSEST RL data to CSV
# (per-CpG distances to each RL type plus the 'closest_RL' label; overwrites existing files)
df_store_group1.to_csv('Data/closest_RL_infection_g1.csv')
df_store_group2.to_csv('Data/closest_RL_treatment_g2.csv')

##### Figure 2b: How often do CpGs overlap with their nearest RL? 

In [None]:
# load the overlap counts; the cells below read the columns 'DHS', 'enhancer', 'promoter',
# 'CTCFbs_strong' and 'total_count'
# NOTE(review): presumably compiled from the overlap counts printed by decode -- confirm provenance
df_overlap = pd.read_csv('Data/Processed/overlap_data.csv')

In [None]:
# drop the row labelled 2, leaving rows 0 and 1 (labelled 'Infection' and 'Treatment' below)
# NOTE(review): not idempotent -- re-running this cell without reloading df_overlap raises KeyError
df_overlap = df_overlap.drop(2)

In [None]:
# display the remaining overlap counts
df_overlap

In [None]:
# convert overlap counts to proportions of each group's total CpG count
df_proportion = pd.DataFrame(columns=['DHS','enhancer','promoter','CTCFbs_strong'])

df_proportion['DHS'] = df_overlap['DHS'] / df_overlap['total_count']
df_proportion['enhancer'] = df_overlap['enhancer'] / df_overlap['total_count']
df_proportion['promoter'] = df_overlap['promoter'] / df_overlap['total_count']
df_proportion['CTCFbs_strong'] = df_overlap['CTCFbs_strong'] / df_overlap['total_count']
# label the rows by position in df_overlap: row 0 = Infection, row 1 = Treatment
df_proportion['Group'] = ''
df_proportion.loc[0,'Group'] = 'Infection'
df_proportion.loc[1,'Group'] = 'Treatment'
# df_proportion.loc[2,'Group'] = 'Group 3'
# 'Group' becomes the index and also stays available as a regular column
df_proportion.set_index(df_proportion['Group'],inplace=True)

In [None]:
# normalize the data for overlap analysis--based on number of RLs of each type that appear along the genome

df_proportion_normalized = df_proportion.copy()
count_DHS = len(df_DHS)
count_enhancer = len(df_chromhmm[df_chromhmm['RL'] == 'enhancer'])
count_promoter = len(df_chromhmm[df_chromhmm['RL'] == 'promoter'])
count_ctcf_strong = len(df_ctcf_strong)
# NOTE(review): count_sum is not used in this cell
count_sum = count_DHS + count_enhancer + count_promoter + count_ctcf_strong

# scale counts based on number of times that RL type appears on the genome
# (each proportion is divided by the number of landmarks of that type)
df_proportion_normalized['DHS'] = df_proportion_normalized['DHS'] / count_DHS
df_proportion_normalized['enhancer'] = df_proportion_normalized['enhancer'] / count_enhancer
df_proportion_normalized['promoter'] = df_proportion_normalized['promoter'] / count_promoter
df_proportion_normalized['CTCFbs_strong'] = df_proportion_normalized['CTCFbs_strong'] / count_ctcf_strong

# keep a separately named copy of this section's result, since later cells reuse the
# name df_proportion_normalized
df_proportion_normalized_initialRL = df_proportion_normalized.copy()
df_proportion_normalized_initialRL.to_csv('Data/Processed/Overlap For Esther/overlap_initialRLanalysis_NORMALIZED.csv') # export results

#### Figure 4: HIV RL Analysis with clustered DMP groups

In [None]:
# read in data generated by Esther's clustering strategy
# Each CSV lists the CpG IDs of one cluster; the matching rows are pulled from the group
# position table and re-indexed (the previous index is kept as a column by reset_index).

# infection, high cluster
cpg_infec_high = pd.read_csv('Data/Processed/infection_cpgs_high.csv', index_col=0)
df_infec_high = df_group1.loc[df_group1['CpG_ID'].isin(cpg_infec_high['CpG_ID'])].reset_index()

# infection, low cluster (ID column renamed from 'Low' to 'CpG_ID')
cpg_infec_low = pd.read_csv('Data/Processed/infection_cpgs_low.csv',index_col=0).rename(columns={'Low':'CpG_ID'})
df_infec_low = df_group1.loc[df_group1['CpG_ID'].isin(cpg_infec_low['CpG_ID'])].reset_index()

# treatment, high cluster
cpg_treat_high = pd.read_csv('Data/Processed/treatment_cpgs_high.csv',index_col=0)
df_treat_high = df_group2.loc[df_group2['CpG_ID'].isin(cpg_treat_high['CpG_ID'])].reset_index()

# treatment, low cluster (ID column renamed from 'Low' to 'CpG_ID')
cpg_treat_low = pd.read_csv('Data/Processed/treatment_cpgs_low.csv',index_col=0).rename(columns={'Low':'CpG_ID'})
df_treat_low = df_group2.loc[df_group2['CpG_ID'].isin(cpg_treat_low['CpG_ID'])].reset_index()


DHS

In [None]:
# perform comparative RL analysis for INFECTION to DHS

infection_DHS = RL_analysis_comparative(df_infec_high, df_infec_low, df_DHS)
df_infection_DHS = decode_comparative(infection_DHS)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Infection, Cluster 2',
    'Experimental Group 2': 'Infection, Cluster 1',
}
df_infection_DHS['group'] = df_infection_DHS['group'].map(lambda label: cluster_labels.get(label, label))
graph_DHS(df_infection_DHS)

In [None]:
# perform comparative RL analysis for TREATMENT to DHS

treatment_DHS = RL_analysis_comparative(df_treat_high, df_treat_low, df_DHS)
df_treatment_DHS = decode_comparative(treatment_DHS)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Treatment, Cluster 2',
    'Experimental Group 2': 'Treatment, Cluster 1',
}
df_treatment_DHS['group'] = df_treatment_DHS['group'].map(lambda label: cluster_labels.get(label, label))

graph_DHS(df_treatment_DHS)

Enhancer

In [None]:
# perform comparative RL analysis for INFECTION to ENHANCER

infection_enhancer = RL_analysis_comparative(df_infec_high, df_infec_low, df_chromhmm[df_chromhmm['RL'] == 'enhancer'])
df_infection_enhancer = decode_comparative(infection_enhancer)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Infection, Cluster 2',
    'Experimental Group 2': 'Infection, Cluster 1',
}
df_infection_enhancer['group'] = df_infection_enhancer['group'].map(lambda label: cluster_labels.get(label, label))
graph_enhancer(df_infection_enhancer)

In [None]:
# perform comparative RL analysis for TREATMENT to ENHANCER

treatment_enhancer = RL_analysis_comparative(df_treat_high, df_treat_low, df_chromhmm[df_chromhmm['RL'] == 'enhancer'])
df_treatment_enhancer = decode_comparative(treatment_enhancer)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Treatment, Cluster 2',
    'Experimental Group 2': 'Treatment, Cluster 1',
}
df_treatment_enhancer['group'] = df_treatment_enhancer['group'].map(lambda label: cluster_labels.get(label, label))
graph_enhancer(df_treatment_enhancer)

Promoter

In [None]:
# perform comparative RL analysis for INFECTION to PROMOTER

infection_promoter = RL_analysis_comparative(df_infec_high, df_infec_low, df_chromhmm[df_chromhmm['RL'] == 'promoter'])
df_infection_promoter = decode_comparative(infection_promoter)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Infection, Cluster 2',
    'Experimental Group 2': 'Infection, Cluster 1',
}
df_infection_promoter['group'] = df_infection_promoter['group'].map(lambda label: cluster_labels.get(label, label))
graph_promoter(df_infection_promoter)

In [None]:
# perform comparative RL analysis for TREATMENT to PROMOTER

treatment_promoter = RL_analysis_comparative(df_treat_high, df_treat_low, df_chromhmm[df_chromhmm['RL'] == 'promoter'])
df_treatment_promoter = decode_comparative(treatment_promoter)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Treatment, Cluster 2',
    'Experimental Group 2': 'Treatment, Cluster 1',
}
df_treatment_promoter['group'] = df_treatment_promoter['group'].map(lambda label: cluster_labels.get(label, label))
graph_promoter(df_treatment_promoter)

CTCFbs_strong

In [None]:
# perform comparative RL analysis for INFECTION to STRONG CTCFBS

infection_ctcf_strong = RL_analysis_comparative(df_infec_high, df_infec_low, df_ctcf_strong)
df_infection_ctcf_strong = decode_comparative(infection_ctcf_strong)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Infection, Cluster 2',
    'Experimental Group 2': 'Infection, Cluster 1',
}
df_infection_ctcf_strong['group'] = df_infection_ctcf_strong['group'].map(lambda label: cluster_labels.get(label, label))
graph_ctcf_strong(df_infection_ctcf_strong)

In [None]:
# perform comparative RL analysis for TREATMENT to STRONG CTCFBS

treatment_ctcf_strong = RL_analysis_comparative(df_treat_high, df_treat_low, df_ctcf_strong)
df_treatment_ctcf_strong = decode_comparative(treatment_ctcf_strong)

# annotate data based on experimental group: group 1 (the "high" set) is plotted as
# Cluster 2 and group 2 (the "low" set) as Cluster 1. The relabel is vectorized instead of
# a per-row .loc loop, so it does not depend on the frame having a 0..n-1 index; any other
# label is left unchanged.
cluster_labels = {
    'Experimental Group 1': 'Treatment, Cluster 2',
    'Experimental Group 2': 'Treatment, Cluster 1',
}
df_treatment_ctcf_strong['group'] = df_treatment_ctcf_strong['group'].map(lambda label: cluster_labels.get(label, label))
graph_ctcf_strong(df_treatment_ctcf_strong)

##### Figure 4b: How often do CpGs from the clustered DMP groups overlap with their nearest RL?

In [None]:
# load the overlap counts for the clustered groups; this rebinds df_overlap, replacing the
# table loaded for the earlier overlap section
df_overlap = pd.read_csv('Data/Processed/overlap_data_highlowclusters.csv')

In [None]:
# convert overlap counts to proportions of each cluster's total CpG count
df_proportion = pd.DataFrame(columns=['DHS','enhancer','promoter','CTCFbs_strong'])

df_proportion['DHS'] = df_overlap['DHS'] / df_overlap['total_count']
df_proportion['enhancer'] = df_overlap['enhancer'] / df_overlap['total_count']
df_proportion['promoter'] = df_overlap['promoter'] / df_overlap['total_count']
df_proportion['CTCFbs_strong'] = df_overlap['CTCFbs_strong'] / df_overlap['total_count']
# label the rows by position in df_overlap
df_proportion['Group'] = ''
df_proportion.loc[0,'Group'] = 'Infection High'
df_proportion.loc[1,'Group'] = 'Infection Low'
df_proportion.loc[2,'Group'] = 'Treatment High'
df_proportion.loc[3,'Group'] = 'Treatment Low'
# NOTE(review): without inplace=True or an assignment this only displays a re-indexed copy;
# df_proportion itself keeps its 0..3 index, unlike the earlier overlap section, which sets
# the index in place -- confirm which is intended
df_proportion.set_index(df_proportion['Group'])

In [None]:
# normalize the data
# (each proportion is divided by the number of landmarks of that type along the genome)

df_proportion_normalized = df_proportion.copy()
count_DHS = len(df_DHS)
count_enhancer = len(df_chromhmm[df_chromhmm['RL'] == 'enhancer'])
count_promoter = len(df_chromhmm[df_chromhmm['RL'] == 'promoter'])
count_ctcf_strong = len(df_ctcf_strong)
count_sum = count_DHS + count_enhancer + count_promoter + count_ctcf_strong

# fraction of all landmarks that belong to each type; only referenced by the
# commented-out alternative scaling below
scale_DHS = count_DHS/count_sum
scale_enhancer = count_enhancer/count_sum
scale_promoter= count_promoter/count_sum
scale_ctcf_strong = count_ctcf_strong/count_sum

# alternative (disabled): weight each proportion by the landmark type's share of all landmarks
# df_proportion_normalized['DHS'] = df_proportion_normalized['DHS'] * scale_DHS
# df_proportion_normalized['enhancer'] = df_proportion_normalized['enhancer'] * scale_enhancer
# df_proportion_normalized['promoter'] = df_proportion_normalized['promoter'] * scale_promoter
# df_proportion_normalized['CTCFbs_strong'] = df_proportion_normalized['CTCFbs_strong'] * scale_ctcf_strong

# active normalization: divide by the raw landmark count of each type
df_proportion_normalized['DHS'] = df_proportion_normalized['DHS'] / count_DHS
df_proportion_normalized['enhancer'] = df_proportion_normalized['enhancer'] / count_enhancer
df_proportion_normalized['promoter'] = df_proportion_normalized['promoter'] / count_promoter
df_proportion_normalized['CTCFbs_strong'] = df_proportion_normalized['CTCFbs_strong'] / count_ctcf_strong
# df_proportion_normalized['count'] = df_proportion_normalized['count'] 

# df_proportion_normalized['sum'] = df_proportion_normalized['DHS'] + df_proportion_normalized['enhancer'] + df_proportion_normalized['promoter'] + df_proportion_normalized['CTCFbs_strong']

In [None]:
# Build df_ttest here before displaying it, so this cell also works on a fresh
# kernel (previously the name was only defined by a later cell).
df_ttest = df_proportion.set_index(df_proportion['Group'])
df_ttest

In [None]:
# Copy of the proportion table indexed by group label (the 'Group' column is kept).
df_ttest = df_proportion.copy()
df_ttest.set_index(df_ttest['Group'],inplace=True)

# The index labels are 'Treatment High' / 'Treatment Low'; there is no plain 'Treatment'
# label, so select the treatment rows by prefix instead of .loc['Treatment', ...],
# which raised KeyError.
is_treatment = df_ttest.index.str.startswith('Treatment', na=False)
treatment_promoter = df_ttest.loc[is_treatment, 'promoter']

# NOTE(review): x is still ALL groups' promoter proportions (as in the original), so the
# treatment values appear in both samples - confirm whether x should be the infection rows only.
t, p = sp.mannwhitneyu(x = df_proportion['promoter'], y = treatment_promoter, alternative="two-sided")
# Mann-Whitney U is a rank-based test, not a t-test; the message now says so.
print('P-value from two-sided Mann-Whitney U test: {}'.format(p))

In [None]:
# https://stackoverflow.com/questions/51882279/seaborn-barplot-with-two-y-axis

# Reshape to long format: one row per (Group, RL) pair with its value.
columns = ['DHS','enhancer','promoter','CTCFbs_strong','Group']
group_column = columns[4]
data_melted = df_proportion.melt(id_vars=group_column, var_name='RL', value_name='values')

# Grouped bar chart: one cluster of bars per group, one bar per RL class.
sns.set_theme(font="arial", font_scale=2, style="white")
plt.figure(figsize=(24,20))
ax = sns.barplot(data=data_melted, x=group_column, y='values', hue='RL')
ax.set(xlabel='Group', ylabel='Proportion')

In [None]:
# https://stackoverflow.com/questions/51882279/seaborn-barplot-with-two-y-axis

# Same plot as for the raw proportions, but fed with the normalized table.
columns = ['DHS','enhancer','promoter','CTCFbs_strong','Group']
group_column = columns[4]
data_melted = df_proportion_normalized.melt(id_vars=group_column, var_name='RL', value_name='values')

sns.set_theme(font="arial", font_scale=2, style="white")
plt.figure(figsize=(24,20))
ax = sns.barplot(data=data_melted, x=group_column, y='values', hue='RL')
ax.set(xlabel='Group', ylabel='Proportion')

#### Figure 6: Initial HPV RL Analysis

In [None]:
# read in HPV methylation data as df
df_hpv = pd.read_csv('Data/HPV_loci.csv', index_col=0)

# remove non-autosomal chromosomes or samples with non-canonical notation
excluded_chroms = ['X', 'Y', '22_KI270879v1_alt', '19_KI270938v1_alt']
df_hpv = df_hpv[~df_hpv['CHR'].isin(excluded_chroms)]

In [None]:
# perform HPV infection DHS analysis

hpv_DHS = RL_analysis_otherVIR(
    df_hpv,      # HPV loci (autosomes only)
    df_DHS,      # DHS landmark positions
    df_CpG_all,  # all CpG positions
    num_sims,    # defined in an earlier cell
)
df_hpv_DHS = decode(hpv_DHS)
graph_DHS(df_hpv_DHS)

In [None]:
# perform HPV infection ENHANCER analysis

# Restrict the chromHMM landmarks to the enhancer state before running the analysis.
is_enhancer = df_chromhmm['RL'] == 'enhancer'
hpv_enhancer = RL_analysis_otherVIR(df_hpv, df_chromhmm.loc[is_enhancer], df_CpG_all, num_sims)
df_hpv_enhancer = decode(hpv_enhancer)
graph_enhancer(df_hpv_enhancer)

In [None]:
# perform HPV infection PROMOTER analysis

# Restrict the chromHMM landmarks to the promoter state before running the analysis.
is_promoter = df_chromhmm['RL'] == 'promoter'
hpv_promoter = RL_analysis_otherVIR(df_hpv, df_chromhmm.loc[is_promoter], df_CpG_all, num_sims)
df_hpv_promoter = decode(hpv_promoter)
graph_promoter(df_hpv_promoter)

In [None]:
# perform HPV infection STRONG CTCFBS analysis

hpv_ctcf_strong = RL_analysis_otherVIR(
    df_hpv,          # HPV loci (autosomes only)
    df_ctcf_strong,  # strong (insulation filtered) CTCF positions
    df_CpG_all,      # all CpG positions
    num_sims,        # defined in an earlier cell
)
df_hpv_ctcf_strong = decode(hpv_ctcf_strong)
graph_ctcf_strong(df_hpv_ctcf_strong)

In [None]:
# initialize df to store all HPV data for overlap analysis

# One row per HPV CpG; the four RL columns start empty and are filled in a later cell.
df_store_HPV = pd.DataFrame(columns=['CpG_ID','DHS','enhancer','promoter','CTCFbs_strong'])
df_store_HPV['CpG_ID'] = df_hpv['CpG_ID']
# Renumber rows 0..n-1. drop=True discards the index inherited from df_hpv directly,
# instead of materializing it as a column and dropping 'index' afterwards (which raised
# KeyError whenever that index carried a name).
df_store_HPV = df_store_HPV.reset_index(drop=True)

In [None]:
# fill empty df with calculated data for overlap analysis

# Decoded result table for each RL column of df_store_HPV.
hpv_decoded_by_rl = {
    'DHS': df_hpv_DHS,
    'enhancer': df_hpv_enhancer,
    'promoter': df_hpv_promoter,
    'CTCFbs_strong': df_hpv_ctcf_strong,
}
for rl_name, df_decoded in hpv_decoded_by_rl.items():
    # Keep only the experimental rows and copy their distances.
    # NOTE(review): the assignment aligns on index labels, so this assumes the
    # "Experimental Group" rows are indexed 0..n-1 in CpG order - confirm against decode().
    df_intermediate = df_decoded[df_decoded['group'] == "Experimental Group"]
    df_store_HPV[rl_name] = df_intermediate['distance_(kb)']

In [None]:
# calculate and store closest RL and export data as csv

# Select the four distance columns explicitly rather than "everything except CpG_ID":
# once 'closest_RL' has been added below, re-running this cell would otherwise try to
# cast that string column to float and fail.
rl_columns = ['DHS','enhancer','promoter','CTCFbs_strong']
df_intermediate = df_store_HPV[rl_columns].copy()
df_array = np.array(df_intermediate.values,dtype=float)
# Column position of the smallest distance in each row.
# NOTE(review): numpy's argmin returns the position of a NaN if a row contains one,
# so rows with missing distances get that column as "closest" - confirm none are missing.
indices_of_minima = df_array.argmin(axis=1)
column_names_min = df_intermediate.columns[indices_of_minima]
df_store_HPV['closest_RL'] = column_names_min
df_store_HPV.to_csv('Data/Processed/HPV_CpGs_closestRL.csv')

##### Figure 6b: How often do HPV differentially methylated CpGs overlap with their nearest RL?

In [None]:
# read in overlap data HPV
overlap_hpv_csv = 'Data/Processed/overlap_hpv.csv'
df_overlap_hpv = pd.read_csv(overlap_hpv_csv)

In [None]:
# calculate proportion of HPV DMPs that fall within nearest RL of each type

# Divide each RL column of df_overlap_hpv by the same row's total_count.
rl_columns = ['DHS','enhancer','promoter','CTCFbs_strong']
df_proportion = df_overlap_hpv[rl_columns].div(df_overlap_hpv['total_count'], axis=0)

# Row 0 is labelled as the HPV group; any other row keeps an empty label.
df_proportion['Group'] = ''
df_proportion.loc[0,'Group'] = 'HPV'

# Display only: the result is not assigned, so df_proportion keeps its original index.
df_proportion.set_index(df_proportion['Group'])

In [None]:
# normalize the data based on frequency of RL occurrence along the genome

# Number of loaded landmarks in each RL class.
count_DHS = len(df_DHS)
count_enhancer = int((df_chromhmm['RL'] == 'enhancer').sum())
count_promoter = int((df_chromhmm['RL'] == 'promoter').sum())
count_ctcf_strong = len(df_ctcf_strong)
count_sum = sum([count_DHS, count_enhancer, count_promoter, count_ctcf_strong])

# Share of each class among all counted landmarks (computed but not applied below).
scale_DHS, scale_enhancer, scale_promoter, scale_ctcf_strong = (
    class_count / count_sum
    for class_count in (count_DHS, count_enhancer, count_promoter, count_ctcf_strong)
)

# Divide every proportion column by the landmark count of its RL class.
rl_counts = {
    'DHS': count_DHS,
    'enhancer': count_enhancer,
    'promoter': count_promoter,
    'CTCFbs_strong': count_ctcf_strong,
}
df_proportion_normalized = df_proportion.copy()
for rl_name, rl_count in rl_counts.items():
    df_proportion_normalized[rl_name] = df_proportion_normalized[rl_name] / rl_count

# Keep an HPV-specific copy and export it for graphing in prism.
df_proportion_normalized_HPV = df_proportion_normalized.copy()
df_proportion_normalized_HPV.to_csv('Data/Processed/Overlap For Esther/overlap_HPV_NORMALIZED.csv')

##### Figure 6d: HPV RL analysis with clustered groups of DMPs

In [None]:
# read in clustered HPV positions from Esther's clustering analysis

# Each cluster file lists CpG_IDs; keep the df_hpv rows whose CpG_ID appears in the file.
# reset_index() (without drop) keeps the previous index as a column, as before.
HPV_high = pd.read_csv('Data/Processed/hpv_cpgs_high.csv',index_col=0, dtype=str)
df_HPV_high = df_hpv[df_hpv['CpG_ID'].isin(HPV_high['CpG_ID'])].reset_index()

# dtype=str added so the low-cluster file is parsed the same way as the high-cluster
# file (and as both SARS2 cluster files); without it the IDs could be type-inferred
# differently and silently fail to match in isin().
HPV_low = pd.read_csv('Data/Processed/hpv_cpgs_low.csv',index_col=0, dtype=str)
df_HPV_low = df_hpv[df_hpv['CpG_ID'].isin(HPV_low['CpG_ID'])].reset_index()

In [None]:
# perform RL analysis for HPV clusters vs DHS

hpv_cluster_DHS = RL_analysis_comparative_otherVIR(df_HPV_high, df_HPV_low, df_DHS)
df_hpv_cluster_DHS = decode_comparative(hpv_cluster_DHS)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_hpv_cluster_DHS['group'] = df_hpv_cluster_DHS['group'].replace({
    'Experimental Group 1': 'HPV, Cluster 2',
    'Experimental Group 2': 'HPV, Cluster 1',
})
graph_DHS(df_hpv_cluster_DHS)

In [None]:
# perform RL analysis for HPV clusters vs ENHANCER

hpv_cluster_enhancer = RL_analysis_comparative_otherVIR(df_HPV_high, df_HPV_low, df_chromhmm[df_chromhmm['RL'] == 'enhancer'])
df_hpv_cluster_enhancer = decode_comparative(hpv_cluster_enhancer)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_hpv_cluster_enhancer['group'] = df_hpv_cluster_enhancer['group'].replace({
    'Experimental Group 1': 'HPV, Cluster 2',
    'Experimental Group 2': 'HPV, Cluster 1',
})
graph_enhancer(df_hpv_cluster_enhancer)

In [None]:
# perform RL analysis for HPV clusters vs PROMOTER

hpv_cluster_promoter = RL_analysis_comparative_otherVIR(df_HPV_high, df_HPV_low, df_chromhmm[df_chromhmm['RL'] == 'promoter'])
df_hpv_cluster_promoter = decode_comparative(hpv_cluster_promoter)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_hpv_cluster_promoter['group'] = df_hpv_cluster_promoter['group'].replace({
    'Experimental Group 1': 'HPV, Cluster 2',
    'Experimental Group 2': 'HPV, Cluster 1',
})
graph_promoter(df_hpv_cluster_promoter)

In [None]:
# perform RL analysis for HPV clusters vs STRONG CTCFBS

hpv_cluster_ctcf_strong = RL_analysis_comparative_otherVIR(df_HPV_high, df_HPV_low, df_ctcf_strong)
df_hpv_cluster_ctcf_strong = decode_comparative(hpv_cluster_ctcf_strong)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_hpv_cluster_ctcf_strong['group'] = df_hpv_cluster_ctcf_strong['group'].replace({
    'Experimental Group 1': 'HPV, Cluster 2',
    'Experimental Group 2': 'HPV, Cluster 1',
})
graph_ctcf_strong(df_hpv_cluster_ctcf_strong)

#### Figure 7: Initial SARS2 RL Analysis

In [None]:
# read in SARS2 methylation data as df
df_sars2 = pd.read_csv('Data/SARS2_loci.csv', index_col=0)

# remove non-autosomal chromosomes
sex_chroms = ['X', 'Y']
df_sars2 = df_sars2[~df_sars2['CHR'].isin(sex_chroms)]

In [None]:
# perform SARS2 infection DHS RL analysis

sars2_DHS = RL_analysis_otherVIR(
    df_sars2,    # SARS2 loci (sex chromosomes removed)
    df_DHS,      # DHS landmark positions
    df_CpG_all,  # all CpG positions
    num_sims,    # defined in an earlier cell
)
df_sars2_DHS = decode(sars2_DHS)
graph_DHS(df_sars2_DHS)

In [None]:
# perform SARS2 infection ENHANCER RL analysis

# Restrict the chromHMM landmarks to the enhancer state before running the analysis.
is_enhancer = df_chromhmm['RL'] == 'enhancer'
sars2_enhancer = RL_analysis_otherVIR(df_sars2, df_chromhmm.loc[is_enhancer], df_CpG_all, num_sims)
df_sars2_enhancer = decode(sars2_enhancer)
graph_enhancer(df_sars2_enhancer)

In [None]:
# perform SARS2 infection PROMOTER RL analysis

# Restrict the chromHMM landmarks to the promoter state before running the analysis.
is_promoter = df_chromhmm['RL'] == 'promoter'
sars2_promoter = RL_analysis_otherVIR(df_sars2, df_chromhmm.loc[is_promoter], df_CpG_all, num_sims)
df_sars2_promoter = decode(sars2_promoter)
graph_promoter(df_sars2_promoter)

In [None]:
# perform SARS2 infection STRONG CTCFBS RL analysis

sars2_ctcf_strong = RL_analysis_otherVIR(
    df_sars2,        # SARS2 loci (sex chromosomes removed)
    df_ctcf_strong,  # strong (insulation filtered) CTCF positions
    df_CpG_all,      # all CpG positions
    num_sims,        # defined in an earlier cell
)
df_sars2_ctcf_strong = decode(sars2_ctcf_strong)
graph_ctcf_strong(df_sars2_ctcf_strong)

In [None]:
# create empty df to store SARS2 analysis

# One row per SARS2 CpG; the four RL columns start empty and are filled in a later cell.
df_store_SARS2 = pd.DataFrame(columns=['CpG_ID','DHS','enhancer','promoter','CTCFbs_strong'])
df_store_SARS2['CpG_ID'] = df_sars2['CpG_ID']
# Renumber rows 0..n-1. drop=True discards the index inherited from df_sars2 directly,
# instead of materializing it as a column and dropping 'index' afterwards (which raised
# KeyError whenever that index carried a name).
df_store_SARS2 = df_store_SARS2.reset_index(drop=True)

In [None]:
# fill empty df with SARS2 data with the end goal of performing overlap analysis

# Decoded result table for each RL column of df_store_SARS2.
sars2_decoded_by_rl = {
    'DHS': df_sars2_DHS,
    'enhancer': df_sars2_enhancer,
    'promoter': df_sars2_promoter,
    'CTCFbs_strong': df_sars2_ctcf_strong,
}
for rl_name, df_decoded in sars2_decoded_by_rl.items():
    # Keep only the experimental rows and copy their distances.
    # NOTE(review): the assignment aligns on index labels, so this assumes the
    # "Experimental Group" rows are indexed 0..n-1 in CpG order - confirm against decode().
    df_intermediate = df_decoded[df_decoded['group'] == "Experimental Group"]
    df_store_SARS2[rl_name] = df_intermediate['distance_(kb)']

In [None]:
# calculate closest RL type to each CpG site for SARS2; export data

# Select the four distance columns explicitly rather than "everything except CpG_ID":
# once 'closest_RL' has been added below, re-running this cell would otherwise try to
# cast that string column to float and fail.
rl_columns = ['DHS','enhancer','promoter','CTCFbs_strong']
df_intermediate = df_store_SARS2[rl_columns].copy()
df_array = np.array(df_intermediate.values,dtype=float)
# Column position of the smallest distance in each row.
# NOTE(review): numpy's argmin returns the position of a NaN if a row contains one,
# so rows with missing distances get that column as "closest" - confirm none are missing.
indices_of_minima = df_array.argmin(axis=1)
column_names_min = df_intermediate.columns[indices_of_minima]
df_store_SARS2['closest_RL'] = column_names_min
df_store_SARS2.to_csv('Data/Processed/SARS2_CpGs_closestRL.csv')

##### Figure 7b: How often do differentially methylated CpGs from SARS2 infection overlap with their nearest RL?

In [None]:
# read in overlap data
overlap_sars2_csv = 'Data/Processed/overlap_data_sars2.csv'
df_overlap_sars2 = pd.read_csv(overlap_sars2_csv)

In [None]:
# calculate proportion of CpG sites that overlap with each type of RL

# Divide each RL column of df_overlap_sars2 by the same row's total_count.
rl_columns = ['DHS','enhancer','promoter','CTCFbs_strong']
df_proportion = df_overlap_sars2[rl_columns].div(df_overlap_sars2['total_count'], axis=0)

# Row 0 is labelled as the SARS2 group; any other row keeps an empty label.
df_proportion['Group'] = ''
df_proportion.loc[0,'Group'] = 'SARS2'

# Display only: the result is not assigned, so df_proportion keeps its original index.
df_proportion.set_index(df_proportion['Group'])

In [None]:
# normalize the proportion data based on frequency with which different RL classes appear on the genome

# Number of loaded landmarks in each RL class.
count_DHS = len(df_DHS)
count_enhancer = int((df_chromhmm['RL'] == 'enhancer').sum())
count_promoter = int((df_chromhmm['RL'] == 'promoter').sum())
count_ctcf_strong = len(df_ctcf_strong)
count_sum = sum([count_DHS, count_enhancer, count_promoter, count_ctcf_strong])

# Share of each class among all counted landmarks (computed but not applied below).
scale_DHS, scale_enhancer, scale_promoter, scale_ctcf_strong = (
    class_count / count_sum
    for class_count in (count_DHS, count_enhancer, count_promoter, count_ctcf_strong)
)

# Divide every proportion column by the landmark count of its RL class.
rl_counts = {
    'DHS': count_DHS,
    'enhancer': count_enhancer,
    'promoter': count_promoter,
    'CTCFbs_strong': count_ctcf_strong,
}
df_proportion_normalized = df_proportion.copy()
for rl_name, rl_count in rl_counts.items():
    df_proportion_normalized[rl_name] = df_proportion_normalized[rl_name] / rl_count

# Keep a SARS2-specific copy and export it for graphing in prism.
df_proportion_normalized_SARS2 = df_proportion_normalized.copy()
df_proportion_normalized_SARS2.to_csv('Data/Processed/Overlap For Esther/overlap_sars2_NORMALIZED.csv')

##### Figure 7d: RL Analysis with SARS2 DMP clustered groups

In [None]:
# read in SARS2 cluster positions from Esther's clustering analysis

# Each cluster file lists CpG_IDs (read as strings); keep the df_sars2 rows whose
# CpG_ID appears in the file. reset_index() keeps the previous index as a column.
SARS2_high = pd.read_csv('Data/Processed/sars_cpgs_high.csv', index_col=0, dtype=str)
in_high_cluster = df_sars2['CpG_ID'].isin(SARS2_high['CpG_ID'])
df_SARS2_high = df_sars2[in_high_cluster].reset_index()

SARS2_low = pd.read_csv('Data/Processed/sars_cpgs_low.csv', index_col=0, dtype=str)
in_low_cluster = df_sars2['CpG_ID'].isin(SARS2_low['CpG_ID'])
df_SARS2_low = df_sars2[in_low_cluster].reset_index()

In [None]:
# perform RL analysis for SARS2 clusters vs DHS

sars2_cluster_DHS = RL_analysis_comparative_otherVIR(df_SARS2_high, df_SARS2_low, df_DHS)
df_sars2_cluster_DHS = decode_comparative(sars2_cluster_DHS)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_sars2_cluster_DHS['group'] = df_sars2_cluster_DHS['group'].replace({
    'Experimental Group 1': 'SARS2, Cluster 2',
    'Experimental Group 2': 'SARS2, Cluster 1',
})
graph_DHS(df_sars2_cluster_DHS)

In [None]:
# perform RL analysis for SARS2 clusters vs ENHANCER

sars2_cluster_enhancer = RL_analysis_comparative_otherVIR(df_SARS2_high, df_SARS2_low, df_chromhmm[df_chromhmm['RL'] == 'enhancer'])
df_sars2_cluster_enhancer = decode_comparative(sars2_cluster_enhancer)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_sars2_cluster_enhancer['group'] = df_sars2_cluster_enhancer['group'].replace({
    'Experimental Group 1': 'SARS2, Cluster 2',
    'Experimental Group 2': 'SARS2, Cluster 1',
})
graph_enhancer(df_sars2_cluster_enhancer)

In [None]:
# perform RL analysis for SARS2 clusters vs PROMOTER

sars2_cluster_promoter = RL_analysis_comparative_otherVIR(df_SARS2_high, df_SARS2_low, df_chromhmm[df_chromhmm['RL'] == 'promoter'])
df_sars2_cluster_promoter = decode_comparative(sars2_cluster_promoter)

# Relabel the generic group names with one vectorized replace over the promoter frame.
# The previous loop took its row count from df_sars2_cluster_DHS (a different frame),
# so rows could be left unlabelled or looked up out of range whenever the two frames
# differed in length.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_sars2_cluster_promoter['group'] = df_sars2_cluster_promoter['group'].replace({
    'Experimental Group 1': 'SARS2, Cluster 2',
    'Experimental Group 2': 'SARS2, Cluster 1',
})
graph_promoter(df_sars2_cluster_promoter)

In [None]:
# perform RL analysis for SARS2 clusters vs STRONG CTCFBS

sars2_cluster_ctcf_strong = RL_analysis_comparative_otherVIR(df_SARS2_high, df_SARS2_low, df_ctcf_strong)
df_sars2_cluster_ctcf_strong = decode_comparative(sars2_cluster_ctcf_strong)

# Relabel the generic group names in one vectorized replace instead of a row-by-row
# .loc loop, so it does not rely on a 0..n-1 index; other labels are left untouched.
# NOTE(review): Group 1 -> Cluster 2 and Group 2 -> Cluster 1 is kept as before; confirm intended.
df_sars2_cluster_ctcf_strong['group'] = df_sars2_cluster_ctcf_strong['group'].replace({
    'Experimental Group 1': 'SARS2, Cluster 2',
    'Experimental Group 2': 'SARS2, Cluster 1',
})
graph_ctcf_strong(df_sars2_cluster_ctcf_strong)