## Generate a table of avg region values for our master region list and all TCGA files of interest

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle
#from core.gcloud.file import opener

import seaborn as sns
import matplotlib
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
from collections import defaultdict

# Load the probe metadata table (indexed by probe ID; uses the Chromosome,
# Start and End columns below).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("../df_keep_probes_metadata.pkl", "rb") as fin:
    probe_metadata = pickle.load(fin)

# Group probes by chromosome: {chromosome: [(probe_id, start, end), ...]}
# so that region lookups only have to scan probes on the same chromosome.
probe_dict = defaultdict(list)
for probe_id, chrom, start, end in zip(
    probe_metadata.index,
    probe_metadata["Chromosome"],
    probe_metadata["Start"],
    probe_metadata["End"],
):
    probe_dict[chrom].append((probe_id, int(start), int(end)))

In [12]:
# Preview the first rows of the probe metadata table.
probe_metadata.head()

Unnamed: 0_level_0,Chromosome,Start,End,Gene_Symbol,Gene_Type,Transcript_ID,Position_to_TSS,CGI_Coordinate,Feature_Type
Composite Element REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cg00000029,chr16,53434200,53434201,RBL2;RBL2;RBL2,protein_coding;protein_coding;protein_coding,ENST00000262133.9;ENST00000544405.5;ENST000005...,-221;-1420;222,CGI:chr16:53434489-53435297,N_Shore
cg00000108,chr3,37417715,37417716,C3orf35;C3orf35;C3orf35;C3orf35;C3orf35;C3orf3...,lincRNA;lincRNA;lincRNA;lincRNA;lincRNA;lincRN...,ENST00000328376.8;ENST00000332506.6;ENST000004...,18552;18552;6505;31445;18143;447;18552;18552,CGI:chr3:37451927-37453047,.
cg00000109,chr3,172198247,172198248,FNDC3B;FNDC3B;FNDC3B;FNDC3B;FNDC3B;FNDC3B,protein_coding;protein_coding;protein_coding;p...,ENST00000336824.7;ENST00000415807.5;ENST000004...,157692;158618;151333;71272;158587;71273,CGI:chr3:172039703-172040934,.
cg00000165,chr1,90729117,90729118,.,.,.,.,CGI:chr1:90724932-90727247,S_Shore
cg00000236,chr8,42405776,42405777,VDAC3,protein_coding,ENST00000022615.7,13872,CGI:chr8:42410918-42411241,.


In [26]:
# collate stats per region

def calc_region_averages(df_clean_regions, df_tcga_probes, probe_dict, title,
                         meth_threshold=0.2):
    """Average probe beta values per region and sample, and save the results.

    For every row of ``df_clean_regions`` the probes that lie fully inside the
    region (``probe start >= region start`` and ``probe end <= region end``)
    and that are present in ``df_tcga_probes.index`` are selected, and their
    values are averaged per sample. Regions without any such probe are skipped.

    Two CSV files are written to the current directory:

    * ``./{title}.csv`` -- regions (rows) by samples (columns) averages, with
      missing averages replaced by 0.
    * ``./{title}_region_summary_stats.csv`` -- one column per region and one
      labelled row per statistic.

    Args:
        df_clean_regions: DataFrame with ``chrom``, ``start``, ``end`` and
            ``real_name`` columns, one row per region.
        df_tcga_probes: DataFrame of probe values, probes as rows (index =
            probe ID) and samples as columns.
        probe_dict: Mapping ``chromosome -> iterable of (probe_id, start, end)``.
            It is only read; chromosomes missing from it yield no probes.
        title: Base name (without extension) of the output files.
        meth_threshold: A sample counts as methylated for a region when its
            region average is ``>=`` this value. Defaults to 0.2.

    Returns:
        Tuple ``(df_region_avs, region_stats)``: the regions-by-samples table
        of averages and a dict ``{region name: {"pct_samples_meth",
        "mean_sample_stdv", "sample_average_beta"}}``.
    """
    # Kept as module-level names for backward compatibility with notebook
    # cells that read them directly after the call.
    global region_stats, df_region_avs
    region_stats = {}
    region_avs_list = []
    region_names = []

    available_probes = df_tcga_probes.index

    # Progress is reported by row position, so it works for any index type.
    for n_seen, (_, region) in enumerate(df_clean_regions.iterrows()):
        if n_seen % 100 == 0:
            print(n_seen)

        region_start = int(region.start)
        region_end = int(region.end)

        # .get() avoids inserting empty entries into a caller's defaultdict
        # and tolerates plain dicts that lack the chromosome.
        probes = [
            name
            for name, start, stop in probe_dict.get(region.chrom, ())
            if int(start) >= region_start
            and int(stop) <= region_end
            and name in available_probes
        ]
        if not probes:
            continue

        df_tcga_subset = df_tcga_probes.loc[probes, :]
        sample_averages = df_tcga_subset.mean(axis=0, skipna=True)
        sample_stdvs = df_tcga_subset.std(axis=0, skipna=True)

        region_names.append(region.real_name)
        region_avs_list.append(sample_averages)
        region_stats[region.real_name] = {
            # Fraction of *samples* at or above the threshold. The denominator
            # is the number of samples, not the number of probes in the region.
            # Samples whose average is NaN count as not methylated.
            "pct_samples_meth": float(
                (sample_averages >= meth_threshold).mean()
            ),
            "mean_sample_stdv": np.mean(sample_stdvs),
            "sample_average_beta": np.mean(sample_averages),
        }

    # Compile the per-region averages into a regions-by-samples matrix.
    if region_avs_list:
        # Column labels come from each averages Series (the sample names).
        df_region_avs = pd.DataFrame(region_avs_list)
        df_region_avs.index = region_names
    else:
        # No region had probes: still return a table with the sample columns.
        df_region_avs = pd.DataFrame(columns=df_tcga_probes.columns)

    # NOTE(review): missing averages are written as 0, which is
    # indistinguishable from a true average of 0 -- confirm this is intended.
    df_region_avs = df_region_avs.fillna(0)

    df_region_avs.to_csv(f"./{title}.csv")

    # Keep the index so each row of the summary file is labelled with its
    # statistic name.
    pd.DataFrame(region_stats).to_csv(f"./{title}_region_summary_stats.csv")

    print("Complete")
    return df_region_avs, region_stats

In [27]:
# Load the master region list: a tab-separated file without a header row.
with open("../biomarker_lock_june_2020_hmf_regions_merged_1phealthy_metilene.source_annotated_formatted.bed","r") as bed_file:
    df_master_regions = pd.read_csv(bed_file, sep="\t", header=None)

df_master_regions.columns = ["chrom","start","end"]

# Label every region as "chrom:start:end".
df_master_regions["real_name"] = df_master_regions["chrom"].str.cat(
    [
        df_master_regions["start"].astype(str),
        df_master_regions["end"].astype(str),
    ],
    sep=":",
)
df_master_regions.head()

Unnamed: 0,chrom,start,end,real_name
0,chr1,827683,827862,chr1:827683:827862
1,chr1,905064,905611,chr1:905064:905611
2,chr1,925421,925756,chr1:925421:925756
3,chr1,1059023,1059377,chr1:1059023:1059377
4,chr1,1115135,1115467,chr1:1115135:1115467


In [9]:
# Read in the test regions file (tab-separated, no header row).
# The `with` block makes sure the file handle is closed after parsing.
with open("../test_regions.bed", "r") as fin:
    df_test_reg = pd.read_csv(fin, sep="\t", header=None)

df_test_reg.columns = ["chrom","start","end"]

# Cast the coordinates to int *before* building the label, and assign by
# column name so the dtype really changes; otherwise non-integer coordinates
# would leak into real_name (e.g. "chr1:827037.0:827904.0").
df_test_reg["start"] = df_test_reg["start"].astype(int)
df_test_reg["end"] = df_test_reg["end"].astype(int)

# Label every region as "chrom:start:end".
df_test_reg["real_name"] = df_test_reg["chrom"] + ":" + df_test_reg["start"].astype(str) + ":" + df_test_reg["end"].astype(str)

df_test_reg.head()



Unnamed: 0,chrom,start,end,real_name
0,chr1,827037,827904,chr1:827037:827904
1,chr1,1324214,1325305,chr1:1324214:1325305
2,chr1,2545946,2547720,chr1:2545946:2547720
3,chr1,3899793,3901520,chr1:3899793:3901520
4,chr1,6198771,6199823,chr1:6198771:6199823


In [20]:
# Load the pickled PAAD table (probes as rows, TCGA samples as columns).
# The `with` block closes the file handle once the object is loaded.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("../df_edu_PAAD_raw_beta_vals.pkl", "rb") as fin:
    df_tcga_paad = pickle.load(fin)
df_tcga_paad.head()

Unnamed: 0_level_0,TCGA-IB-AAUR-01A-21D-A38H-05,TCGA-HV-A5A6-01A-11D-A26Q-05,TCGA-3E-AAAY-01A-11D-A38H-05,TCGA-F2-A8YN-01A-11D-A378-05,TCGA-HZ-8005-01A-11D-2202-05,TCGA-US-A77J-01A-11D-A32S-05,TCGA-IB-8126-01A-11D-2399-05,TCGA-HZ-A77O-01A-11D-A33U-05,TCGA-2J-AABU-01A-11D-A40Y-05,TCGA-FB-A545-01A-11D-A26Q-05,...,TCGA-HZ-A4BK-01A-11D-A26Q-05,TCGA-HZ-7925-01A-11D-2157-05,TCGA-F2-6880-01A-11D-2157-05,TCGA-3A-A9IH-01A-12D-A398-05,TCGA-IB-AAUT-01A-11D-A378-05,TCGA-IB-A5SS-01A-11D-A32S-05,TCGA-IB-7891-01A-11D-2202-05,TCGA-Q3-A5QY-01A-12D-A32S-05,TCGA-HZ-7920-01A-11D-2202-05,TCGA-US-A776-01A-13D-A33U-05
Composite Element REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg00000029,0.629938,0.138164,0.315506,0.102873,0.195025,0.571671,0.432601,0.159592,0.160771,0.132146,...,0.150753,0.211368,0.172837,0.141774,0.254292,0.119392,0.292381,0.512129,0.354043,0.086268
cg00000108,,,,,,,,,,,...,,,,,,,,,,
cg00000109,,,,,,,,,,,...,,,,,,,,,,
cg00000165,0.142251,0.346018,0.216611,0.431157,0.250825,0.165749,0.301502,0.419678,0.399122,0.542289,...,0.345913,0.400752,0.163814,0.243549,0.126286,0.145263,0.311933,0.266608,0.132543,0.150964
cg00000236,0.907267,0.857427,0.887274,0.896766,0.91337,0.897943,0.902092,0.879288,0.841911,0.929909,...,0.900374,0.877398,0.892853,0.927821,0.857115,0.868308,0.906295,0.885154,0.817279,0.840524


In [None]:
# Compute per-region, per-sample averages for the PAAD table over the master
# region list. The call also writes "./PAAD_master_region_averages.csv" and
# "./PAAD_master_region_averages_region_summary_stats.csv".
df_paad, paad_region_dict = calc_region_averages(df_master_regions, df_tcga_paad, probe_dict, "PAAD_master_region_averages")

In [24]:
# Preview the first rows of the region-by-sample averages table.
df_paad.head()

Unnamed: 0,TCGA-IB-AAUR-01A-21D-A38H-05,TCGA-HV-A5A6-01A-11D-A26Q-05,TCGA-3E-AAAY-01A-11D-A38H-05,TCGA-F2-A8YN-01A-11D-A378-05,TCGA-HZ-8005-01A-11D-2202-05,TCGA-US-A77J-01A-11D-A32S-05,TCGA-IB-8126-01A-11D-2399-05,TCGA-HZ-A77O-01A-11D-A33U-05,TCGA-2J-AABU-01A-11D-A40Y-05,TCGA-FB-A545-01A-11D-A26Q-05,...,TCGA-HZ-A4BK-01A-11D-A26Q-05,TCGA-HZ-7925-01A-11D-2157-05,TCGA-F2-6880-01A-11D-2157-05,TCGA-3A-A9IH-01A-12D-A398-05,TCGA-IB-AAUT-01A-11D-A378-05,TCGA-IB-A5SS-01A-11D-A32S-05,TCGA-IB-7891-01A-11D-2202-05,TCGA-Q3-A5QY-01A-12D-A32S-05,TCGA-HZ-7920-01A-11D-2202-05,TCGA-US-A776-01A-13D-A33U-05
chr1:827683:827862,0.017097,0.012764,0.020421,0.02103,0.017838,0.024811,0.014587,0.024896,0.023745,0.015544,...,0.014182,0.014843,0.013963,0.028728,0.021399,0.018284,0.019396,0.020952,0.018015,0.020715
chr1:905064:905611,0.018705,0.015934,0.016582,0.04647,0.016597,0.02088,0.019931,0.017857,0.018851,0.019757,...,0.014849,0.019559,0.020146,0.016922,0.029526,0.016194,0.017733,0.017256,0.021504,0.021815
chr1:925421:925756,0.019585,0.013283,0.094054,0.022227,0.206229,0.015569,0.017839,0.016839,0.018653,0.646637,...,0.016472,0.016253,0.018989,0.015501,0.025052,0.016782,0.03121,0.017706,0.025869,0.016031
chr1:1059023:1059377,0.034045,0.027933,0.030345,0.054337,0.049905,0.035542,0.039545,0.044216,0.043935,0.039165,...,0.03243,0.033097,0.037252,0.138297,0.047807,0.029758,0.043434,0.036914,0.042501,0.042992
chr1:1115135:1115467,0.069022,0.045375,0.060741,0.15308,0.323991,0.07848,0.104504,0.080153,0.070445,0.188924,...,0.068312,0.070375,0.051755,0.130445,0.072111,0.205107,0.083947,0.106751,0.085339,0.151829


In [25]:
# Save the region-by-sample averages as CSV (row labels are the region names).
df_paad.to_csv("./PAAD_master_region_averages.csv")

# Save the per-region summary statistics as a real pickle so the content
# matches the ".pkl" extension. Previously CSV text was written to this path
# with the statistic-name row labels dropped. A CSV version of the same
# statistics is already written by calc_region_averages.
df_paad_region_dict = pd.DataFrame(paad_region_dict)
df_paad_region_dict.to_pickle("./PAAD_master_region_averages_region_summary_stats.pkl")