In [1]:
import numpy as np
import pandas as pd

In [3]:
import xenaPython as xena

In [4]:
# Load the phenotype table (tab-separated, ISO-8859-1 encoded) from the home directory.
df = pd.read_csv('~/TCGA-DLBC.GDC_phenotype.tsv', sep='\t', encoding = "ISO-8859-1")

In [5]:
# Preview the first rows to check which columns were parsed.
df.head()

Unnamed: 0,submitter_id.samples,age_at_initial_pathologic_diagnosis,b_lymphocyte_genotyping_method,b_symptoms,batch_number,bcr,bcr_followup_barcode,bcr_followup_uuid,submitter_id,bone_marrow_biopsy_done,...,intermediate_dimension.samples,is_ffpe.samples,longest_dimension.samples,oct_embedded.samples,preservation_method.samples,sample_type.samples,sample_type_id.samples,shortest_dimension.samples,state.samples,tissue_type.samples
0,TCGA-FA-A6HN-01A,73,,NO,326.45.0,Nationwide Children's Hospital,TCGA-FA-A6HN-F63567,2FF9624C-C1FF-4A3B-9FFB-0A2E1136A1CD,TCGA-FA-A6HN,NO,...,,False,,True,,Primary Tumor,1,,released,Not Reported
1,TCGA-GR-A4D4-01A,57,,YES,326.45.0,Nationwide Children's Hospital,TCGA-GR-A4D4-F55574,C7009E8E-9B88-4AC1-BCBE-5F909CCB7159,TCGA-GR-A4D4,YES,...,,False,,False,,Primary Tumor,1,,released,Not Reported
2,TCGA-GS-A9TT-01A,70,,NO,397.39.0,Nationwide Children's Hospital,TCGA-GS-A9TT-F66756,01292404-F955-478D-8493-A93DA649AF6C,TCGA-GS-A9TT,YES,...,,False,,True,,Primary Tumor,1,,released,Not Reported
3,TCGA-FF-A7CQ-01A,74,,YES,397.39.0,Nationwide Children's Hospital,TCGA-FF-A7CQ-F50787,4B8ED15F-FE05-4943-AA0E-F97D130A9D26,TCGA-FF-A7CQ,YES,...,,False,,False,,Primary Tumor,1,,released,Not Reported
4,TCGA-FF-8046-01A,51,,NO,212.48.0,Nationwide Children's Hospital,TCGA-FF-8046-F30762,85B1822D-B3B9-4102-9E8C-DCF8052EA79A,TCGA-FF-8046,,...,,False,,,,Primary Tumor,1,,released,Not Reported


In [6]:
# Index the phenotype table by sample ID so rows can be selected by label.
# Guarded so that re-running this cell is a no-op: after the first run the
# column has already moved into the index and set_index would raise KeyError.
if df.index.name != 'submitter_id.samples':
    df = df.set_index('submitter_id.samples')

In [7]:
# Split the phenotype rows into the two clinical stages being compared.
df_S4 = df[df['clinical_stage'] == "Stage IV"]

df_S1 = df[df['clinical_stage'] == "Stage I"]

In [8]:
# Xena hub URL and dataset name passed to the xena.* calls in this notebook.
hub = "https://gdc.xenahubs.net"
# NOTE(review): `cohort` is not referenced by any of the cells visible here.
cohort = "GDC TCGA Large B-cell Lymphoma (DLBC)"
dataset = "TCGA-DLBC.htseq_counts.tsv"

In [9]:
# Ask the hub for the sample IDs contained in the dataset and report how many came back.
samples = xena.dataset_samples(hub, dataset, None)
print("len(samples): {}".format(len(samples)))

len(samples): 48


In [10]:
# Keep only the dataset samples whose phenotype row is Stage IV / Stage I,
# preserving the order in which `samples` lists them. Membership is tested
# against the Index itself (hash lookup) rather than its `.values` array,
# which would be scanned linearly for every sample.
stage4 = [s for s in samples if s in df_S4.index]
stage1 = [s for s in samples if s in df_S1.index]

In [11]:
# Report how many dataset samples ended up in each stage group.
print("len(stage4): {}, len(stage1): {}".format(len(stage4),len(stage1)))

len(stage4): 12, len(stage1): 8


In [12]:
# Write the phenotype file: the clinical stage of every selected sample,
# Stage IV samples first, followed by Stage I samples.
columns = ['clinical_stage']
pheno = df.loc[stage4 + stage1, columns]

pheno.to_csv("~/phenotype.tsv", sep='\t')

In [13]:
# Retrieve the dataset's field identifiers from the hub; they are used as probe IDs below.
probes = xena.dataset_field(hub, dataset)

In [14]:
# Because of connection timeout, fetch the values of probes by batch
def divide_probes_to_batches(num_probes, limit=10000):
    """Split ``num_probes`` into a list of batch sizes of at most ``limit``.

    Returns as many entries equal to ``limit`` as fit into ``num_probes``,
    followed by one entry for the leftover when it is positive. When not even
    one full batch fits, the result is the single-element list
    ``[num_probes]``.
    """
    full_batches, leftover = divmod(num_probes, limit)
    sizes = [limit] * full_batches
    if not sizes:
        # No full batch fits: everything goes into one batch.
        return [num_probes]
    if leftover > 0:
        sizes.append(leftover)
    return sizes

In [15]:
# Fetch counts from xena db
def dataset_fetch(probes, samples, prefix=''):
    """Fetch the values for ``probes`` and ``samples`` from Xena as a DataFrame.

    The request is made against the notebook-level ``hub`` and ``dataset``.

    Parameters
    ----------
    probes : list
        Probe identifiers to request; they become the row index, which is
        named ``'Probes'``.
    samples : list
        Sample identifiers to request; they become the column labels unless
        ``prefix`` is given.
    prefix : str, optional
        When non-empty, the columns are labelled ``prefix + '1'``,
        ``prefix + '2'``, ... in the order of ``samples`` instead of using
        the sample identifiers themselves.

    Returns
    -------
    pandas.DataFrame
        The fetched records with probes as the index and one column per sample.
    """
    counts = xena.dataset_fetch(hub, dataset, samples, probes)
    if prefix != '':
        column_labels = [prefix + str(pos + 1) for pos in range(len(samples))]
    else:
        column_labels = samples
    result = pd.DataFrame.from_records(counts)
    # Install the index together with its name. Naming the old index first and
    # then replacing it would silently drop the 'Probes' label.
    result.index = pd.Index(probes, name='Probes')
    result.columns = column_labels
    return result

In [16]:
# Fetch the values in probe batches and stitch the pieces together once at
# the end. Collecting the per-batch frames in lists avoids re-copying the
# accumulated frame on every iteration and avoids concatenating onto an
# empty DataFrame.
batches = divide_probes_to_batches(len(probes),limit=2000)
stage4_parts = []
stage1_parts = []
i = 0
for batch in batches:
    batch_probes = probes[i:i+batch]
    # fetch stage 4 batch probes counts
    stage4_parts.append(dataset_fetch(batch_probes, stage4, prefix='stage4_'))
    # fetch stage 1 batch probes counts
    stage1_parts.append(dataset_fetch(batch_probes, stage1, prefix='stage1_'))
    i += batch
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df_stage4_counts = pd.concat(stage4_parts) if stage4_parts else pd.DataFrame()
df_stage1_counts = pd.concat(stage1_parts) if stage1_parts else pd.DataFrame()

In [17]:
# Place the Stage IV and Stage I columns side by side, keeping only the index
# labels present in both frames (join='inner'), and save the result as TSV.
df_counts = pd.concat([df_stage4_counts, df_stage1_counts], axis=1, join='inner')
df_counts.to_csv("~/log_counts.tsv", sep='\t')

In [18]:
def valuation_formula(x):
    """Replace every entry of ``x`` after the first with ``round(2**v) - 1``.

    The first entry is left untouched. ``x`` is modified in place and also
    returned.

    NOTE(review): this looks like the inverse of a ``log2(count + 1)``
    transform; confirm against the dataset's documentation.

    Parameters
    ----------
    x : pandas.Series or list
        A row whose first entry is an identifier and whose remaining entries
        are numeric.

    Returns
    -------
    The same object ``x`` with the converted values.
    """
    # Access by position explicitly. On a Series with string labels, plain
    # ``x[i]`` with an integer relies on a deprecated positional fallback and
    # would be interpreted as a label on newer pandas versions.
    target = x.iloc if hasattr(x, 'iloc') else x
    for i in range(1, len(x)):
        target[i] = round(2 ** target[i]) - 1
    return x

In [19]:
def norm_counts_to_raw(df):
    """Return the result of applying ``valuation_formula`` to every row of ``df``.

    Rows are handed over one at a time (``axis=1``).
    """
    return df.apply(valuation_formula, axis=1)

In [20]:
# Re-read the saved counts from disk. No index column is specified, so the
# probe IDs come back as the first ordinary column.
df_log = pd.read_csv('~/log_counts.tsv', sep='\t', encoding = "ISO-8859-1")

In [21]:
# Convert every value column row by row; the first column is left as is.
df_delog = norm_counts_to_raw(df_log)

In [22]:
# Preview the converted counts.
df_delog.head()

Unnamed: 0.1,Unnamed: 0,stage4_1,stage4_2,stage4_3,stage4_4,stage4_5,stage4_6,stage4_7,stage4_8,stage4_9,...,stage4_11,stage4_12,stage1_1,stage1_2,stage1_3,stage1_4,stage1_5,stage1_6,stage1_7,stage1_8
0,ENSG00000000003.13,48,76,90,69,78,163,376,66,149,...,161,258,86,258,127,97,219,261,307,63
1,ENSG00000000005.5,1,0,0,2,0,4,1,0,5,...,0,5,0,0,0,0,1,0,11,2
2,ENSG00000000419.11,1977,2573,1143,1745,451,2503,2538,2319,914,...,1104,2134,1008,1360,2047,1573,2855,821,1584,1183
3,ENSG00000000457.12,640,1089,340,1192,205,1757,904,945,314,...,540,932,181,683,1167,254,512,137,1709,509
4,ENSG00000000460.15,587,2033,171,1360,90,2434,598,1488,131,...,278,1686,408,679,643,671,1052,270,836,547


In [23]:
# Save the converted counts as CSV.
# NOTE(review): with the default index=True the integer row index is written
# as an extra first column - confirm that downstream readers expect it.
df_delog.to_csv("~/counts.csv")

In [24]:
# One row per count column; the first column of df_delog is skipped.
t_df = pd.DataFrame({'id': df_delog.columns[1:]})

In [25]:
# Give each count column its condition label and its original sample ID.
# The n-th "stage4" column is paired with stage4[n]; every other column is
# paired with the next entry of stage1.
condition, samples = [], []
i = j = 0
for rep in t_df['id']:
    if not rep.startswith("stage4"):
        condition.append('S1')
        samples.append(stage1[j])
        j += 1
        continue
    condition.append('S4')
    samples.append(stage4[i])
    i += 1

In [26]:
# Attach the condition label and the original sample ID to each column id.
t_df = t_df.assign(conditionName=condition, sampleId=samples)

In [27]:
# Save the sample table as a tab-separated file without the row index.
t_df.to_csv('~/pData.txt', sep='\t', encoding="ISO-8859-1", index=False)