In [1]:
import pandas as pd, os, re

BASE = '../data/raw/cbioportal/'
clin = pd.read_csv(os.path.join(BASE, 'clinical.tsv'), sep='\t')

print(clin.columns.tolist()[:30])       


['Study ID', 'Patient ID', 'Sample ID', 'Diagnosis Age', 'American Joint Committee on Cancer Metastasis Stage Code', 'Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code', 'Neoplasm Disease Stage American Joint Committee on Cancer Code', 'American Joint Committee on Cancer Publication Version Type', 'American Joint Committee on Cancer Tumor Stage Code', 'Alcohol Consumption Frequency', 'Alcohol History Documented', 'Amplification Status', 'Angiolymphatic Invasion', 'Cancer Type', 'Cancer Type Detailed', 'Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage', 'Neoplasm American Joint Committee on Cancer Clinical Regional Lymph Node N Stage', 'Neoplasm American Joint Committee on Cancer Clinical Primary Tumor T Stage', 'Neoplasm American Joint Committee on Cancer Clinical Group Stage', 'Daily Alcohol', 'Days to Sample Collection.', 'Last Alive Less Initial Pathologic Diagnosis Date Calculated Day Value', 'Days to Sample Procurement', 'Dise

In [2]:
[col for col in clin.columns if 'hpv' in col.lower()]

['Hpv status ish', 'Hpv status p16']

In [3]:
[col for col in clin.columns if 'stage' in col.lower()]

['American Joint Committee on Cancer Metastasis Stage Code',
 'Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code',
 'Neoplasm Disease Stage American Joint Committee on Cancer Code',
 'American Joint Committee on Cancer Tumor Stage Code',
 'Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage',
 'Neoplasm American Joint Committee on Cancer Clinical Regional Lymph Node N Stage',
 'Neoplasm American Joint Committee on Cancer Clinical Primary Tumor T Stage',
 'Neoplasm American Joint Committee on Cancer Clinical Group Stage',
 'Stage Other']

In [4]:
hpv_col   = 'Hpv status p16'
stage_col = 'Neoplasm Disease Stage American Joint Committee on Cancer Code'

mask = (
    clin[hpv_col].str.upper().eq('NEGATIVE') &
    clin[stage_col].str.contains(r'III|IV', flags=re.I, na=False)
)
subset_ids = clin.loc[mask, 'Sample ID'].tolist()
print(len(subset_ids))

53


In [10]:
import pandas as pd, os
BASE = '../data/raw/cbioportal/'

expr_head = pd.read_csv(os.path.join(BASE, 'mrna_z.txt'), sep='\t', nrows=3)
print(expr_head.columns[:5].tolist())

['STUDY_ID', 'SAMPLE_ID', 'TAP1', 'B2M', 'IRF1']


In [11]:
cna_head = pd.read_csv(os.path.join(BASE, 'cna.txt'), sep='\t', nrows=3)
print(cna_head.columns[:5].tolist())

['STUDY_ID', 'SAMPLE_ID', 'TAP1', 'B2M', 'IRF1']


In [13]:
BASE = '../data/raw/cbioportal/'

genes = ['TAP1','B2M','IRF1','STAT1','CXCL9',
         'CXCL10','PSMB9','NLRC5','HLA-A','HLA-B']

expr = (pd.read_csv(f'{BASE}mrna_z.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .T)

cna  = (pd.read_csv(f'{BASE}cna.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .T)

mut  = (pd.read_csv(f'{BASE}mutations.txt', sep='\t', low_memory=False)
          .query('Hugo_Symbol in @genes'))
mut  = mut[mut['SAMPLE_ID'].isin(subset_ids)]

low_expr = expr <= -1.5
damage   = (mut.assign(dam=mut['Variant_Classification']
                                 .str.contains('Frame|Nonsense|Splice', case=False))
               .groupby('Hugo_Symbol')['dam'].mean())
del_pct  = (cna == -2).mean(axis=1)

score = (pd.DataFrame({'low_pct':     low_expr.mean(axis=1),
                       'mut_pct':     damage.reindex(genes).fillna(0),
                       'cnv_del_pct': del_pct})
           .assign(rescue_score=lambda d: d.low_pct * (1 - d.mut_pct - d.cnv_del_pct))
           .sort_values('rescue_score', ascending=False))

out_path = '../data/processed/initial_rescue_scores.csv'
os.makedirs('../data/processed', exist_ok=True)
score.to_csv(out_path)
print(out_path)

UndefinedVariableError: name 'Hugo_Symbol' is not defined

In [14]:
BASE = '../data/raw/cbioportal/'

genes = ['TAP1','B2M','IRF1','STAT1','CXCL9',
         'CXCL10','PSMB9','NLRC5','HLA-A','HLA-B']

expr = (pd.read_csv(f'{BASE}mrna_z.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .T)

cna  = (pd.read_csv(f'{BASE}cna.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .T)

mut  = pd.read_csv(f'{BASE}mutations.txt', sep='\t', low_memory=False)
mut  = mut[mut['Hugo_Symbol'].isin(genes) & mut['SAMPLE_ID'].isin(subset_ids)]

low_expr = expr <= -1.5
damage   = (mut.assign(dam=mut['Variant_Classification']
                                 .str.contains('Frame|Nonsense|Splice', case=False))
               .groupby('Hugo_Symbol')['dam'].mean())
del_pct  = (cna == -2).mean(axis=1)

score = (pd.DataFrame({'low_pct':     low_expr.mean(axis=1),
                       'mut_pct':     damage.reindex(genes).fillna(0),
                       'cnv_del_pct': del_pct})
           .assign(rescue_score=lambda d: d.low_pct * (1 - d.mut_pct - d.cnv_del_pct))
           .sort_values('rescue_score', ascending=False))

out_path = '../data/processed/initial_rescue_scores.csv'
os.makedirs('../data/processed', exist_ok=True)
score.to_csv(out_path)
print(out_path)

KeyError: 'Hugo_Symbol'

In [15]:
# inspect mutation file header
mut_head = pd.read_csv(f'{BASE}mutations.txt', sep='\t', nrows=3)
print(mut_head.columns.tolist())

['STUDY_ID', 'SAMPLE_ID', 'TAP1', 'B2M', 'IRF1', 'STAT1', 'CXCL9', 'CXCL10', 'PSMB9', 'NLRC5', 'HLA-A', 'HLA-B']


In [17]:
BASE = '../data/raw/cbioportal/'

genes = ['TAP1','B2M','IRF1','STAT1','CXCL9',
         'CXCL10','PSMB9','NLRC5','HLA-A','HLA-B']

expr = (pd.read_csv(f'{BASE}mrna_z.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .astype(float)
          .T)

cna  = (pd.read_csv(f'{BASE}cna.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .astype(float)
          .T)

mut  = (pd.read_csv(f'{BASE}mutations.txt', sep='\t')
          .set_index('SAMPLE_ID')
          .loc[subset_ids, genes]
          .apply(pd.to_numeric, errors='coerce')
          .fillna(0)
          .T)

low_expr = expr <= -1.5
mut_pct  = (mut != 0).mean(axis=1)
del_pct  = (cna == -2).mean(axis=1)

score = (pd.DataFrame({'low_pct':     low_expr.mean(axis=1),
                       'mut_pct':     mut_pct,
                       'cnv_del_pct': del_pct})
           .assign(rescue_score=lambda d: d.low_pct * (1 - d.mut_pct - d.cnv_del_pct))
           .sort_values('rescue_score', ascending=False))

out_path = '../data/processed/initial_rescue_scores.csv'
os.makedirs('../data/processed', exist_ok=True)
score.to_csv(out_path)
print(out_path)

../data/processed/initial_rescue_scores.csv


In [18]:
git add notebooks/0filter_and_score.ipynb
git add data/processed/initial_rescue_scores.csv
git commit -m "Compute initial rescuer scores for 53 HPV− Stage III/IV samples"
git push

SyntaxError: invalid decimal literal (3507173426.py, line 1)