In [2]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Data import CodonTable
import os

def read_gene_expression(file_path):
    """
    Read gene expression data with RNA-Seq RSEM normalized values

    Parameters:
    file_path (str): Path to gene expression file

    Returns:
    pandas.DataFrame: DataFrame with patient_id, sample_id and gene expression values
    """
    # Read the tab-separated file
    df = pd.read_csv(file_path, sep='\t')
    print(f"Gene expression data loaded: {df.shape[0]} samples, {df.shape[1]-2} genes")
    return df

def read_somatic_mutations(file_path):
    """
    Read somatic mutations data in MAF format

    Parameters:
    file_path (str): Path to mutation annotation file
    2`x
    Returns:
    pandas.DataFrame: DataFrame with mutation information
    """
    # Read the tab-separated MAF file
    df = pd.read_csv(file_path, sep='\t')
    print(f"Somatic mutations data loaded: {df.shape[0]} mutations")
    return df

def read_patient_metadata(file_path):
    """
    Read patient clinical and pathological metadata

    Parameters:
    file_path (str): Path to patient metadata file

    Returns:
    pandas.DataFrame: DataFrame with patient clinical information
    """
    # Read the tab-separated metadata file
    df = pd.read_csv(file_path, sep='\t')
    print(f"Patient metadata loaded: {df.shape[0]} patients")
    return df

def explore_data(gene_expr, somatic_mut, patient_meta):
    """
    Basic exploration of the three datasets
    """
    # Explore patient metadata
    print("\nPatient demographics:")
    print(f"Gender distribution:\n{patient_meta['gender'].value_counts()}")
    print(f"Age statistics:\n{patient_meta['age_at_initial_pathologic_diagnosis'].describe()}")

    # Analyze survival data
    print("\nSurvival statistics:")
    print(f"Overall survival events: {patient_meta['OS'].sum()} deaths out of {patient_meta['OS'].count()} patients")

    # Look at mutations per patient
    if 'patient_id' in somatic_mut.columns:
        mutations_per_patient = somatic_mut['patient_id'].value_counts()
        print(f"\nMutations per patient statistics:")
        print(mutations_per_patient.describe())

    # For gene expression, we could look at the most variable genes
    if gene_expr is not None and gene_expr.shape[1] > 2:
        print("\nGene expression statistics:")
        gene_columns = gene_expr.columns[2:]  # Skipping patient_id and sample_id
        gene_variance = gene_expr[gene_columns].var().sort_values(ascending=False)
        print(f"Top 5 most variable genes:\n{gene_variance.head(5)}")

gene_expression_path = os.path.join(os.getcwd(), 'Team_4_STAD', 'TCGA.STAD.expression.txt')
somatic_mutations_path = os.path.join(os.getcwd(), 'Team_4_STAD', 'TCGA.STAD.mutations.txt')
metadata_path = os.path.join(os.getcwd(),'Team_4_STAD', 'TCGA.STAD.metadata.txt')

gene_expr = read_gene_expression(gene_expression_path)
somatic_mut = read_somatic_mutations(somatic_mutations_path)
patient_meta = read_patient_metadata(metadata_path)

explore_data(gene_expr, somatic_mut, patient_meta)

Gene expression data loaded: 447 samples, 20531 genes
Somatic mutations data loaded: 234941 mutations
Patient metadata loaded: 415 patients

Patient demographics:
Gender distribution:
gender
MALE      265
FEMALE    150
Name: count, dtype: int64
Age statistics:
count    410.000000
mean      65.787805
std       10.746983
min       30.000000
25%       58.000000
50%       67.000000
75%       73.000000
max       90.000000
Name: age_at_initial_pathologic_diagnosis, dtype: float64

Survival statistics:
Overall survival events: 160.0 deaths out of 415 patients

Mutations per patient statistics:
count      415.000000
mean       566.122892
std       1163.583650
min          1.000000
25%        108.000000
50%        173.000000
75%        383.000000
max      14110.000000
Name: count, dtype: float64

Gene expression statistics:
Top 5 most variable genes:
ADAM6|8755     1.094407e+11
KRT4|3851      4.444138e+10
KRT13|3860     3.805078e+10
PGC|5225       2.561217e+10
PGA3|643834    1.474750e+10
dtype:

In [3]:
varient_types = somatic_mut['Variant_Classification'].unique()
print(varient_types)

['Missense_Mutation' 'Silent' 'Frame_Shift_Del' "3'UTR" 'In_Frame_Del'
 'Splice_Site' 'Nonsense_Mutation' 'Intron' "5'UTR"
 'Translation_Start_Site' 'Frame_Shift_Ins' 'In_Frame_Ins' "5'Flank" 'RNA'
 'Nonstop_Mutation' "3'Flank"]


In [5]:

print(patient_meta.info())
print(somatic_mut[['Hugo_Symbol','Variant_Classification']].value_counts())

# clear the rows with no gene name or no chromosome or no start position
muts = somatic_mut.dropna(subset=["Hugo_Symbol","Chromosome","Start_Position"])
meta = patient_meta.dropna(subset=["patient_id"])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   patient_id                           415 non-null    object 
 1   type                                 415 non-null    object 
 2   age_at_initial_pathologic_diagnosis  410 non-null    float64
 3   gender                               415 non-null    object 
 4   race                                 415 non-null    object 
 5   ajcc_pathologic_tumor_stage          415 non-null    object 
 6   histological_type                    415 non-null    object 
 7   histological_grade                   415 non-null    object 
 8   tumor_status                         373 non-null    object 
 9   vital_status                         415 non-null    object 
 10  OS                                   415 non-null    float64
 11  OS.time                         

In [13]:
# Step 1.5: Keep only SNPs
mut_df = muts[muts['Variant_Type'] == 'SNP']

# Step 2: Filter for coding mutations of interest
syn = ['Silent']
nonsyn = ['Missense_Mutation', 'Nonsense_Mutation']
mut_df = mut_df[mut_df['Variant_Classification'].isin(syn + nonsyn)]

# Step 3: Remove hypermutator samples (e.g. top 1% by mutation load)
mut_counts = mut_df['Tumor_Sample_Barcode'].value_counts()
threshold = mut_counts.quantile(0.99)
keep_samples = mut_counts[mut_counts <= threshold].index
muts = mut_df[mut_df['Tumor_Sample_Barcode'].isin(keep_samples)]

In [14]:
# define which classifications are "synonymous" and which are "nonsynonymous"
synonymous = {
    "Silent",
    # "Intron",           # can be extended if needed
    # "IGR",              # Intergenic
    # "RNA"               # non-coding RNA
}
nonsynonymous = {
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Splice_Site",
    "Frame_Shift_Del",
    "Frame_Shift_Ins",
    "Translation_Start_Site",
}

# mark the mutations as synonymous or nonsynonymous
def classify(row):
    if row["Variant_Classification"] in synonymous:
        return False  # synonymous
    elif row["Variant_Classification"] in nonsynonymous:
        return True   # nonsynonymous
    else:
        return None   # cannot be classified

muts["is_nonsynonymous"] = muts.apply(classify, axis=1)
# drop the rows that cannot be classified
muts = muts.dropna(subset=["is_nonsynonymous"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  muts["is_nonsynonymous"] = muts.apply(classify, axis=1)


In [None]:
# force the column to be a boolean
muts["is_nonsynonymous"] = muts["is_nonsynonymous"].astype("boolean")

# ~ is the logical negation
dnds = (
    muts
    .groupby("Hugo_Symbol")["is_nonsynonymous"]
    .agg(
        dN = "sum",             # True is 1
        dS = lambda s: (~s).sum()  # False is 1
    )
    .reset_index()
)
dnds["dN/dS"] = dnds["dN"] / (dnds["dS"] + 1e-6)

# filter out dS == 0
dnds = dnds[dnds["dS"] != 0]

# filter out dN/dS > 100
dnds = dnds[dnds["dN/dS"] < 100]
dnds.sort_values("dN/dS", ascending=False, inplace=True)

print(dnds.head(20))

      Hugo_Symbol   dN  dS      dN/dS
15491        TP53  169   2  84.499958
4625        ERBB2   33   1  32.999967
13885       SMAD4   29   1  28.999971
2759          CFH   28   1  27.999972
11132      PIK3CA   82   3  27.333324
2587         CDH9   25   1  24.999975
3112        CNTN4   25   1  24.999975
6100       GRIN3A   24   1  23.999976
15013       THSD1   23   1  22.999977
620         AMY2B   21   1  20.999979
10300      OR4C16   19   1  18.999981
10272      OR2T12   19   1  18.999981
4381       EGFLAM   19   1  18.999981
6381          HGF   19   1  18.999981
16416       VWA5A   19   1  18.999981
9284         MYH1   19   1  18.999981
3656        DAAM1   18   1  17.999982
2066        CADM1   18   1  17.999982
3070        CNGA4   18   1  17.999982
12243       RASA1   18   1  17.999982


In [19]:
dnds.to_csv("dnds_simple.tsv", sep="\t", index=False)

In [10]:
top_20_genes = dnds.head(20)["Hugo_Symbol"].tolist()

# find all the patient_id with mutations in the top 20 genes
# (a gene may have mutations in multiple patients, we merge and remove duplicates)
patients_top20 = (
    muts[muts["Hugo_Symbol"].isin(top_20_genes)]
    ["patient_id"]
    .unique()
    .tolist()
)
print(f"total patients: {len(patients_top20)}")


# filter the patients in meta
meta_top20 = meta[meta["patient_id"].isin(patients_top20)].copy()

# 4a. see their cancer type (type column)
print(meta_top20["type"].value_counts())

# 4b. (optional) see their tumor status and vital status
print(meta_top20["tumor_status"].value_counts())
print(meta_top20["vital_status"].value_counts())



total patients: 163
type
STAD    163
Name: count, dtype: int64
tumor_status
TUMOR FREE       99
WITH TUMOR       44
[Discrepancy]     1
Name: count, dtype: int64
vital_status
Alive            105
Dead              57
[Discrepancy]      1
Name: count, dtype: int64
