In [95]:
# Step 1: Load mutation file FINAL
mut_df = pd.read_csv("TCGA.STAD.mutations.txt", sep="\t")

# Step 1.5: Keep only SNPs
mut_df = mut_df[mut_df['Variant_Type'] == 'SNP']

# Step 2: Filter for coding mutations of interest
syn = ['Silent']
nonsyn = ['Missense_Mutation', 'Nonsense_Mutation']
mut_df = mut_df[mut_df['Variant_Classification'].isin(syn + nonsyn)]

# Step 3: Remove hypermutator samples (samples above the 99th percentile of mutation load)
mut_counts = mut_df['Tumor_Sample_Barcode'].value_counts()
threshold = mut_counts.quantile(0.99)
keep_samples = mut_counts[mut_counts <= threshold].index
# .copy() detaches the result from the frames it was filtered out of, so the
# column added in Step 4 is written to mut_df itself and pandas does not emit
# a SettingWithCopyWarning.
mut_df = mut_df[mut_df['Tumor_Sample_Barcode'].isin(keep_samples)].copy()

# Step 4: Classify mutations as synonymous or nonsynonymous.
# Anything not listed in `syn` is labelled 'nonsyn'; after Step 2 that can only
# be one of the `nonsyn` classifications.
mut_df['class'] = (
    mut_df['Variant_Classification']
    .isin(syn)
    .map({True: 'syn', False: 'nonsyn'})
)

# Step 5: Count N and S per gene (rows = gene, columns = class)
counts = mut_df.groupby(['Hugo_Symbol', 'class']).size().unstack(fill_value=0)
counts = counts.rename(columns={'syn': 'S', 'nonsyn': 'N'})

# Ensure both columns exist even if one class is absent from the data
for required in ('N', 'S'):
    if required not in counts.columns:
        counts[required] = 0

# Step 6: Compute naive dN/dS (exclude genes with S = 0 to avoid dividing by zero)
counts = counts[counts['S'] > 0].copy()
counts['dNdS_naive'] = counts['N'] / counts['S']

# Step 7: View top candidates
top = counts.sort_values('dNdS_naive', ascending=False).head(10)
print(top[['N', 'S', 'dNdS_naive']])


class          N  S  dNdS_naive
Hugo_Symbol                    
TP53         169  2   84.500000
ERBB2         33  1   33.000000
SMAD4         29  1   29.000000
CFH           28  1   28.000000
PIK3CA        82  3   27.333333
CDH9          25  1   25.000000
CNTN4         25  1   25.000000
GRIN3A        24  1   24.000000
THSD1         23  1   23.000000
AMY2B         21  1   21.000000


In [96]:
import warnings

# Install an "ignore" filter so ResourceWarning messages are not shown
# by code that runs after this cell.
warnings.simplefilter("ignore", ResourceWarning)


In [97]:
pip install mysql-connector-python


[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [98]:
import mysql.connector
import pandas as pd

# Connection settings for the UCSC MySQL host (hg19 database)
ucsc_settings = {
    "host": "genome-mysql.soe.ucsc.edu",
    "user": "genome",
    "database": "hg19",
}
conn = mysql.connector.connect(**ucsc_settings)
cursor = conn.cursor()

# Fetch the gene symbol, CDS bounds and exon coordinate lists from refGene
query = """
SELECT name2, cdsStart, cdsEnd, exonStarts, exonEnds
FROM refGene;
"""
cursor.execute(query)
rows = cursor.fetchall()

# Wrap the fetched rows in a DataFrame; `name2` is exposed as Hugo_Symbol so it
# shares a column name with the mutation table.
refgene_columns = ["Hugo_Symbol", "cdsStart", "cdsEnd", "exonStarts", "exonEnds"]
df = pd.DataFrame(rows, columns=refgene_columns)

# Compute CDS length per transcript
def compute_cds_length(row):
    """Return the coding-sequence length of one transcript row.

    For every exon, the part of the exon interval that falls between
    ``cdsStart`` and ``cdsEnd`` is measured as ``end - start`` and the pieces
    are summed.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'exonStarts'`` and ``'exonEnds'`` as comma-separated
        integer lists (``str``, ``bytes`` or ``bytearray``; leading/trailing
        commas are tolerated) and numeric ``'cdsStart'`` / ``'cdsEnd'``.

    Returns
    -------
    int
        Total exon/CDS overlap. 0 when no exon overlaps the CDS interval or
        when the exon lists are empty.

    Raises
    ------
    ValueError
        If a coordinate token is not an integer.
    """

    def _parse_coords(raw):
        # Binary columns may arrive as bytes or bytearray depending on the
        # database driver; normalise both to text before splitting.
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        text = raw.strip().strip(',')
        # An empty list would otherwise become [''] and fail in int('').
        return [int(token) for token in text.split(',')] if text else []

    starts = _parse_coords(row['exonStarts'])
    ends = _parse_coords(row['exonEnds'])
    cds_start, cds_end = row['cdsStart'], row['cdsEnd']

    # Clip each exon to the CDS interval; exons outside it contribute nothing.
    return sum(
        max(0, min(exon_end, cds_end) - max(exon_start, cds_start))
        for exon_start, exon_end in zip(starts, ends)
    )


# CDS length of every transcript row
df['cds_length'] = df.apply(compute_cds_length, axis=1)

# A gene with several transcripts is represented by its longest CDS
gene_lengths = df.groupby("Hugo_Symbol", as_index=False)['cds_length'].max()

# Discard genes whose longest CDS has length 0
has_coding_sequence = gene_lengths['cds_length'].gt(0)
gene_lengths = gene_lengths.loc[has_coding_sequence]

# Persist the per-gene lengths as a tab-separated file
gene_lengths.to_csv("gene_coding_lengths.tsv", sep="\t", index=False)

# Release the cursor, then the connection
cursor.close()
conn.close()


In [99]:
# Step 5: Count N and S per gene
counts = (
    mut_df.groupby(['Hugo_Symbol', 'class'])
    .size()
    .unstack(fill_value=0)
    .rename(columns={'syn': 'S', 'nonsyn': 'N'})
    .reset_index()
)

# Ensure both count columns exist even if one class is absent from the data
for required in ('N', 'S'):
    if required not in counts.columns:
        counts[required] = 0

# Merge mutation counts with gene sizes (genes without a known CDS length are dropped)
df = counts.merge(gene_lengths, on='Hugo_Symbol', how='inner')

# Step 1: Estimate number of sites (opportunities) as fixed 75% / 25% shares of the CDS
df['L_N'] = 0.75 * df['cds_length']
df['L_S'] = 0.25 * df['cds_length']

# Step 2: Compute normalized rates
df['dN'] = df['N'] / df['L_N']
df['dS'] = df['S'] / df['L_S']

# Step 3: Compute size-adjusted dN/dS.
# NOTE: because L_N and L_S are fixed fractions of the same cds_length, the
# length cancels and the ratio equals N / (3 * S).
# .copy() makes the filtered frame independent so the new column can be added
# without a SettingWithCopyWarning.
df = df[df['dS'] > 0].copy()  # avoid division by zero
df['dNdS_corrected'] = df['dN'] / df['dS']

# Optional: sort for potential drivers
top = df.sort_values('dNdS_corrected', ascending=False).head(10)
print(top[['Hugo_Symbol', 'N', 'S', 'cds_length', 'dNdS_corrected']])


      Hugo_Symbol    N  S  cds_length  dNdS_corrected
14396        TP53  169  2        1182       28.166667
4223        ERBB2   33  1        3885       11.000000
12847       SMAD4   29  1        1659        9.666667
2426          CFH   28  1        3696        9.333333
10207      PIK3CA   82  3        3207        9.111111
2257         CDH9   25  1        2370        8.333333
2773        CNTN4   25  1        3081        8.333333
5570       GRIN3A   24  1        3348        8.000000
13947       THSD1   23  1        2559        7.666667
566         AMY2B   21  1        1536        7.000000


In [100]:
# `df` already carries L_N, L_S, dN, dS and dNdS_corrected (restricted to
# dS > 0) from the cell that built it, so recomputing them here would only
# repeat the same arithmetic and assign into a filtered slice. This cell just
# widens the ranking to the top 30 genes.
top = df.sort_values('dNdS_corrected', ascending=False).head(30)
print(top[['Hugo_Symbol', 'N', 'S', 'cds_length', 'dNdS_corrected']])


      Hugo_Symbol    N  S  cds_length  dNdS_corrected
14396        TP53  169  2        1182       28.166667
4223        ERBB2   33  1        3885       11.000000
12847       SMAD4   29  1        1659        9.666667
2426          CFH   28  1        3696        9.333333
10207      PIK3CA   82  3        3207        9.111111
2257         CDH9   25  1        2370        8.333333
2773        CNTN4   25  1        3081        8.333333
5570       GRIN3A   24  1        3348        8.000000
13947       THSD1   23  1        2559        7.666667
566         AMY2B   21  1        1536        7.000000
5827          HGF   19  1        2187        6.333333
15217       VWA5A   19  1        2361        6.333333
3987       EGFLAM   19  1        3054        6.333333
8394         MYH1   19  1        5820        6.333333
9394       OR4C16   19  1         933        6.333333
9366       OR2T12   19  1         963        6.333333
3287        DAAM1   18  1        3237        6.000000
2732        CNGA4   18  1   

In [101]:
def shuffle_genes(mut_df, rng=None):
    """Return a copy of ``mut_df`` with the 'Hugo_Symbol' labels permuted across rows.

    All other columns keep their row positions, so each mutation keeps its
    attributes but is assigned to a randomly drawn gene label from the same
    table. The input frame is not modified.

    Parameters
    ----------
    mut_df : pandas.DataFrame
        Table with a 'Hugo_Symbol' column.
    rng : object with a ``shuffle`` method, optional
        Random source used for the in-place shuffle, e.g.
        ``numpy.random.default_rng(seed)`` or ``numpy.random.RandomState(seed)``.
        When omitted, NumPy's global random state is used (the previous
        behaviour), which can be seeded with ``numpy.random.seed``.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``mut_df``.
    """
    shuffled_df = mut_df.copy()
    gene_labels = shuffled_df['Hugo_Symbol'].tolist()
    random_source = np.random if rng is None else rng
    random_source.shuffle(gene_labels)  # in-place permutation of the label list
    shuffled_df['Hugo_Symbol'] = gene_labels
    return shuffled_df

In [None]:
import numpy as np ### warning this takes a really long time!!!
n_permutations = 1000
permutation_seed = 0  # fixed seed so the null distributions are reproducible
null_dnds_distributions = {}  # gene -> list of dN/dS values observed under shuffling

# shuffle_genes draws from NumPy's global random state unless told otherwise
np.random.seed(permutation_seed)

for _ in range(n_permutations):
    # shuffle_genes works on its own copy, so mut_df can be passed directly
    shuffled_mut_df = shuffle_genes(mut_df)
    shuffled_counts = (
        shuffled_mut_df.groupby(['Hugo_Symbol', 'class'])
        .size()
        .unstack(fill_value=0)
        .rename(columns={'syn': 'S', 'nonsyn': 'N'})
        .reset_index()
    )
    for required in ('N', 'S'):
        if required not in shuffled_counts.columns:
            shuffled_counts[required] = 0

    shuffled_df_merged = shuffled_counts.merge(gene_lengths, on='Hugo_Symbol', how='inner')
    shuffled_df_merged['L_N'] = 0.75 * shuffled_df_merged['cds_length']
    shuffled_df_merged['L_S'] = 0.25 * shuffled_df_merged['cds_length']
    shuffled_df_merged['dN'] = shuffled_df_merged['N'] / shuffled_df_merged['L_N']

    # Explicit copies keep the column assignments below off filtered slices
    shuffled_df_merged = shuffled_df_merged[shuffled_df_merged['L_S'] > 0].copy()  # Avoid division by zero
    shuffled_df_merged['dS'] = shuffled_df_merged['S'] / shuffled_df_merged['L_S']
    shuffled_df_merged = shuffled_df_merged[shuffled_df_merged['dS'] > 0].copy()  # Avoid division by zero
    shuffled_df_merged['dNdS_corrected'] = shuffled_df_merged['dN'] / shuffled_df_merged['dS']

    # Walk gene/value pairs once instead of re-filtering the whole frame per gene.
    # Only genes with dS > 0 in this permutation contribute a value, so the
    # lists can have different lengths per gene.
    gene_value_pairs = zip(shuffled_df_merged['Hugo_Symbol'], shuffled_df_merged['dNdS_corrected'])
    for gene, dnds in gene_value_pairs:
        null_dnds_distributions.setdefault(gene, []).append(dnds)

In [None]:
# Empirical one-sided p-values for each gene's observed dN/dS against its
# permutation null. Genes with no null values (or a NaN observation) get NaN.
p_values = {}
observed_by_gene = df.set_index('Hugo_Symbol')['dNdS_corrected']
for gene, observed_dnds in observed_by_gene.items():
    null_distribution = np.asarray(null_dnds_distributions.get(gene, []), dtype=float)
    if np.isnan(observed_dnds) or null_distribution.size == 0:
        p_values[gene] = {'p_greater': np.nan, 'p_less': np.nan}
        continue

    # Add one to numerator and denominator so a finite number of permutations
    # can never yield an exact p-value of 0; the smallest possible value is
    # 1 / (number of null values + 1).
    n_null = null_distribution.size
    # For positive selection (dN/dS > 1)
    p_greater = (np.sum(null_distribution >= observed_dnds) + 1) / (n_null + 1)
    # For negative selection (dN/dS < 1)
    p_less = (np.sum(null_distribution <= observed_dnds) + 1) / (n_null + 1)
    p_values[gene] = {'p_greater': p_greater, 'p_less': p_less}

print("\nP-values based on permutation:")
for gene, p_vals in p_values.items():
    print(f"{gene}: p_greater={p_vals['p_greater']:.4f}, p_less={p_vals['p_less']:.4f}")

In [None]:
# Public location of multipletests (the sandbox module is experimental)
from statsmodels.stats.multitest import multipletests

# Pair each gene with its p-value once so names and values cannot drift out of step
tested_genes = [
    (gene, p['p_greater'])
    for gene, p in p_values.items()
    if not np.isnan(p['p_greater'])
]
gene_names_with_p = [gene for gene, _ in tested_genes]
all_p_greater = np.array([p_value for _, p_value in tested_genes])

# Always defined, so later cells can rely on it even when nothing was testable
corrected_p_greater_dict = {}
if len(all_p_greater) > 0:
    # Benjamini-Hochberg adjustment across all genes that have a p-value
    reject, p_corrected, _, _ = multipletests(all_p_greater, method='fdr_bh')
    corrected_p_greater_dict = dict(zip(gene_names_with_p, p_corrected))
    print("\nFDR-corrected p-values (greater):")
    for gene, p in corrected_p_greater_dict.items():
        print(f"{gene}: {p:.4f}")

# Repeat for p_less if you are interested in negative selection

In [None]:
import pandas as pd

# Inputs: 'corrected_p_greater_dict' (gene -> FDR-adjusted p-value) and the
# per-gene 'df' table holding 'dNdS_corrected'.


def _attach_dnds(fdr_frame):
    """Inner-join a gene-indexed FDR frame with each gene's dNdS_corrected from `df`."""
    dnds_by_gene = df[['Hugo_Symbol', 'dNdS_corrected']].set_index('Hugo_Symbol')
    return fdr_frame.merge(
        dnds_by_gene,
        left_index=True,
        right_index=True,
        how='inner'
    )


# Gene-indexed Series of corrected p-values
corrected_p_series = pd.Series(corrected_p_greater_dict)

# Significance cut-off applied to the FDR-adjusted values
significance_threshold = 0.1

# Genes whose adjusted p-value falls below the cut-off
below_threshold = corrected_p_series < significance_threshold
significant_p_values = corrected_p_series[below_threshold]

if significant_p_values.empty:
    print("No genes found to be significant at FDR < {}".format(significance_threshold))
else:
    print("\nTop Potential Driver Genes (FDR < {}):".format(significance_threshold))
    # Pair the significant p-values with dNdS_corrected, most significant first
    significant_drivers_df = pd.DataFrame({'FDR_p_value': significant_p_values})
    drivers_with_dnds = _attach_dnds(significant_drivers_df).sort_values(by='FDR_p_value')
    print(drivers_with_dnds)

# Optional: the top_n genes with the lowest adjusted p-value, with their dN/dS
top_n = 40
top_genes_by_p = corrected_p_series.sort_values().head(top_n)
if not top_genes_by_p.empty:
    print(f"\nTop {top_n} Potential Driver Genes (by lowest FDR) with dN/dS:")
    top_n_drivers_df = pd.DataFrame({'FDR_p_value': top_genes_by_p})
    top_n_with_dnds = _attach_dnds(top_n_drivers_df)
    print(top_n_with_dnds)