# Multi-Mapping Reads: Gene Expression

### *E. coli*

In [1]:
import urllib.request
import gzip
import pandas as pd
import numpy as np
from scipy.stats import norm

# Retrieve Gene Expression Data

In [2]:
# -----------------------------
# Step 1: Download files
# -----------------------------
# Remote sources.
# - Annotation: GFF for E. coli K-12 MG1655 (the remote file is gzip-compressed).
# - Coverage:   example pooled-mRNA WIG tracks, one per strand (f = forward, r = reverse).
gff_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gff.gz"
wig_f_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE53nnn/GSE53767/suppl/GSE53767_mrna-rdm-pooled_f.wig.gz"
wig_r_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE53nnn/GSE53767/suppl/GSE53767_mrna-rdm-pooled_r.wig.gz"

# Fetch everything into the current working directory.
# NOTE: the annotation keeps its gzip compression but is stored under a name
# without a ".gz" suffix, so readers must request gzip decompression explicitly.
urllib.request.urlretrieve(gff_url, filename="E_coli_K12.gff")
urllib.request.urlretrieve(wig_f_url, filename="GSE53767_mrna_rdm_f.wig.gz")
urllib.request.urlretrieve(wig_r_url, filename="GSE53767_mrna_rdm_r.wig.gz")

('GSE53767_mrna_rdm_r.wig.gz', <http.client.HTTPMessage at 0x7bb788866750>)

In [3]:
# -----------------------------
# Step 2: Parse GFF gene annotations
# -----------------------------
# Read the annotation through the relative name "E_coli_K12.gff" (resolved
# against the current working directory) rather than a Kaggle-only absolute
# path, so the notebook also runs outside /kaggle/working.
# The file name carries no ".gz" suffix, so gzip decompression has to be
# requested explicitly; pandas cannot infer it from the name.
gff_df = pd.read_csv(
    "E_coli_K12.gff",
    sep="\t",
    # Lines starting with "#" (GFF directives/comments) are skipped.
    # Caveat: pandas also drops the remainder of any data line after a "#".
    comment="#",
    header=None,
    names=["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"],
    compression="gzip",
)

# Keep only "gene" features; .copy() detaches the subset before adding a column.
genes_df = gff_df[gff_df["type"] == "gene"].copy()

# gene_id is the value of the ID=... entry in the attributes column
# (everything up to the next ";"). expand=False yields a Series directly.
genes_df["gene_id"] = genes_df["attributes"].str.extract(r"ID=([^;]+)", expand=False)

# Reset to a 0..n-1 index so per-gene results can be attached by position.
genes_df = genes_df[["gene_id", "start", "end", "strand"]].reset_index(drop=True)



In [4]:
# -----------------------------
# Step 3: Parse WIG files
# -----------------------------
def read_wig(file_path):
    """Read a gzip-compressed WIG file into a position/coverage table.

    Handles both WIG data layouts:

    * two-column data lines (``position value``), as used by ``variableStep``
      sections and by files without any declaration line;
    * ``fixedStep`` sections, where each data line holds a single value and
      positions are derived from the ``start=``/``step=`` header fields
      (``step`` defaults to 1 when absent).

    ``track`` lines are skipped. Any other line that fits neither layout is
    ignored. ``chrom=`` and ``span=`` header fields are not interpreted: every
    data line yields exactly one row, regardless of chromosome or span.

    Parameters
    ----------
    file_path : str or path-like
        Path to the gzip-compressed WIG file.

    Returns
    -------
    pandas.DataFrame
        Columns ``position`` (int) and ``coverage`` (float), one row per data
        line, in file order.

    Raises
    ------
    ValueError
        If a ``fixedStep`` declaration has no ``start=`` field, or a data
        field cannot be parsed as a number.
    """
    records = []
    # Next position to assign inside a fixedStep section; None outside of one.
    fixed_next = None
    fixed_step = 1

    with gzip.open(file_path, "rt") as handle:
        for line in handle:
            if line.startswith("track"):
                continue
            if line.startswith("variableStep"):
                fixed_next = None
                continue
            if line.startswith("fixedStep"):
                # Header looks like: fixedStep chrom=... start=N step=S [span=W]
                fields = dict(
                    token.split("=", 1) for token in line.split()[1:] if "=" in token
                )
                if "start" not in fields:
                    raise ValueError(
                        f"fixedStep declaration without start= field: {line.strip()!r}"
                    )
                fixed_next = int(fields["start"])
                fixed_step = int(fields.get("step", 1))
                continue

            parts = line.split()
            if len(parts) == 2:
                records.append((int(parts[0]), float(parts[1])))
            elif len(parts) == 1 and fixed_next is not None:
                # Single-value line: only meaningful inside a fixedStep section.
                records.append((fixed_next, float(parts[0])))
                fixed_next += fixed_step

    return pd.DataFrame(records, columns=["position", "coverage"])

# Load the per-strand coverage tracks ("_f" = forward strand, "_r" = reverse strand)
wig_f_df = read_wig(file_path="GSE53767_mrna_rdm_f.wig.gz")
wig_r_df = read_wig(file_path="GSE53767_mrna_rdm_r.wig.gz")

In [5]:
# -----------------------------
# Step 4: Aggregate per-gene expression
# -----------------------------
def compute_gene_expression(wig_df, genes_df):
    """Sum coverage over each gene's inclusive [start, end] interval.

    Coverage rows are sorted by position once. Each gene's rows are then
    located by binary search and summed, instead of scanning the whole
    coverage table once per gene.

    Parameters
    ----------
    wig_df : pandas.DataFrame
        Needs columns ``position`` and ``coverage``. Rows may be in any order.
    genes_df : pandas.DataFrame
        Needs columns ``start`` and ``end``; both bounds are inclusive.

    Returns
    -------
    pandas.Series
        Named ``expression``, one value per row of ``genes_df``, carrying
        ``genes_df``'s index so that ``genes_df[col] = result`` lines up row by
        row whatever index ``genes_df`` has. Genes without any coverage
        rows get 0.0. NaN coverage values are ignored in the sums.
    """
    positions = wig_df["position"].to_numpy(dtype=np.float64)
    coverage = wig_df["coverage"].to_numpy(dtype=np.float64)

    # Sort once so every gene interval maps to one contiguous slice.
    order = np.argsort(positions, kind="stable")
    positions = positions[order]
    coverage = coverage[order]

    starts = genes_df["start"].to_numpy(dtype=np.float64)
    ends = genes_df["end"].to_numpy(dtype=np.float64)

    # side="left" keeps rows with position == start;
    # side="right" keeps rows with position == end.
    first = np.searchsorted(positions, starts, side="left")
    past_last = np.searchsorted(positions, ends, side="right")

    # nansum mirrors the NaN-skipping behaviour of pandas' Series.sum();
    # an empty slice (no coverage, or start > end) sums to 0.0.
    totals = [np.nansum(coverage[lo:hi]) for lo, hi in zip(first, past_last)]
    return pd.Series(totals, index=genes_df.index, name="expression", dtype=np.float64)

# Per-strand totals for every gene, then their sum as the overall expression
genes_df["expr_f"] = compute_gene_expression(wig_df=wig_f_df, genes_df=genes_df)
genes_df["expr_r"] = compute_gene_expression(wig_df=wig_r_df, genes_df=genes_df)
genes_df["expression"] = genes_df["expr_f"].add(genes_df["expr_r"])

# Narrow down to the columns used by the downstream analysis
output_columns = ["gene_id", "start", "end", "expression"]
final_df = genes_df[output_columns]
print(final_df.head())

      gene_id  start   end    expression
0  gene-b0001    190   255   1095.808969
1  gene-b0002    337  2799  11621.629880
2  gene-b0003   2801  3733   3581.617976
3  gene-b0004   3734  5020   5462.200000
4  gene-b0005   5234  5530    204.196429


# z-Score & p-Value Calculation

In [6]:
# Ensure we work on a copy to avoid SettingWithCopyWarning
final_df = final_df.copy()

# -------------------------------
# Parameters
# -------------------------------
num_random = 100     # number of random genes for null distribution
p_threshold = 0.05   # significance cutoff

# -------------------------------
# Step 1: Null distribution from random genes
# -------------------------------
# The null mean/std are estimated from a fixed-seed random sample of genes,
# so repeated runs give the same reference values.
random_genes = final_df.sample(n=num_random, random_state=42)
null_mean = random_genes["expression"].mean()
null_std = random_genes["expression"].std()

# -------------------------------
# Step 2: Z-score for each gene
# -------------------------------
final_df.loc[:, "z_score"] = (final_df["expression"] - null_mean) / null_std

# -------------------------------
# Step 3: Convert Z-score to p-value (one-sided, upper tail)
# -------------------------------
# norm.sf is the survival function, i.e. 1 - cdf computed directly.
# Writing it as `1 - norm.cdf(z)` rounds to exactly 0.0 once cdf(z) is
# indistinguishable from 1 in floating point (large z), which would report
# the most extreme genes with p == 0 instead of a tiny positive value.
final_df.loc[:, "p_value"] = norm.sf(final_df["z_score"])

# -------------------------------
# Step 4: Determine significance
# -------------------------------
final_df.loc[:, "significant"] = final_df["p_value"] < p_threshold

# -------------------------------
# Inspect results
# -------------------------------
print(final_df.head())

      gene_id  start   end    expression   z_score   p_value  significant
0  gene-b0001    190   255   1095.808969 -0.131232  0.552204        False
1  gene-b0002    337  2799  11621.629880  0.319856  0.374539        False
2  gene-b0003   2801  3733   3581.617976 -0.024701  0.509853        False
3  gene-b0004   3734  5020   5462.200000  0.055892  0.477714        False
4  gene-b0005   5234  5530    204.196429 -0.169442  0.567275        False


# Statistics

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
final_df.describe()

Unnamed: 0,start,end,expression,z_score,p_value
count,4506.0,4506.0,4506.0,4506.0,4506.0
mean,2309520.0,2310417.0,3148.631972,-0.043257,0.538475
std,1332803.0,1332810.0,16098.116576,0.68989,0.090167
min,190.0,255.0,0.0,-0.178193,0.0
25%,1174426.0,1175530.0,40.0,-0.176479,0.55127
50%,2293931.0,2295142.0,247.627069,-0.167581,0.566543
75%,3450469.0,3451154.0,1150.902931,-0.12887,0.570041
max,4640942.0,4641628.0,346421.447409,14.667815,0.570714


In [8]:
# Write the results next to the downloaded inputs (current working directory)
# instead of a Kaggle-only absolute path, so the export also works elsewhere.
# The DataFrame index is written as the first (unnamed) CSV column.
final_df.to_csv('final_ecoli.csv')