In [None]:
#Load Expression Data

import pandas as pd

# Load expression matrix (tab-separated, quotes present)
expr_data = pd.read_csv("GSE5281_series_matrix.txt", sep="\t", quotechar='"')

# Set the probe ID as index
expr_data.set_index("ID_REF", inplace=True)

# Preview data
expr_data.iloc[:5, :5]


Unnamed: 0_level_0,GSM119615,GSM119616,GSM119617,GSM119618,GSM119619
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1007_s_at,693.9733,807.14514,1722.625,1668.0575,2231.2969
1053_at,9.755318,19.05227,13.060389,14.652245,138.67683
117_at,13.346846,18.217276,97.42547,98.197945,106.46844
121_at,173.09245,289.3154,537.1828,496.34656,477.86517
1255_g_at,52.61695,110.375984,99.021736,17.306772,628.81616


In [None]:
# Load GPL570 annotation and map probe IDs to gene symbols

# Load GPL570 annotation file (skip commented lines)
gpl = pd.read_csv("GPL570-55999.txt", sep="\t", comment="#", low_memory=False)

# Keep only relevant columns
gpl_subset = gpl[["ID", "Gene Symbol"]].dropna()

# Handle cases where multiple gene symbols are listed
gpl_subset["Gene Symbol"] = gpl_subset["Gene Symbol"].apply(lambda x: x.split(" /// ")[0])

# Create a mapping dictionary: Probe ID → Gene Symbol
probe_to_gene = dict(zip(gpl_subset["ID"], gpl_subset["Gene Symbol"]))


In [None]:
# Map probe IDs to gene symbols and group duplicates

# Copy expression data to avoid modifying original
gene_expr = expr_data.copy()

# Reset index to move 'ID_REF' (probe ID) into a column
gene_expr.reset_index(inplace=True)

# Map probe IDs to gene symbols
gene_expr["Gene Symbol"] = gene_expr["ID_REF"].map(probe_to_gene)

# Drop rows where no gene symbol was found
gene_expr.dropna(subset=["Gene Symbol"], inplace=True)

# Drop the old 'ID_REF' column (probe ID)
gene_expr.drop(columns=["ID_REF"], inplace=True)

# Group by gene symbol and average duplicate probes
gene_expr = gene_expr.groupby("Gene Symbol").mean()

# Show final shape and preview
print("Shape after mapping to gene symbols:", gene_expr.shape)
gene_expr.head()


Shape after mapping to gene symbols: (22880, 161)


Unnamed: 0_level_0,GSM119615,GSM119616,GSM119617,GSM119618,GSM119619,GSM119620,GSM119621,GSM119622,GSM119623,GSM119624,...,GSM238945,GSM238946,GSM238947,GSM238948,GSM238949,GSM238951,GSM238952,GSM238953,GSM238955,GSM238963
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,16.092667,34.991314,10.746553,72.35921,109.71797,34.677658,75.050835,40.150936,47.761227,76.548485,...,109.429,53.0127,45.0397,10.7033,79.8856,18.0125,8.59821,33.2066,112.204,11.6542
A1BG-AS1,5.116955,67.80051,21.632008,175.54417,79.79171,171.9951,93.9859,228.9866,87.37928,193.31186,...,5.92621,12.7464,10.8453,37.0154,63.8156,43.6563,8.34086,4.38778,11.2991,201.288
A1CF,13.562965,40.780445,131.987224,55.218665,57.709375,69.129072,112.32379,47.167341,32.731592,43.284466,...,41.40465,2.886165,18.16635,34.83335,76.5694,23.83176,34.6981,8.58483,19.648335,49.599955
A2M,315.357861,133.123514,289.263175,243.74955,940.70905,211.150755,133.338405,161.728978,56.483171,65.616185,...,394.3604,135.0515,550.881,299.4503,424.3655,129.9565,205.4498,242.84525,761.7817,1418.275
A2M-AS1,111.023705,80.44576,107.21052,107.729126,178.1987,101.63902,112.68627,76.16195,97.21381,56.894566,...,93.3006,151.017,178.294,226.079,95.162,90.6083,185.984,63.234,224.083,98.7805


In [14]:
# Noramlize the gene expression data

from sklearn.preprocessing import StandardScaler

# Z-score normalize each gene across samples (rows)
scaler = StandardScaler()
normalized_data = scaler.fit_transform(gene_expr)

# Convert back to DataFrame with same index and columns
normalized_df = pd.DataFrame(normalized_data, index=gene_expr.index, columns=gene_expr.columns)

# Preview
normalized_df.iloc[:5, :5]


Unnamed: 0_level_0,GSM119615,GSM119616,GSM119617,GSM119618,GSM119619
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1BG,-0.349162,-0.28366,-0.174479,-0.150433,-0.308049
A1BG-AS1,-0.367884,-0.236762,-0.165546,-0.050606,-0.321817
A1CF,-0.353477,-0.275385,-0.074989,-0.167016,-0.331976
A2M,0.161333,-0.143388,0.054071,0.01538,0.074249
A2M-AS1,-0.187226,-0.218687,-0.095321,-0.116214,-0.276545


In [15]:
# Save to CSV
normalized_df.to_csv("GSE5281_normalized_gene_expression.csv")
