In [128]:
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import StandardScaler

Reading all the ADNI files

In [2]:
import gzip
# Read the entire file to understand structure
with gzip.open('ADNI_GEO_formatted.csv.gz', 'rt') as f:
    lines = f.readlines()

print(f"Total lines: {len(lines)}")
print("\nFirst 100 lines:")
print("="*60)

for i in range(min(100, len(lines))):
    print(f"{i}: {lines[i].strip()[:200]}")

# Look for data sections
print("\n" + "="*60)
print("SEARCHING FOR DATA SECTIONS")
print("="*60)

for i, line in enumerate(lines):
    if 'expression_section' in line.lower() or 'table_begin' in line.lower():
        print(f"Found at line {i}: {line.strip()}")
        # Show next 10 lines after this marker
        print("\nNext 10 lines:")
        for j in range(i, min(i+10, len(lines))):
            print(f"{j}: {lines[j].strip()[:200]}")
        break

Total lines: 1497

First 100 lines:
0: # ADNI Gene Expression Dataset
1: # Format: GEO-like
2: 
3: !Sample_phenotype_section
4: PTID,AGE,PTGENDER,PTEDUCAT,Diagnosis
5: 002_S_0413,76.3,Female,16,Control
6: 002_S_0685,89.6,Female,16,Control
7: 002_S_0729,65.1,Female,16,MCI
8: 002_S_1155,57.8,Male,20,MCI
9: 002_S_1261,71.1,Female,16,Control
10: 002_S_1268,82.7,Male,20,MCI
11: 002_S_1280,70.7,Female,14,Control
12: 002_S_2043,72.2,Female,20,MCI
13: 002_S_2073,63.4,Female,20,MCI
14: 002_S_4171,69.4,Male,16,MCI
15: 002_S_4213,78.0,Female,14,Control
16: 002_S_4219,79.4,Female,17,MCI
17: 002_S_4225,69.9,Male,20,Control
18: 002_S_4229,66.4,Male,15,MCI
19: 002_S_4237,80.9,Female,13,MCI
20: 002_S_4251,71.9,Male,19,MCI
21: 002_S_4270,74.6,Female,16,Control
22: 002_S_4447,67.5,Female,19,MCI
23: 002_S_4473,74.8,Male,16,MCI
24: 002_S_4521,70.0,Male,18,MCI
25: 002_S_4654,75.4,Female,18,MCI
26: 003_S_0907,88.6,Female,20,Control
27: 003_S_0908,62.9,Female,16,MCI
28: 003_S_0981,84.3,Female,16,Control
29: 

This file has TWO sections:

Phenotype section (lines 4-750): Clinical data (Age, Sex, Education, Diagnosis)
Expression section (lines 752+): Gene expression data

In [3]:
try:
    with gzip.open('ADNI_combined_dataset.csv.gz', 'rt') as f:
        lines2 = f.readlines()

    print(f"Total lines: {len(lines2)}")
    print(f"\nFirst 50 lines:")
    for i in range(min(50, len(lines2))):
        print(f"  {i}: {lines2[i].strip()[:150]}")

    # Try to load as CSV
    print("\nTrying to load as CSV...")
    adni_combined = pd.read_csv('ADNI_combined_dataset.csv.gz',
                                compression='gzip',
                                on_bad_lines='skip')
    print(f"✓ Loaded successfully!")
    print(f"Shape: {adni_combined.shape}")
    print(f"Columns: {adni_combined.columns.tolist()}")
    print(f"\nFirst 10 rows:")
    print(adni_combined.head(10))

except Exception as e:
    print(f"✗ Error: {e}")

Total lines: 745

First 50 lines:
  0: PTID,HIST1H3G_11715100_at,HIST1H3G_11715101_s_at,HIST1H3G_11715102_x_at,TNFAIP8L1_11715103_x_at,OTOP2_11715104_s_at,C17ORF78_11715105_at,CTAGE6 || CTA
  1: 009_S_4612,2.497,2.275,2.067,3.589,1.808,2.153,2.317,4.704,2.219,2.796,5.827,2.253,2.584,4.986,5.183,1.99,2.416,1.78,2.358,2.356,2.185,2.144,4.732,2.1
  2: 114_S_0416,2.331,2.506,2.545,3.718,2.35,1.99,2.429,4.765,2.337,2.828,4.996,2.135,2.336,5.25,5.25,2.009,2.637,2.522,2.665,2.302,2.324,2.31,5.61,2.252,3
  3: 128_S_1407,2.431,2.25,1.938,3.141,2.347,1.883,2.136,4.705,2.294,2.507,4.816,2.232,2.39,5.039,5.206,1.948,2.195,1.733,2.392,2.77,2.431,2.375,4.523,2.28
  4: 072_S_4610,2.516,2.642,2.067,3.249,2.081,2.074,2.305,4.461,2.514,2.558,5.318,2.234,2.426,5.111,5.111,1.762,2.127,1.858,2.491,2.206,2.126,2.44,4.489,2.
  5: 033_S_1116,2.342,2.394,2.127,3.282,2.311,1.995,2.367,4.947,2.268,2.742,5.754,2.325,2.404,5.09,5.131,1.961,2.33,1.748,2.459,2.337,2.119,2.102,5.356,2.2
  6: 941_S_4036,3.059,2.841,2.

  adni_combined = pd.read_csv('ADNI_combined_dataset.csv.gz',


✓ Loaded successfully!
Shape: (744, 48275)
Columns: ['PTID', 'HIST1H3G_11715100_at', 'HIST1H3G_11715101_s_at', 'HIST1H3G_11715102_x_at', 'TNFAIP8L1_11715103_x_at', 'OTOP2_11715104_s_at', 'C17ORF78_11715105_at', 'CTAGE6 || CTAGE15_11715106_x_at', 'F8A2 || F8A3 || F8A1_11715107_s_at', 'LINC01098_11715108_x_at', 'SAMD7_11715109_at', 'ARRDC5_11715110_at', 'CGB || CGB1 || CGB2 || CGB5 || CGB7 || CGB8_11715111_s_at', 'C1ORF173_11715112_at', 'FAM86C1_11715113_x_at', 'FAM86C1_11715114_x_at', 'HIST1H2BI_11715115_s_at', 'HIST1H4E_11715116_s_at', 'HIST1H2AJ_11715117_x_at', 'HIST1H2BF_11715118_s_at', 'C2CD4B_11715119_s_at', 'HIST1H2BE_11715120_s_at', 'HIST1H3A_11715121_s_at', 'RAB3D_11715122_at', 'PDZD9_11715123_s_at', 'CACNG8_11715124_s_at', 'MORC2_11715125_at', 'HIST1H2BN_11715126_s_at', 'GPR32_11715127_s_at', 'PCDHGB5_11715128_s_at', 'ZNF600_11715129_s_at', 'KRTAP6-2_11715130_s_at', 'KRTAP20-1_11715131_s_at', 'HIST1H2BH_11715132_x_at', 'PCDHGA1_11715133_s_at', 'GLIPR1L2_11715134_s_at', 'SNAPC5_

ADNI_combined_dataset.csv.gz → Gene expression (745 patients × 48,275 probes)

In [4]:
adni_annotations = pd.read_csv('ADNI_gene_annotations.csv')

In [5]:
adni_annotations.head()

Unnamed: 0,ProbeSet,LocusLink,Symbol,GeneID
0,11715100_at,LOC8355,HIST1H3G,HIST1H3G_11715100_at
1,11715101_s_at,LOC8355,HIST1H3G,HIST1H3G_11715101_s_at
2,11715102_x_at,LOC8355,HIST1H3G,HIST1H3G_11715102_x_at
3,11715103_x_at,LOC126282,TNFAIP8L1,TNFAIP8L1_11715103_x_at
4,11715104_s_at,LOC92736,OTOP2,OTOP2_11715104_s_at


In [6]:
adni_annotations.shape

(48158, 4)

ADNI_gene_annotations.csv → Map probe IDs to gene symbols

# Extract phenotype

In [7]:
# Load ADNI phenotype
with gzip.open('ADNI_GEO_formatted.csv.gz', 'rt') as f:
    lines = f.readlines()

# Find phenotype section
pheno_start = None
pheno_end = None

for i, line in enumerate(lines):
    if '!Sample_phenotype_section' in line:
        pheno_start = i + 1
    if '!Sample_expression_section' in line:
        pheno_end = i
        break

# Extract phenotype
pheno_text = ''.join(lines[pheno_start:pheno_end])
adni_pheno = pd.read_csv(StringIO(pheno_text))

In [8]:
adni_pheno.head()

Unnamed: 0,PTID,AGE,PTGENDER,PTEDUCAT,Diagnosis
0,002_S_0413,76.3,Female,16,Control
1,002_S_0685,89.6,Female,16,Control
2,002_S_0729,65.1,Female,16,MCI
3,002_S_1155,57.8,Male,20,MCI
4,002_S_1261,71.1,Female,16,Control


In [9]:
adni_pheno.shape

(744, 5)

In [10]:
adni_pheno.isnull().sum()

Unnamed: 0,0
PTID,0
AGE,0
PTGENDER,0
PTEDUCAT,0
Diagnosis,44


In [11]:
adni_pheno.Diagnosis.unique()

array(['Control', 'MCI', nan], dtype=object)

In [12]:
adni_combined.shape

(744, 48275)

Check if the patient IDs match between phenotype and combined expression:

In [13]:
print("="*60)
print("CHECKING PATIENT ID OVERLAP")
print("="*60)

# Get patient IDs from each
pheno_ptids = set(adni_pheno['PTID'])
combined_ptids = set(adni_combined['PTID'])

print(f"Phenotype patients: {len(pheno_ptids)}")
print(f"Combined expression patients: {len(combined_ptids)}")

# Check overlap
common_ptids = pheno_ptids & combined_ptids
only_pheno = pheno_ptids - combined_ptids
only_combined = combined_ptids - pheno_ptids

print(f"\nCommon patients: {len(common_ptids)}")
print(f"Only in phenotype: {len(only_pheno)}")
print(f"Only in combined: {len(only_combined)}")

if pheno_ptids == combined_ptids:
    print("\n✓ IDENTICAL patient IDs!")
    print("  → Can directly merge phenotype + expression")
else:
    print("\n⚠ DIFFERENT patient IDs!")
    print(f"  → Need to filter to {len(common_ptids)} common patients")

    if len(only_pheno) > 0:
        print(f"\nSample patients only in phenotype: {list(only_pheno)[:5]}")
    if len(only_combined) > 0:
        print(f"Sample patients only in combined: {list(only_combined)[:5]}")

# Check diagnosis distribution for common patients
print("\n" + "="*60)
print("DIAGNOSIS DISTRIBUTION FOR COMMON PATIENTS")
print("="*60)

if len(common_ptids) > 0:
    adni_pheno_common = adni_pheno[adni_pheno['PTID'].isin(common_ptids)]
    print(adni_pheno_common['Diagnosis'].value_counts())
    print(f"Missing diagnosis in common patients: {adni_pheno_common['Diagnosis'].isna().sum()}")

CHECKING PATIENT ID OVERLAP
Phenotype patients: 744
Combined expression patients: 744

Common patients: 744
Only in phenotype: 0
Only in combined: 0

✓ IDENTICAL patient IDs!
  → Can directly merge phenotype + expression

DIAGNOSIS DISTRIBUTION FOR COMMON PATIENTS
Diagnosis
MCI        439
Control    261
Name: count, dtype: int64
Missing diagnosis in common patients: 44


In [14]:
print(adni_annotations.shape)
print(adni_combined.shape)
print(adni_pheno.shape)

(48158, 4)
(744, 48275)
(744, 5)


In [15]:
print("="*80)
print("MAPPING ADNI PROBES TO GENE SYMBOLS")
print("="*80)

print(f"adni_combined shape: {adni_combined.shape}")
print(f"adni_annotations shape: {adni_annotations.shape}")

# ============================================================
# Create probe-to-gene mapping
# ============================================================

print("\n" + "="*60)
print("CREATING PROBE-TO-GENE MAPPING")
print("="*60)

# Create mapping dictionary
probe_to_gene_adni = dict(zip(adni_annotations['ProbeSet'],
                              adni_annotations['Symbol']))

print(f"✓ Mapping dictionary created: {len(probe_to_gene_adni):,} mappings")

# Show sample
print("\nSample mappings from annotation:")
for i in range(10):
    probe = adni_annotations['ProbeSet'].iloc[i]
    gene = adni_annotations['Symbol'].iloc[i]
    print(f"  {probe} → {gene}")

# ============================================================
# Map ADNI expression column names
# ============================================================

print("\n" + "="*60)
print("MAPPING ADNI EXPRESSION COLUMNS")
print("="*60)

# Get probe columns (exclude PTID)
probe_columns = adni_combined.columns[1:].tolist()

print(f"Total columns in adni_combined: {len(adni_combined.columns)}")
print(f"Probe columns (excluding PTID): {len(probe_columns)}")

print(f"\nSample column names:")
print(probe_columns[:10])

# Try to match column names to probes
# Column format: "HIST1H3G_11715100_at"
# ProbeSet format: "11715100_at"

column_to_gene = {}
matched = 0
unmatched = 0

for col in probe_columns:
    # Extract probe ID (after first underscore)
    if '_' in col:
        probe_id = col.split('_', 1)[1]  # Get everything after first _
    else:
        probe_id = col

    # Look up gene symbol
    if probe_id in probe_to_gene_adni:
        column_to_gene[col] = probe_to_gene_adni[probe_id]
        matched += 1
    else:
        column_to_gene[col] = None
        unmatched += 1

print(f"\n✓ Mapping complete!")
print(f"  Matched: {matched:,} ({matched/len(probe_columns)*100:.1f}%)")
print(f"  Unmatched: {unmatched:,} ({unmatched/len(probe_columns)*100:.1f}%)")

# Show sample results
print(f"\nSample column→gene mappings:")
for i, (col, gene) in enumerate(list(column_to_gene.items())[:10]):
    print(f"  {col} → {gene}")

print("\nReady to apply mapping to ADNI dataset!")

MAPPING ADNI PROBES TO GENE SYMBOLS
adni_combined shape: (744, 48275)
adni_annotations shape: (48158, 4)

CREATING PROBE-TO-GENE MAPPING
✓ Mapping dictionary created: 48,158 mappings

Sample mappings from annotation:
  11715100_at → HIST1H3G
  11715101_s_at → HIST1H3G
  11715102_x_at → HIST1H3G
  11715103_x_at → TNFAIP8L1
  11715104_s_at → OTOP2
  11715105_at → C17ORF78
  11715106_x_at → CTAGE6 || CTAGE15
  11715107_s_at → F8A2 || F8A3 || F8A1
  11715108_x_at → LINC01098
  11715109_at → SAMD7

MAPPING ADNI EXPRESSION COLUMNS
Total columns in adni_combined: 48275
Probe columns (excluding PTID): 48274

Sample column names:
['HIST1H3G_11715100_at', 'HIST1H3G_11715101_s_at', 'HIST1H3G_11715102_x_at', 'TNFAIP8L1_11715103_x_at', 'OTOP2_11715104_s_at', 'C17ORF78_11715105_at', 'CTAGE6 || CTAGE15_11715106_x_at', 'F8A2 || F8A3 || F8A1_11715107_s_at', 'LINC01098_11715108_x_at', 'SAMD7_11715109_at']

✓ Mapping complete!
  Matched: 48,151 (99.7%)
  Unmatched: 123 (0.3%)

Sample column→gene mappings

Apply mapping

In [16]:
# Rename columns
adni_renamed = adni_combined.copy()

# Create new column names
new_columns = ['PTID']  # Keep PTID as is
for col in adni_combined.columns[1:]:
    if col in column_to_gene and column_to_gene[col] is not None:
        new_columns.append(column_to_gene[col])
    else:
        new_columns.append(None)  # Mark for removal

# Apply new column names
adni_renamed.columns = new_columns

# Remove columns with None (unmapped probes)
adni_renamed = adni_renamed.loc[:, adni_renamed.columns.notna()]

In [17]:
adni_renamed.shape

(744, 48152)

In [18]:

adni_renamed.head()

Unnamed: 0,PTID,HIST1H3G,HIST1H3G.1,HIST1H3G.2,TNFAIP8L1,OTOP2,C17ORF78,CTAGE6 || CTAGE15,F8A2 || F8A3 || F8A1,LINC01098,...,GAPDH,GAPDH.1,STAT1,STAT1.1,STAT1.2,STAT1.3,RNA18S5 || RNA45S5,RNA18S5 || RNA45S5.1,RNA18S5 || RNA45S5.2,RNA28S5 || RNA45S5
0,009_S_4612,2.497,2.275,2.067,3.589,1.808,2.153,2.317,4.704,2.219,...,10.948,11.196,9.823,9.625,7.204,6.408,8.001,8.927,11.967,5.496
1,114_S_0416,2.331,2.506,2.545,3.718,2.35,1.99,2.429,4.765,2.337,...,11.205,11.538,9.603,9.645,7.469,6.573,7.782,8.849,11.929,5.351
2,128_S_1407,2.431,2.25,1.938,3.141,2.347,1.883,2.136,4.705,2.294,...,10.865,11.015,10.12,9.676,7.362,7.166,7.367,8.587,11.734,4.752
3,072_S_4610,2.516,2.642,2.067,3.249,2.081,2.074,2.305,4.461,2.514,...,10.543,10.87,9.405,8.573,5.796,5.599,8.218,9.197,11.955,5.575
4,033_S_1116,2.342,2.394,2.127,3.282,2.311,1.995,2.367,4.947,2.268,...,10.924,11.228,9.424,9.754,7.123,6.548,7.936,8.727,11.948,5.375


In [19]:
# Set PTID as column names and transpose
adni_transposed = adni_renamed.set_index('PTID').T

In [20]:
# Set index name
adni_transposed.index.name = 'Gene_Symbol'

# Handle duplicate genes (average)

In [21]:
duplicates = adni_transposed.index.duplicated().sum()
unique_genes = adni_transposed.index.nunique()

print(f"Total rows: {len(adni_transposed)}")
print(f"Unique genes: {unique_genes:,}")

Total rows: 48151
Unique genes: 20,088


In [22]:
adni_final = adni_transposed.groupby(adni_transposed.index).mean()

In [23]:
adni_final.shape

(20088, 744)

In [24]:
adni_final.head()

PTID,009_S_4612,114_S_0416,128_S_1407,072_S_4610,033_S_1116,941_S_4036,024_S_4280,021_S_4421,072_S_4206,126_S_4686,...,006_S_0731,941_S_4365,127_S_2213,130_S_4660,023_S_1046,007_S_4611,027_S_2245,016_S_4353,036_S_4491,007_S_4272
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-Dec,2.787,2.791,2.785,2.792,3.145,2.93,2.759,2.958,2.673,2.825,...,2.718,2.911,3.165,2.706,2.797,2.829,2.535,2.83,3.048,2.87
1-Mar,6.71375,6.5615,7.14375,6.47125,6.86375,6.54475,6.5655,6.9,6.5435,6.45375,...,7.20475,6.62125,7.13475,6.3625,7.39125,6.837,6.77075,6.34975,5.62675,6.69825
1-Sep,7.2785,7.039,6.6125,6.5965,6.91,7.221,7.3575,6.9755,6.4455,7.284,...,6.5715,7.255,6.605,6.955,7.273,6.9,6.946,6.806,7.4735,7.2145
10-Mar,2.108,2.053,2.07,1.957,1.841,1.959,2.019,2.342,2.3,2.142,...,2.375,1.903,1.984,2.149,2.333,1.955,2.095,2.012,1.986,1.981
10-Sep,2.8622,2.1668,3.5736,4.0072,3.621,2.5436,2.6338,3.277,2.4782,2.3846,...,3.0104,3.1752,3.5444,2.4196,2.528,3.158,2.8792,2.6224,2.6202,2.94


###Reading GSE dataset

In [27]:
# Load the txt files directly
df1 = pd.read_csv('GSE110226_series_matrix.txt', sep='\t', comment='!')
df2 = pd.read_csv('GSE63060_series_matrix.txt', sep='\t', comment='!')
df3 = pd.read_csv('GSE85426_series_matrix.txt', sep='\t', comment='!')

In [28]:
df1.head()

Unnamed: 0,ID_REF,GSM2982966,GSM2982967,GSM2982968,GSM2982969,GSM2982970,GSM2982971,GSM2982972,GSM2982973,GSM2982974,...,GSM2982976,GSM2982977,GSM2982978,GSM2982979,GSM2982980,GSM2982981,GSM2982982,GSM2982983,GSM2982984,GSM2982985
0,100121619_TGI_at,10.6,10.0947,9.7352,9.9345,9.552,9.6255,9.8473,9.9985,10.1182,...,9.4863,10.5809,9.3915,10.1396,9.6465,10.2163,9.6905,9.9437,9.508,9.8577
1,100121620_TGI_at,8.4341,8.4733,8.2953,8.9725,8.268,7.9887,8.1406,8.5426,8.6006,...,9.0652,9.2629,8.2649,8.3215,8.6814,8.9863,8.0667,8.8329,8.7048,8.7147
2,100121621_TGI_at,7.2959,6.4806,7.5204,5.8395,6.788,6.8215,5.4302,5.4316,6.3003,...,6.8911,6.4997,5.4497,6.7203,5.6509,7.6414,7.5428,5.8877,6.5964,6.8925
3,100121622_TGI_at,3.9982,3.1535,2.8409,3.1074,2.7183,2.965,2.4411,2.6701,3.4411,...,3.1047,2.3837,3.3391,3.6272,3.1876,2.591,2.9501,4.8632,4.1583,3.4706
4,100121623_TGI_at,9.5218,10.5117,9.7821,9.8826,9.7382,9.6683,10.2143,9.8672,9.6203,...,9.5969,9.6159,9.9517,9.9085,9.9684,9.4336,9.8635,9.8085,10.0965,9.6134


###Preprocess df1

In [31]:
# Find df1's platform ID
print("\nFinding platform ID for df1...")
with open('GSE110226_series_matrix.txt', 'r') as f:
    for line in f:
        if 'platform_id' in line.lower():
            print(line.strip())
            break


Finding platform ID for df1...
!Series_platform_id	"GPL10379"


In [37]:
#download the annotation file
import os
import urllib.request
platform_id_df1 = "GPL10379"
url = f"https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL10nnn/{platform_id_df1}/annot/{platform_id_df1}.annot.gz"
print(f"Downloading...")

try:
    urllib.request.urlretrieve(url, f'{platform_id_df1}.annot.gz')
    print("✓ Download successful!")

    file_size = os.path.getsize(f'{platform_id_df1}.annot.gz')
    print(f"File size: {file_size:,} bytes ({file_size/1024:.2f} KB)")

except Exception as e:
    print(f"✗ Download failed: {e}")
    print("\nTrying alternative soft file format...")

    url_soft = f"https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL10nnn/{platform_id_df1}/soft/{platform_id_df1}_family.soft.gz"
    print(f"Alternative URL: {url_soft}")

    try:
        urllib.request.urlretrieve(url_soft, f'{platform_id_df1}_family.soft.gz')
        print("✓ Soft file download successful!")
    except Exception as e2:
        print(f"✗ Also failed: {e2}")

Downloading...
✓ Download successful!
File size: 8,197,525 bytes (8005.40 KB)


In [38]:
# First, check the structure
with gzip.open(f'{platform_id_df1}.annot.gz', 'rt') as f:
        lines = f.readlines()

print(f"Total lines: {len(lines)}")

# Find table start
for i, line in enumerate(lines[:50]):
        if 'platform_table_begin' in line.lower():
            print(f"\nTable begins at line {i}")
            print(f"Header (line {i+1}): {lines[i+1].strip()[:200]}")
            skiprows = i + 1
            break

# Read the annotation
annotation_df1 = pd.read_csv(
            f'{platform_id_df1}.annot.gz',
            sep='\t',
            compression='gzip',
            skiprows=skiprows,
            low_memory=False
        )

Total lines: 52407

Table begins at line 26
Header (line 27): ID	Gene title	Gene symbol	Gene ID	UniGene title	UniGene symbol	UniGene ID	Nucleotide Title	GI	GenBank Accession	Platform_CLONEID	Platform_ORF	Platform_SPOTID	Chromosome location	Chromosome annotation	


In [39]:
# Create mapping dictionary (remove rows with empty gene symbols)
annotation_clean_df1 = annotation_df1[annotation_df1['Gene symbol'].notna()].copy()
probe_to_gene_df1 = dict(zip(annotation_clean_df1['ID'], annotation_clean_df1['Gene symbol']))

In [40]:
# Test the mapping with actual probes from df

test_probes = df1['ID_REF'].head(10).tolist()
print("\nProbes from df1 and their gene symbols:")
for probe in test_probes:
    gene = probe_to_gene_df1.get(probe, 'NOT FOUND')
    print(f"  {probe} --> {gene}")

# Save the annotation
annotation_clean_df1.to_csv(f'{platform_id_df1}_annotation.csv', index=False)


Probes from df1 and their gene symbols:
  100121619_TGI_at --> NOT FOUND
  100121620_TGI_at --> MPV17L
  100121621_TGI_at --> ALDH8A1
  100121622_TGI_at --> VSIG1
  100121623_TGI_at --> PLD3
  100121624_TGI_at --> NUDT15
  100121625_TGI_at --> TRMT1
  100121626_TGI_at --> NOT FOUND
  100121627_TGI_at --> CLTC
  100121628_TGI_at --> PSORS1C1


In [41]:
# Step 1: Map probe IDs to gene symbols
df1_mapped = df1.copy()
df1_mapped['Gene_Symbol'] = df1_mapped['ID_REF'].map(probe_to_gene_df1)

# Step 2: Check mapping statistics
total_probes = len(df1_mapped)
mapped_probes = df1_mapped['Gene_Symbol'].notna().sum()
unmapped_probes = df1_mapped['Gene_Symbol'].isna().sum()
print(f"  Total probes: {total_probes}")
print(f"  Successfully mapped: {mapped_probes} ({mapped_probes/total_probes*100:.2f}%)")
print(f"  Not mapped (NaN): {unmapped_probes} ({unmapped_probes/total_probes*100:.2f}%)")

  Total probes: 51627
  Successfully mapped: 37229 (72.11%)
  Not mapped (NaN): 14398 (27.89%)


In [42]:
# Step 3: Remove rows with NaN gene symbols
df1_clean = df1_mapped[df1_mapped['Gene_Symbol'].notna()].copy()

In [43]:
df1_clean.shape

(37229, 22)

In [44]:
# Step 4: Check for duplicates
df1_clean['Gene_Symbol'].duplicated().sum()

np.int64(16125)

In [45]:
df1_clean['Gene_Symbol'].nunique()

21104

In [46]:
# Drop ID_REF column and set Gene_Symbol as index
df1_clean = df1_clean.drop('ID_REF', axis=1)
df1_clean = df1_clean.set_index('Gene_Symbol')

In [47]:
# Average duplicate genes
df1_final = df1_clean.groupby(df1_clean.index).mean()

In [48]:
df1_final.shape

(21104, 20)

In [49]:
# Verify no more duplicates
df1_final.index.duplicated().sum()

np.int64(0)

In [50]:
df1_final.head()

Unnamed: 0_level_0,GSM2982966,GSM2982967,GSM2982968,GSM2982969,GSM2982970,GSM2982971,GSM2982972,GSM2982973,GSM2982974,GSM2982975,GSM2982976,GSM2982977,GSM2982978,GSM2982979,GSM2982980,GSM2982981,GSM2982982,GSM2982983,GSM2982984,GSM2982985
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A1BG,6.38405,5.9938,5.9515,6.15115,5.9254,5.82265,5.8589,5.78565,5.94785,5.75805,5.8643,6.2001,5.92855,5.90335,5.8044,6.0858,6.3423,5.83295,5.56765,5.80935
A1CF,2.7798,3.0661,2.7121,2.9528,2.5599,2.8811,2.848,2.8135,2.6521,2.8395,2.9067,2.7306,3.1158,3.0829,2.7137,2.6637,3.3392,2.7044,2.6415,3.0545
A2M,13.1877,13.2798,13.0801,13.0747,12.8458,13.2404,12.777,12.874,12.9681,12.7054,13.0213,12.7222,13.1842,13.2304,12.6122,12.6349,12.8189,12.8731,13.0954,13.0251
A2M-AS1,9.2113,8.985,8.9882,8.776,8.6572,9.843,8.2795,8.8575,8.9511,8.7841,8.8383,8.526,9.201,9.4945,8.7533,8.7106,9.3894,8.5111,8.7429,9.1307
A2ML1,6.0655,5.81215,5.46145,5.60465,6.04595,5.9338,6.13745,6.79165,6.50155,6.92975,6.57275,6.5616,6.06905,5.83155,6.9574,5.9733,5.9784,6.12335,5.97865,7.00165


###Preprocess for df2

In [29]:
df2.head()

Unnamed: 0,ID_REF,GSM1539080,GSM1539081,GSM1539082,GSM1539083,GSM1539084,GSM1539085,GSM1539086,GSM1539087,GSM1539088,...,GSM1539399,GSM1539400,GSM1539401,GSM1539402,GSM1539403,GSM1539404,GSM1539405,GSM1539406,GSM1539407,GSM1539408
0,ILMN_1343291,14.611769,14.427104,14.624553,14.442078,14.127102,14.412088,14.569419,14.624553,14.655069,...,14.668559,14.526182,14.695354,15.04979,14.025296,14.91529,14.655069,14.869782,14.249922,14.891158
1,ILMN_1343295,11.944514,11.930664,11.777748,12.420057,11.725034,11.941696,11.944514,12.608218,12.266326,...,12.126935,11.892791,12.043099,12.019189,12.049508,12.104508,11.903194,11.560549,11.625297,12.064573
2,ILMN_1651209,7.517737,7.546532,7.64779,7.74514,7.520212,7.716517,7.480014,7.598107,7.815103,...,7.607597,7.622884,7.635975,7.699531,7.64924,7.636703,7.605595,7.581313,7.587167,7.61463
3,ILMN_1651221,7.419066,7.403191,7.382467,7.438868,7.43015,7.441058,7.386403,7.476285,7.42264,...,7.419287,7.395029,7.404132,7.455708,7.435492,7.462115,7.498538,7.470538,7.511239,7.509601
4,ILMN_1651228,11.50717,10.506885,12.058497,10.649464,11.958259,10.947954,10.669642,9.963996,10.862309,...,11.016573,11.178761,10.796267,10.828405,10.375336,10.188892,11.489647,11.358234,10.361057,10.521751


In [51]:
# Find platform ID for df2
print("Searching for platform ID in GSE63060_series_matrix.txt...")
with open('GSE63060_series_matrix.txt', 'r') as f:
    for line in f:
        if 'platform_id' in line.lower():
            print(line.strip())

Searching for platform ID in GSE63060_series_matrix.txt...
!Series_platform_id	"GPL6947"
!Sample_platform_id	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"GPL6947"	"

In [52]:
import urllib.request
import gzip

# Download GPL6947 annotation file
platform_id = "GPL6947"
url = f"https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL6nnn/{platform_id}/annot/{platform_id}.annot.gz"

print(f"Downloading annotation for {platform_id}...")
try:
    urllib.request.urlretrieve(url, f'{platform_id}.annot.gz')
    print("Download successful!")

    # Read the annotation file (skip comment lines starting with #)
    annotation_df2 = pd.read_csv(f'{platform_id}.annot.gz', sep='\t', comment='#', compression='gzip')

    print(f"\nAnnotation file shape: {annotation_df2.shape}")
    print(f"\nColumn names:")
    print(annotation_df2.columns.tolist())

    print(f"\nFirst few rows:")
    print(annotation_df2.head())

    # Check which columns contain probe ID and gene symbol
    if 'ID' in annotation_df2.columns and 'Symbol' in annotation_df2.columns:
        print("\n✓ Found ID and Symbol columns!")
        print("\nSample probe-to-gene mapping:")
        print(annotation_df2[['ID', 'Symbol']].head(10))
    else:
        print("\nAvailable columns - we'll identify the right ones:")
        for col in annotation_df2.columns:
            print(f"  - {col}")

except Exception as e:
    print(f"Error downloading: {e}")
    print("\nTrying alternative method...")

Downloading annotation for GPL6947...
Download successful!
Error downloading: Error tokenizing data. C error: Expected 1 fields in line 29, saw 22


Trying alternative method...


In [53]:
import gzip

platform_id = "GPL6947"

print("Reading file manually...")

# Read the raw content
with gzip.open(f'{platform_id}.annot.gz', 'rt') as f:
    lines = f.readlines()

print(f"Total lines: {len(lines)}")

# Show first 50 lines to understand structure
print("\nFirst 50 lines:")
print("="*60)
for i, line in enumerate(lines[:50]):
    print(f"{i}: {line.strip()[:150]}")  # First 150 chars

# Find where the actual data table starts
print("\n" + "="*60)
print("Looking for table markers...")
print("="*60)

for i, line in enumerate(lines):
    if 'platform_table_begin' in line.lower():
        print(f"Table begins at line {i}")
        print(f"Header (line {i+1}): {lines[i+1].strip()[:200]}")
        print(f"First data (line {i+2}): {lines[i+2].strip()[:200]}")
        break

Reading file manually...
Total lines: 49606

First 50 lines:
0: ^Annotation
1: !Annotation_date = Aug 09 2016
2: !Annotation_platform = GPL6947
3: !Annotation_platform_title = Illumina HumanHT-12 V3.0 expression beadchip
4: !Annotation_platform_organism = Homo sapiens
5: #ID = ID from Platform data table
6: #Gene title = Entrez Gene name
7: #Gene symbol = Entrez Gene symbol
8: #Gene ID = Entrez Gene identifier
9: #UniGene title = Entrez UniGene name
10: #UniGene symbol = Entrez UniGene symbol
11: #UniGene ID = Entrez UniGene identifier
12: #Nucleotide Title = Entrez Nucleotide title
13: #GI = GenBank identifier
14: #GenBank Accession = GenBank accession
15: #Platform_CLONEID = CLONE_ID from Platform data table
16: #Platform_ORF = ORF from Platform data table
17: #Platform_SPOTID = SPOT_ID from Platform data table
18: #Chromosome location = Entrez gene chromosome and location
19: #Chromosome annotation = Entrez gene chromosome annotation
20: #GO:Function = Gene Ontology Function term
21

In [54]:
import pandas as pd
import gzip

platform_id = "GPL6947"

print("Parsing annotation file correctly...")

# Read from line 28 onwards (where the actual table starts)
annotation_df2 = pd.read_csv(
    f'{platform_id}.annot.gz',
    sep='\t',
    compression='gzip',
    skiprows=28,  # Skip first 28 lines (metadata and header marker)
    low_memory=False
)

print(f"✓ Successfully parsed!")
print(f"Shape: {annotation_df2.shape}")

print(f"\nColumn names:")
print(annotation_df2.columns.tolist())

print(f"\nFirst 10 rows:")
print(annotation_df2.head(10))

# Focus on the important columns: ID and Gene symbol
print("\n" + "="*60)
print("PROBE ID to GENE SYMBOL MAPPING")
print("="*60)

print(f"\nSample mappings (first 20):")
print(annotation_df2[['ID', 'Gene symbol']].head(20))

# Check for empty gene symbols
empty_genes = annotation_df2['Gene symbol'].isna().sum()
print(f"\nProbes without gene symbols: {empty_genes} / {len(annotation_df2)} ({empty_genes/len(annotation_df2)*100:.2f}%)")

# Create mapping dictionary (remove rows with empty gene symbols)
annotation_clean = annotation_df2[annotation_df2['Gene symbol'].notna()].copy()
probe_to_gene_df2 = dict(zip(annotation_clean['ID'], annotation_clean['Gene symbol']))

print(f"\n✓ Created mapping dictionary with {len(probe_to_gene_df2)} probe-to-gene mappings")

# Test the mapping with actual probes from df2
print("\n" + "="*60)
print("TESTING MAPPING WITH ACTUAL DF2 PROBES")
print("="*60)

test_probes = df2['ID_REF'].head(10).tolist()
print("\nProbes from df2 and their gene symbols:")
for probe in test_probes:
    gene = probe_to_gene_df2.get(probe, 'NOT FOUND')
    print(f"  {probe} --> {gene}")

# Save the annotation for later use
annotation_clean.to_csv('GPL6947_annotation.csv', index=False)
print("\n✓ Annotation saved to GPL6947_annotation.csv")

Parsing annotation file correctly...
✓ Successfully parsed!
Shape: (49577, 22)

Column names:
['ID', 'Gene title', 'Gene symbol', 'Gene ID', 'UniGene title', 'UniGene symbol', 'UniGene ID', 'Nucleotide Title', 'GI', 'GenBank Accession', 'Platform_CLONEID', 'Platform_ORF', 'Platform_SPOTID', 'Chromosome location', 'Chromosome annotation', 'GO:Function', 'GO:Process', 'GO:Component', 'GO:Function ID', 'GO:Process ID', 'GO:Component ID', 'Platform_SEQUENCE']

First 10 rows:
             ID                              Gene title Gene symbol Gene ID  \
0  ILMN_1725881                                     NaN         NaN     NaN   
1  ILMN_1910180                            neuropilin 2        NRP2    8828   
2  ILMN_1804174         Fc fragment of IgG receptor IIb      FCGR2B    2213   
3  ILMN_1796063          tripartite motif containing 44      TRIM44   54765   
4  ILMN_1811966                                     NaN         NaN     NaN   
5  ILMN_1668162  acyl-CoA wax alcohol acyltransfer

In [55]:
annotation_saved = pd.read_csv('GPL6947_annotation.csv')
annotation_saved.head()

Unnamed: 0,ID,Gene title,Gene symbol,Gene ID,UniGene title,UniGene symbol,UniGene ID,Nucleotide Title,GI,GenBank Accession,...,Platform_SPOTID,Chromosome location,Chromosome annotation,GO:Function,GO:Process,GO:Component,GO:Function ID,GO:Process ID,GO:Component ID,Platform_SEQUENCE
0,ILMN_1910180,neuropilin 2,NRP2,8828,,,,"Homo sapiens cDNA: FLJ21027 fis, clone CAE07110",10437021.0,AK024680,...,,2q33.3,"Chromosome 2, NC_000002.12 (205682450..205798133)",cytokine binding///growth factor binding///hep...,angiogenesis///axon extension involved in axon...,extracellular region///integral component of m...,GO:0019955///GO:0019838///GO:0008201///GO:0046...,GO:0001525///GO:0048846///GO:0007411///GO:0007...,GO:0005576///GO:0016021///GO:0016020///GO:0005...,ACACCTTCAGGAGGGAAGCCCTTATTTCTGGGTTGAACTCCCCTTC...
1,ILMN_1804174,Fc fragment of IgG receptor IIb,FCGR2B,2213,,,,Homo sapiens Fc fragment of IgG receptor IIb (...,299890843.0,NM_001002273,...,,1q23,"Chromosome 1, NC_000001.11 (161647243..161678654)",IgG binding///protein binding,immune response///regulation of immune respons...,integral component of membrane///plasma membrane,GO:0019864///GO:0005515,GO:0006955///GO:0050776///GO:0007165///GO:0016032,GO:0016021///GO:0005886,TAGGGGCAATAGGCTATACGCTACAGCCTAGGTGTGTAGTAGGCCA...
2,ILMN_1796063,tripartite motif containing 44,TRIM44,54765,,,,Homo sapiens tripartite motif containing 44 (T...,725798593.0,NM_017583,...,,11p13,"Chromosome 11, NC_000011.10 (35662692..35811053)",protein binding///zinc ion binding,,intracellular,GO:0005515///GO:0008270,,GO:0005622,CCTGCCTGTCTGCCTGTGACCTGTGTACGTATTACAGGCTTTAGGA...
3,ILMN_1668162,acyl-CoA wax alcohol acyltransferase 1,AWAT1,158833,,,,Homo sapiens acyl-CoA wax alcohol acyltransfer...,254039656.0,NM_001013579,...,,Xq13.1,"Chromosome X, NC_000023.11 (70233489..70240661)",long-chain-alcohol O-fatty-acyltransferase act...,arachidonic acid metabolic process///wax biosy...,endoplasmic reticulum membrane///integral comp...,GO:0047196///GO:0103095,GO:0019369///GO:0010025,GO:0005789///GO:0016021,GTCAAGGCTCCACTGGGCTCCTGCCATACTCCAGGCCTATTGTCAC...
4,ILMN_1912287,succinate receptor 1,SUCNR1,56670,,,,BX093329 Soares_parathyroid_tumor_NbHPA Homo s...,27826545.0,BX093329,...,,3q25.1,"Chromosome 3, NC_000003.12 (151873643..151884619)",G-protein coupled receptor activity,G-protein coupled receptor signaling pathway//...,extracellular exosome///integral component of ...,GO:0004930,GO:0007186///GO:0008150,GO:0070062///GO:0016021///GO:0005886,GTGCCAGCTGCCATTGCACTGCCTCACATTTTCCTTTAGATGTTTG...


In [56]:
# Create a copy of df2
df2_mapped = df2.copy()

# Map ID_REF (probe IDs) to gene symbols using the dictionary we created
df2_mapped['Gene_Symbol'] = df2_mapped['ID_REF'].map(probe_to_gene_df2)

In [57]:
df2_mapped.head()

Unnamed: 0,ID_REF,GSM1539080,GSM1539081,GSM1539082,GSM1539083,GSM1539084,GSM1539085,GSM1539086,GSM1539087,GSM1539088,...,GSM1539400,GSM1539401,GSM1539402,GSM1539403,GSM1539404,GSM1539405,GSM1539406,GSM1539407,GSM1539408,Gene_Symbol
0,ILMN_1343291,14.611769,14.427104,14.624553,14.442078,14.127102,14.412088,14.569419,14.624553,14.655069,...,14.526182,14.695354,15.04979,14.025296,14.91529,14.655069,14.869782,14.249922,14.891158,EEF1A1
1,ILMN_1343295,11.944514,11.930664,11.777748,12.420057,11.725034,11.941696,11.944514,12.608218,12.266326,...,11.892791,12.043099,12.019189,12.049508,12.104508,11.903194,11.560549,11.625297,12.064573,GAPDH
2,ILMN_1651209,7.517737,7.546532,7.64779,7.74514,7.520212,7.716517,7.480014,7.598107,7.815103,...,7.622884,7.635975,7.699531,7.64924,7.636703,7.605595,7.581313,7.587167,7.61463,SLC35E2
3,ILMN_1651221,7.419066,7.403191,7.382467,7.438868,7.43015,7.441058,7.386403,7.476285,7.42264,...,7.395029,7.404132,7.455708,7.435492,7.462115,7.498538,7.470538,7.511239,7.509601,
4,ILMN_1651228,11.50717,10.506885,12.058497,10.649464,11.958259,10.947954,10.669642,9.963996,10.862309,...,11.178761,10.796267,10.828405,10.375336,10.188892,11.489647,11.358234,10.361057,10.521751,RPS28


In [58]:
# Step 3: Check how many probes were successfully mapped
mapped_count = df2_mapped['Gene_Symbol'].notna().sum()
unmapped_count = df2_mapped['Gene_Symbol'].isna().sum()

print(f"  Successfully mapped: {mapped_count} probes")
print(f"  Not mapped (NaN): {unmapped_count} probes")
print(f"  Success rate: {mapped_count/len(df2_mapped)*100:.2f}%")

  Successfully mapped: 25074 probes
  Not mapped (NaN): 13249 probes
  Success rate: 65.43%


PROCEED with removing unmapped probes!

Reasons:

25,074 genes is plenty for Alzheimer's classification

Unmapped probes don't have biological meaning

Keeping NaN values will cause ML model errors

Other studies do the same - this is standard practice

In [59]:
df2_mapped.shape

(38323, 331)

In [60]:
# Remove rows where Gene_Symbol is NaN
df2_clean = df2_mapped[df2_mapped['Gene_Symbol'].notna()].copy()

In [61]:
df2_clean.shape

(25074, 331)

In [62]:
#Checking for any remaining NaN:")
df2_clean['Gene_Symbol'].isna().sum()

np.int64(0)

In [63]:
df2_clean.head()

Unnamed: 0,ID_REF,GSM1539080,GSM1539081,GSM1539082,GSM1539083,GSM1539084,GSM1539085,GSM1539086,GSM1539087,GSM1539088,...,GSM1539400,GSM1539401,GSM1539402,GSM1539403,GSM1539404,GSM1539405,GSM1539406,GSM1539407,GSM1539408,Gene_Symbol
0,ILMN_1343291,14.611769,14.427104,14.624553,14.442078,14.127102,14.412088,14.569419,14.624553,14.655069,...,14.526182,14.695354,15.04979,14.025296,14.91529,14.655069,14.869782,14.249922,14.891158,EEF1A1
1,ILMN_1343295,11.944514,11.930664,11.777748,12.420057,11.725034,11.941696,11.944514,12.608218,12.266326,...,11.892791,12.043099,12.019189,12.049508,12.104508,11.903194,11.560549,11.625297,12.064573,GAPDH
2,ILMN_1651209,7.517737,7.546532,7.64779,7.74514,7.520212,7.716517,7.480014,7.598107,7.815103,...,7.622884,7.635975,7.699531,7.64924,7.636703,7.605595,7.581313,7.587167,7.61463,SLC35E2
4,ILMN_1651228,11.50717,10.506885,12.058497,10.649464,11.958259,10.947954,10.669642,9.963996,10.862309,...,11.178761,10.796267,10.828405,10.375336,10.188892,11.489647,11.358234,10.361057,10.521751,RPS28
5,ILMN_1651229,7.874504,8.223289,8.160895,7.971988,8.083689,8.132574,7.9492,7.980954,7.931701,...,7.874687,8.06133,7.813026,7.974436,8.115954,7.943615,8.026125,8.079845,7.911927,IPO13


In [64]:
# Check for duplicates
duplicate_count = df2_clean['Gene_Symbol'].duplicated().sum()
total_genes = len(df2_clean)
unique_genes = df2_clean['Gene_Symbol'].nunique()

print(f"Total rows (probes): {total_genes}")
print(f"Unique gene symbols: {unique_genes}")
print(f"Duplicate gene symbols: {duplicate_count}")

Total rows (probes): 25074
Unique gene symbols: 17654
Duplicate gene symbols: 7420


Average all duplicate genes.

Pros:

Reduces noise: Multiple probes may have measurement errors; averaging smooths them out Standard practice: Most published papers use this approach Balanced representation: No single probe dominates Simple and interpretable

Cons:

Loss of information: If one probe is better quality, we dilute it Assumes all probes are equally good: They might not be

In [65]:
# Step 1: Drop the ID_REF column (we don't need probe IDs anymore)
df2_clean = df2_clean.drop('ID_REF', axis=1)

# Step 2: Set Gene_Symbol as index
df2_clean = df2_clean.set_index('Gene_Symbol')

In [66]:
# Step 3: Average duplicate genes (group by index and take mean)
df2_final = df2_clean.groupby(df2_clean.index).mean()

In [67]:
# Verify no more duplicates
df2_final.index.duplicated().sum()

np.int64(0)

In [68]:
df2_final.shape

(17654, 329)

In [69]:
df2_final.shape

(17654, 329)

###Preprocess df3

In [30]:
df3.head()

Unnamed: 0,ID_REF,GSM2266610,GSM2266611,GSM2266612,GSM2266613,GSM2266614,GSM2266615,GSM2266616,GSM2266617,GSM2266618,...,GSM2266780,GSM2266781,GSM2266782,GSM2266783,GSM2266784,GSM2266785,GSM2266786,GSM2266787,GSM2266788,GSM2266789


In [70]:
# Check the entire file structure for df3
with open('GSE85426_series_matrix.txt', 'r') as f:
    lines = f.readlines()

print(f"Total lines in file: {len(lines)}")
print("\nFirst 30 lines:")
for i in range(min(30, len(lines))):
    print(f"{i}: {lines[i][:150]}")  # Print first 150 chars

print("\n\nLast 30 lines:")
for i in range(max(0, len(lines)-30), len(lines)):
    print(f"{i}: {lines[i][:150]}")

# Search for any line that doesn't start with !
print("\n\nLines that DON'T start with '!':")
for i, line in enumerate(lines):
    if not line.startswith('!'):
        print(f"Line {i}: {line[:150]}")
        if i > 100:  # Just show first few
            break

Total lines in file: 65

First 30 lines:
0: !Series_title	"Peripheral blood gene expression as a biomarker for early detection of Alzheimer’s disease"

1: !Series_geo_accession	"GSE85426"

2: !Series_status	"Public on Aug 11 2016"

3: !Series_submission_date	"Aug 10 2016"

4: !Series_last_update_date	"Oct 11 2016"

5: !Series_summary	"To further development of our gene expression profile of Alzheimer's disease, we have employed whole genome microarray expression pro
6: !Series_overall_design	"Total RNA from peripheral blood cells was extracted, reverse-transcribed and labelled, then analysed for gene expression using
7: !Series_type	"Expression profiling by array"

8: !Series_contributor	"Ainon,Z,Samsudin"

9: !Series_contributor	"Abu Bakar,,Abdul Majeed"

10: !Series_contributor	"Kalavathy,,Ramasamy"

11: !Series_sample_id	"GSM2266610 GSM2266611 GSM2266612 GSM2266613 GSM2266614 GSM2266615 GSM2266616 GSM2266617 GSM2266618 GSM2266619 GSM2266620 GSM2266621
12: !Series_contact_name	"Ainon

In [71]:
import urllib.request
import gzip
import shutil

# Use the NORMALIZED data (better for ML)
url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE85nnn/GSE85426/suppl/GSE85426_normalized_data.txt.gz"

print("Downloading GSE85426 normalized data...")
urllib.request.urlretrieve(url, 'GSE85426_normalized_data.txt.gz')

print("Extracting...")
with gzip.open('GSE85426_normalized_data.txt.gz', 'rb') as f_in:
    with open('GSE85426_normalized_data.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("Loading data...")
df3 = pd.read_csv('GSE85426_normalized_data.txt', sep='\t')

print(f"\nShape: {df3.shape}")
print(df3.head())

# Save as clean CSV
df3.to_csv('GSE85426_clean.csv', index=False)

Downloading GSE85426 normalized data...
Extracting...
Loading data...

Shape: (14113, 181)
          Gene_ID  US81403231_252800417012_S01_GE1_107_Sep09_1_1.txt  \
0  A_19_P00328650                                          -0.126716   
1  A_19_P00803901                                           0.700278   
2  A_19_P00322400                                           0.419577   
3  A_19_P00812377                                           0.504275   
4  A_19_P00805814                                           0.292399   

   US81403231_252800417012_S01_GE1_107_Sep09_1_2.txt  \
0                                          -0.088361   
1                                          -0.151993   
2                                           0.458900   
3                                           0.128170   
4                                           0.205929   

   US81403231_252800417012_S01_GE1_107_Sep09_1_4.txt  \
0                                           1.116998   
1                          

In [72]:
df3.Gene_ID.nunique()

14113

In [73]:
sum(df3.Gene_ID.isnull())

0

In [74]:
df3.Gene_ID.dtypes

dtype('O')

In [75]:
# Get only numeric columns (exclude Gene_ID)
expression_data = df3.drop('Gene_ID', axis=1)

In [76]:
expression_data.isnull().sum().sum()

np.int64(0)

In [77]:
# Find platform ID for df3
print("\nFinding platform ID for df3...")
with open('GSE85426_series_matrix.txt', 'r') as f:
    for line in f:
        if 'platform_id' in line.lower():
            print(line.strip())
            break


Finding platform ID for df3...
!Series_platform_id	"GPL14550"


In [78]:
platform_id_df3 = "GPL14550"
# Download annotation file
url = f"https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL14nnn/{platform_id_df3}/annot/{platform_id_df3}.annot.gz"
try:
    urllib.request.urlretrieve(url, f'{platform_id_df3}.annot.gz')
    print("✓ Download successful!")

    file_size = os.path.getsize(f'{platform_id_df3}.annot.gz')
    print(f"File size: {file_size:,} bytes ({file_size/1024:.2f} KB)")

except Exception as e:
    print(f"✗ Download failed: {e}")
    print("\nTrying alternative soft file format...")

    url_soft = f"https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL14nnn/{platform_id_df3}/soft/{platform_id_df3}_family.soft.gz"
    print(f"Alternative URL: {url_soft}")

    try:
        urllib.request.urlretrieve(url_soft, f'{platform_id_df3}_family.soft.gz')
        print("✓ Soft file download successful!")
    except Exception as e2:
        print(f"✗ Also failed: {e2}")

✗ Download failed: HTTP Error 404: Not Found

Trying alternative soft file format...
Alternative URL: https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL14nnn/GPL14550/soft/GPL14550_family.soft.gz
✓ Soft file download successful!


In [79]:
import os

# Check if file exists
soft_file = 'GPL14550_family.soft.gz'

if os.path.exists(soft_file):
    file_size = os.path.getsize(soft_file)
    print(f"✓ File exists: {soft_file}")
    print(f"File size: {file_size:,} bytes ({file_size/1024/1024:.2f} MB)")
else:
    print(f"✗ File not found: {soft_file}")
    print("\nChecking what GPL14550 files exist:")
    for f in os.listdir('.'):
        if 'GPL14550' in f:
            print(f"  - {f}")

✓ File exists: GPL14550_family.soft.gz
File size: 2,477,011,147 bytes (2362.26 MB)


In [80]:
import gzip

soft_file = 'GPL14550_family.soft.gz'

print("="*60)
print("READING LARGE FILE LINE BY LINE")
print("="*60)
print(f"File size: 2.36 GB - reading carefully...")

try:
    with gzip.open(soft_file, 'rt', encoding='utf-8', errors='ignore') as f:
        # Read only first 50 lines
        print("\nFirst 50 lines:")
        print("="*60)
        for i in range(50):
            line = f.readline()
            if not line:
                break
            print(f"{i}: {line.strip()[:150]}")

    print("\n✓ Successfully read first 50 lines!")

    # Now find the table start
    print("\n" + "="*60)
    print("SEARCHING FOR TABLE START (this may take a moment)...")
    print("="*60)

    with gzip.open(soft_file, 'rt', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if '!platform_table_begin' in line.lower():
                print(f"✓ Found table at line {i}")
                # Read next 2 lines
                header = f.readline()
                data = f.readline()
                print(f"Header: {header.strip()[:200]}")
                print(f"First data: {data.strip()[:200]}")
                print(f"\nUse skiprows={i+1} to read the table")
                break

            # Show progress every 10000 lines
            if i % 10000 == 0 and i > 0:
                print(f"  Searched {i:,} lines...")

except Exception as e:
    print(f"✗ Error: {e}")

READING LARGE FILE LINE BY LINE
File size: 2.36 GB - reading carefully...

First 50 lines:
0: ^DATABASE = GeoMiame
1: !Database_name = Gene Expression Omnibus (GEO)
2: !Database_institute = NCBI NLM NIH
3: !Database_web_link = http://www.ncbi.nlm.nih.gov/geo
4: !Database_email = geo@ncbi.nlm.nih.gov
5: ^PLATFORM = GPL14550
6: !Platform_title = Agilent-028004 SurePrint G3 Human GE 8x60K Microarray (Probe Name Version)
7: !Platform_geo_accession = GPL14550
8: !Platform_status = Public on Sep 08 2011
9: !Platform_submission_date = Sep 08 2011
10: !Platform_last_update_date = Oct 11 2016
11: !Platform_technology = in situ oligonucleotide
12: !Platform_distribution = commercial
13: !Platform_organism = Homo sapiens
14: !Platform_taxid = 9606
15: !Platform_manufacturer = Agilent Technologies
16: !Platform_manufacture_protocol = see manufacturer's web site at http://www.agilent.com/
17: !Platform_description = SurePrint G3 Human GE 8x60K Microarray
18: !Platform_description =
19: !Platform_de

In [81]:
import gzip

soft_file = 'GPL14550_family.soft.gz'

print("Building probe-to-gene mapping...")

probe_to_gene_df3 = {}

with gzip.open(soft_file, 'rt', encoding='utf-8', errors='ignore') as f:
    # Skip to line 8565
    for i in range(8565):
        next(f)

    # Read header
    header = next(f)
    print(f"Header: {header.strip()[:150]}")

    # Read data lines
    count = 0
    for line in f:
        if line.startswith('!') or line.startswith('^'):
            break  # End of table

        parts = line.strip().split('\t')
        if len(parts) >= 7:
            probe_id = parts[0]
            gene_symbol = parts[6]

            if gene_symbol and gene_symbol.strip():
                probe_to_gene_df3[probe_id] = gene_symbol

        count += 1
        if count % 10000 == 0:
            print(f"Processed {count:,} lines, {len(probe_to_gene_df3):,} mappings...")

print(f"\n✓ Created {len(probe_to_gene_df3):,} mappings")

# Test
print("\nTesting with df3 probes:")
for probe in df3['Gene_ID'].head(10):
    gene = probe_to_gene_df3.get(probe, 'NOT FOUND')
    print(f"  {probe} --> {gene}")

Building probe-to-gene mapping...
Header: ID	SPOT_ID	CONTROL_TYPE	REFSEQ	GB_ACC	GENE	GENE_SYMBOL	GENE_NAME	UNIGENE_ID	ENSEMBL_ID	TIGR_ID	ACCESSION_STRING	CHROMOSOMAL_LOCATION	CYTOBAND	DESCRIPT
Processed 10,000 lines, 2,302 mappings...
Processed 20,000 lines, 12,221 mappings...
Processed 30,000 lines, 20,958 mappings...
Processed 40,000 lines, 28,473 mappings...

✓ Created 30,436 mappings

Testing with df3 probes:
  A_19_P00328650 --> NOT FOUND
  A_19_P00803901 --> NOT FOUND
  A_19_P00322400 --> NOT FOUND
  A_19_P00812377 --> NOT FOUND
  A_19_P00805814 --> NOT FOUND
  A_19_P00325706 --> NOT FOUND
  A_33_P3337742 --> NOT FOUND
  A_19_P00327176 --> NOT FOUND
  A_19_P00327009 --> NOT FOUND
  A_33_P3216448,A_33_P3216442|1302 --> NOT FOUND


In [82]:
# Show some sample probe IDs from the annotation
print("="*60)
print("SAMPLE PROBE IDs FROM ANNOTATION")
print("="*60)

sample_probes = list(probe_to_gene_df3.keys())[:20]
print("\nFirst 20 probe IDs in annotation:")
for probe in sample_probes:
    gene = probe_to_gene_df3[probe]
    print(f"  {probe} --> {gene}")

print("\n" + "="*60)
print("COMPARISON")
print("="*60)

print("\nProbe IDs in df3:")
print(df3['Gene_ID'].head(10).tolist())

print("\nProbe IDs in annotation:")
print(sample_probes[:10])

print("\n" + "="*60)
print("PROBLEM IDENTIFIED")
print("="*60)
print("""
The probe ID formats don't match!
df3 has: A_19_P00328650 (with underscores)
Annotation might have different format

Let's check if the annotation has 'SPOT_ID' or another column that matches...
""")

# Check if probe IDs match SPOT_ID instead
print("\nLet's check the actual data in the file...")

SAMPLE PROBE IDs FROM ANNOTATION

First 20 probe IDs in annotation:
  A_19_P00315502 --> XLOC_008373
  A_19_P00315518 --> XLOC_005810
  A_19_P00315519 --> XLOC_004914
  A_19_P00315524 --> XLOC_014192
  A_19_P00315528 --> XLOC_008370
  A_19_P00315529 --> XLOC_008370
  A_19_P00315543 --> XLOC_005295
  A_19_P00315554 --> XLOC_006756
  A_19_P00315593 --> XLOC_004643
  A_19_P00315625 --> XLOC_005441
  A_19_P00315641 --> XLOC_008079
  A_19_P00315647 --> XLOC_009582
  A_19_P00315649 --> XLOC_009582
  A_19_P00315651 --> XLOC_013837
  A_19_P00315693 --> XLOC_014209
  A_19_P00315705 --> XLOC_005981
  A_19_P00315753 --> XLOC_002746
  A_19_P00315773 --> XLOC_002746
  A_19_P00315789 --> XLOC_003303
  A_19_P00315790 --> XLOC_003303

COMPARISON

Probe IDs in df3:
['A_19_P00328650', 'A_19_P00803901', 'A_19_P00322400', 'A_19_P00812377', 'A_19_P00805814', 'A_19_P00325706', 'A_33_P3337742', 'A_19_P00327176', 'A_19_P00327009', 'A_33_P3216448,A_33_P3216442|1302']

Probe IDs in annotation:
['A_19_P00315502'

In [83]:
print("="*60)
print("CHECKING IF DF3 PROBES EXIST IN ANNOTATION")
print("="*60)

# Check if ANY of df3's probes exist in the mapping
df3_probes = set(df3['Gene_ID'].tolist())
mapped_count = 0
sample_found = []

for probe in df3_probes:
    if probe in probe_to_gene_df3:
        mapped_count += 1
        if len(sample_found) < 10:
            sample_found.append((probe, probe_to_gene_df3[probe]))

print(f"Total probes in df3: {len(df3_probes)}")
print(f"Probes found in annotation: {mapped_count}")
print(f"Mapping rate: {mapped_count/len(df3_probes)*100:.2f}%")

if sample_found:
    print("\nSample of matched probes:")
    for probe, gene in sample_found:
        print(f"  {probe} --> {gene}")
else:
    print("\n⚠ WARNING: NO PROBES MATCHED!")
    print("\nLet's check if probe IDs are in SPOT_ID column instead...")

    # Re-read to check SPOT_ID
    print("\nRe-reading file to check SPOT_ID column...")

    probe_to_gene_df3_spotid = {}

    with gzip.open(soft_file, 'rt', encoding='utf-8', errors='ignore') as f:
        # Skip to line 8565
        for i in range(8565):
            next(f)

        # Read header
        header = next(f)

        # Read first 100 lines to test
        for i in range(100):
            line = f.readline()
            if line.startswith('!') or line.startswith('^'):
                break

            parts = line.strip().split('\t')
            if len(parts) >= 7:
                spot_id = parts[1]  # SPOT_ID column
                gene_symbol = parts[6]  # GENE_SYMBOL column

                if gene_symbol and gene_symbol.strip():
                    probe_to_gene_df3_spotid[spot_id] = gene_symbol

    print(f"\nChecking SPOT_ID column (first 100 entries):")
    print("Sample SPOT_IDs:", list(probe_to_gene_df3_spotid.keys())[:10])

    # Test with SPOT_ID
    print("\nTesting df3 probes against SPOT_ID:")
    for probe in df3['Gene_ID'].head(10):
        gene = probe_to_gene_df3_spotid.get(probe, 'NOT FOUND')
        print(f"  {probe} --> {gene}")

CHECKING IF DF3 PROBES EXIST IN ANNOTATION
Total probes in df3: 14113
Probes found in annotation: 214
Mapping rate: 1.52%

Sample of matched probes:
  A_19_P00329086 --> XLOC_005832
  A_33_P3338275 --> LOC440297
  A_33_P3321372 --> CNTNAP3
  A_33_P3370461 --> SUZ12P
  A_19_P00806637 --> XLOC_002473
  A_19_P00321789 --> XLOC_005464
  A_19_P00322544 --> XLOC_007868
  A_33_P3267195 --> KU-MEL-3
  A_19_P00321388 --> XLOC_012568
  A_33_P3264444 --> PFDN6


Trying with different column.

In [84]:
print("="*60)
print("TRYING SPOT_ID COLUMN FOR BETTER MAPPING")
print("="*60)

probe_to_gene_spotid = {}

with gzip.open(soft_file, 'rt', encoding='utf-8', errors='ignore') as f:
    # Skip to data start
    for i in range(8565):
        next(f)

    header = next(f)

    count = 0
    for line in f:
        if line.startswith('!') or line.startswith('^'):
            break

        parts = line.strip().split('\t')
        if len(parts) >= 7:
            spot_id = parts[1]  # SPOT_ID
            gene_symbol = parts[6]  # GENE_SYMBOL

            # Only add if gene symbol exists and is not XLOC (those are not real genes)
            if gene_symbol and gene_symbol.strip() and not gene_symbol.startswith('XLOC_'):
                probe_to_gene_spotid[spot_id] = gene_symbol

        count += 1
        if count % 10000 == 0:
            print(f"Processed {count:,} lines, {len(probe_to_gene_spotid):,} valid gene mappings...")

print(f"\n✓ Found {len(probe_to_gene_spotid):,} valid gene mappings (excluding XLOC)")

# Test with SPOT_ID
print("\n" + "="*60)
print("TESTING DF3 PROBES WITH SPOT_ID")
print("="*60)

test_probes = df3['Gene_ID'].head(10).tolist()
for probe in test_probes:
    gene = probe_to_gene_spotid.get(probe, 'NOT FOUND')
    print(f"  {probe} --> {gene}")

# Check mapping rate
df3_probes = set(df3['Gene_ID'].tolist())
mapped_spotid = sum(1 for p in df3_probes if p in probe_to_gene_spotid)

print(f"\nMapping with SPOT_ID:")
print(f"  df3 probes: {len(df3_probes)}")
print(f"  Matched: {mapped_spotid} ({mapped_spotid/len(df3_probes)*100:.2f}%)")

# If SPOT_ID works better, use it
if mapped_spotid > 214:
    print("\n✓ SPOT_ID gives better mapping! Using SPOT_ID...")
    probe_to_gene_df3 = probe_to_gene_spotid
else:
    print("\n⚠ SPOT_ID doesn't help. Will use original mapping...")

TRYING SPOT_ID COLUMN FOR BETTER MAPPING
Processed 10,000 lines, 1,699 valid gene mappings...
Processed 20,000 lines, 11,618 valid gene mappings...
Processed 30,000 lines, 20,355 valid gene mappings...
Processed 40,000 lines, 27,870 valid gene mappings...

✓ Found 29,833 valid gene mappings (excluding XLOC)

TESTING DF3 PROBES WITH SPOT_ID
  A_19_P00328650 --> NOT FOUND
  A_19_P00803901 --> NOT FOUND
  A_19_P00322400 --> NOT FOUND
  A_19_P00812377 --> NOT FOUND
  A_19_P00805814 --> NOT FOUND
  A_19_P00325706 --> NOT FOUND
  A_33_P3337742 --> NOT FOUND
  A_19_P00327176 --> NOT FOUND
  A_19_P00327009 --> NOT FOUND
  A_33_P3216448,A_33_P3216442|1302 --> NOT FOUND

Mapping with SPOT_ID:
  df3 probes: 14113
  Matched: 138 (0.98%)

⚠ SPOT_ID doesn't help. Will use original mapping...


The SPOT_ID gives even worse results (0.98%). The problem is clear: df3's probe IDs don't match the annotation file properly.

In [85]:
# Use the original probe_to_gene_df3 dictionary (which had 214 matches)
# Filter out XLOC genes (non-standard)
probe_to_gene_df3_filtered = {k: v for k, v in probe_to_gene_df3.items()
                                if not v.startswith('XLOC_')}
df3_mapped = df3.copy()
df3_mapped['Gene_Symbol'] = df3_mapped['Gene_ID'].map(probe_to_gene_df3_filtered)

# Check mapping statistics
mapped_count = df3_mapped['Gene_Symbol'].notna().sum()
unmapped_count = df3_mapped['Gene_Symbol'].isna().sum()

print(f"Total probes in df3: {len(df3_mapped)}")
print(f"Successfully mapped (with real genes): {mapped_count}")
print(f"Not mapped: {unmapped_count}")
print(f"Mapping rate: {mapped_count/len(df3_mapped)*100:.2f}%")
# Remove unmapped probes
print("\n" + "="*60)
print("KEEPING ONLY MAPPED PROBES")
print("="*60)

df3_clean = df3_mapped[df3_mapped['Gene_Symbol'].notna()].copy()
print(f"Shape after removing unmapped: {df3_clean.shape}")


Total probes in df3: 14113
Successfully mapped (with real genes): 138
Not mapped: 13975
Mapping rate: 0.98%

KEEPING ONLY MAPPED PROBES
Shape after removing unmapped: (138, 182)


In [86]:
# Check for duplicates
duplicate_count = df3_clean['Gene_Symbol'].duplicated().sum()
unique_genes = df3_clean['Gene_Symbol'].nunique()

print(f"\nUnique gene symbols: {unique_genes}")
print(f"Duplicate gene symbols: {duplicate_count}")


Unique gene symbols: 130
Duplicate gene symbols: 8


In [87]:
# Drop Gene_ID and set Gene_Symbol as index
df3_clean = df3_clean.drop('Gene_ID', axis=1)
df3_clean = df3_clean.set_index('Gene_Symbol')

In [88]:
df3_final = df3_clean.groupby(df3_clean.index).mean()

In [89]:
df3_final.shape

(130, 180)

Find out how many common genes accross all tables

In [90]:
# Get gene sets
genes_df1 = set(df1_final.index)
genes_df2 = set(df2_final.index)
genes_df3 = set(df3_final.index)

print(f"df1 genes: {len(genes_df1):,}")
print(f"df2 genes: {len(genes_df2):,}")
print(f"df3 genes: {len(genes_df3):,}")

df1 genes: 21,104
df2 genes: 17,654
df3 genes: 130


In [91]:
# Find common genes
common_df1_df2 = genes_df1 & genes_df2
common_all_3 = genes_df1 & genes_df2 & genes_df3
print(f"Common in df1 & df2: {len(common_df1_df2):,} genes")
print(f"Common in df1 & df2 & df3: {len(common_all_3):,} genes")

Common in df1 & df2: 16,196 genes
Common in df1 & df2 & df3: 39 genes


USE DF1 + DF2 ONLY

Why:

16,196 genes is EXCELLENT for ML (plenty for feature selection)

39 genes is way too few for meaningful Alzheimer's classification

You lose 180 patients from df3, but gain 16,157 genes

For gene expression ML:

16,196 genes × ~350 patients = Great dataset

39 genes × ~530 patients = Too few features

Next step: Merge df1 and df2 on the 16,196 common genes

In [93]:
# Get gene sets from each dataset
genes_adni = set(adni_final.index)

print(f"df1 genes: {len(genes_df1):,}")
print(f"df2 genes: {len(genes_df2):,}")
print(f"adni genes: {len(genes_adni):,}")

# Find common genes across all 3
common_genes_all = genes_df1 & genes_df2 & genes_adni

len(common_genes_all)


df1 genes: 21,104
df2 genes: 17,654
adni genes: 20,088


14907

In [94]:
# Also check pairwise for comparison
common_df1_df2 = genes_df1 & genes_df2
common_df1_adni = genes_df1 & genes_adni
common_df2_adni = genes_df2 & genes_adni

print(f"\nPairwise overlaps:")
print(f"  df1 ∩ df2: {len(common_df1_df2):,}")
print(f"  df1 ∩ adni: {len(common_df1_adni):,}")
print(f"  df2 ∩ adni: {len(common_df2_adni):,}")



Pairwise overlaps:
  df1 ∩ df2: 16,196
  df1 ∩ adni: 16,828
  df2 ∩ adni: 15,448


In [95]:
# Filter each dataset to common genes only
df1_filtered = df1_final.loc[list(common_genes_all)]
df2_filtered = df2_final.loc[list(common_genes_all)]
adni_filtered = adni_final.loc[list(common_genes_all)]

In [96]:
# Merge horizontally (combine all patients)
# Concatenate along columns (axis=1) to combine patients
merged_all_3 = pd.concat([df1_filtered, df2_filtered, adni_filtered], axis=1)

In [98]:
print(f"\nMerged dataset shape: {merged_all_3.shape}")
print(f"  Genes (rows): {merged_all_3.shape[0]:,}")
print(f"  Patients (columns): {merged_all_3.shape[1]}")

print(f"\nPatient breakdown:")
print(f"  From df1: {df1_filtered.shape[1]}")
print(f"  From df2: {df2_filtered.shape[1]}")
print(f"  From adni: {adni_filtered.shape[1]}")
print(f"  Total: {merged_all_3.shape[1]}")


Merged dataset shape: (14907, 1093)
  Genes (rows): 14,907
  Patients (columns): 1093

Patient breakdown:
  From df1: 20
  From df2: 329
  From adni: 744
  Total: 1093


In [99]:
# Check for any issues
print(f"Missing values: {merged_all_3.isnull().sum().sum()}")
print(f"Duplicate gene names: {merged_all_3.index.duplicated().sum()}")
print(f"Duplicate patient IDs: {merged_all_3.columns.duplicated().sum()}")

Missing values: 0
Duplicate gene names: 0
Duplicate patient IDs: 0


In [100]:
merged_all_3.head()

Unnamed: 0_level_0,GSM2982966,GSM2982967,GSM2982968,GSM2982969,GSM2982970,GSM2982971,GSM2982972,GSM2982973,GSM2982974,GSM2982975,...,006_S_0731,941_S_4365,127_S_2213,130_S_4660,023_S_1046,007_S_4611,027_S_2245,016_S_4353,036_S_4491,007_S_4272
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PPP1R14C,9.8348,10.2328,9.4136,9.5476,9.5162,8.7275,9.4793,9.4693,9.4479,9.838,...,2.5455,2.676,2.783,2.5985,2.718,2.4585,2.868,2.6315,2.942,2.6895
ZNF713,4.0247,3.2438,2.6882,3.1142,2.8545,2.955,3.3137,2.7505,3.4153,4.011,...,1.823,2.03,1.963,1.883,2.661,1.94,1.831,1.863,1.769,1.829
PPIA,6.8479,6.8912,6.8627,6.8061,6.679,6.401,6.2279,6.8036,6.9409,7.0345,...,7.876667,7.666,7.763833,7.630167,7.823667,7.653667,7.519667,7.782667,7.097667,7.620167
PRELP,11.119767,11.902267,11.1806,11.2342,11.684333,11.898867,11.261733,10.5108,10.7212,10.119767,...,2.44775,2.401,2.55925,2.59825,2.3115,2.3445,2.40175,2.484,2.4815,2.414
CMTM2,3.7212,4.8252,6.0325,5.686,5.5814,5.4153,7.6104,5.5505,6.3759,5.7153,...,11.066,10.906,10.962,11.096,10.847,11.208,10.899,11.519,11.349,11.041


This dataset has Gene expression as rows and patient ID as columns.
Next steps:

1. Transpose (patients as rows)

2. Extract labels from all 3 sources

3. Add clinical features

4. Continue with preprocessing pipeline

In [102]:
#Transpose the data (patients as rows, genes as columns)
print(f"  Before transpose Shape: {merged_all_3.shape}")

merged_data_transposed = merged_all_3.T

print(f"  After Transpose Shape: {merged_data_transposed.shape}")

  Before transpose Shape: (14907, 1093)
  After Transpose Shape: (1093, 14907)


In [103]:
merged_data_transposed.head()

Gene_Symbol,PPP1R14C,ZNF713,PPIA,PRELP,CMTM2,ODF4,SNX11,ZIM3,CKAP2L,CNST,...,BEGAIN,ZNF43,SPINK2,ATF2,PSMD10,CSGALNACT1,PRSS23,ZNF630,SFT2D2,TMEM175
GSM2982966,9.8348,4.0247,6.8479,11.119767,3.7212,5.1035,9.7325,2.44705,4.74545,8.636275,...,8.823,6.0461,9.6186,5.6665,9.4669,6.2479,7.628733,8.9419,7.8428,8.3567
GSM2982967,10.2328,3.2438,6.8912,11.902267,4.8252,5.5432,9.6834,2.7035,4.91425,8.848375,...,8.0978,6.113733,9.4342,5.6871,9.19605,8.7756,8.1483,8.0731,7.6957,8.6026
GSM2982968,9.4136,2.6882,6.8627,11.1806,6.0325,5.3749,9.4165,2.68285,4.5491,8.7817,...,8.7418,5.779767,8.6129,5.5993,9.6729,8.5262,8.018533,8.3812,8.0462,8.185
GSM2982969,9.5476,3.1142,6.8061,11.2342,5.686,5.823,9.7334,2.5605,5.4174,8.74295,...,9.0881,6.2799,8.9918,5.7373,9.47105,8.312,8.385367,8.0559,8.491,8.0209
GSM2982970,9.5162,2.8545,6.679,11.684333,5.5814,5.3972,9.3995,2.3826,4.66085,8.6541,...,8.9891,5.682767,9.1384,5.4833,9.12835,8.1771,7.607267,8.2137,7.9665,8.1095


In [104]:
#Now we need to extract the clinical labels (Alzheimer's vs Control) from the metadata files

# df1 patients are from GSE110226
# df2 patients are from GSE63060

# Read GSE110226 metadata
with open('GSE110226_series_matrix.txt', 'r') as f:
    lines = f.readlines()

# Find sample information
sample_ids_df1 = []
sample_labels_df1 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        # Extract sample IDs
        parts = line.strip().split('\t')[1:]
        sample_ids_df1 = [p.strip('"') for p in parts]
        print(f"Found {len(sample_ids_df1)} samples from df1")

    if line.startswith('!Sample_title') or line.startswith('!Sample_source_name_ch1'):
        # Extract sample labels/descriptions
        parts = line.strip().split('\t')[1:]
        sample_labels_df1 = [p.strip('"') for p in parts]
        print(f"Found labels: {line[:80]}...")
        break

# Show first few samples
print("\nFirst 10 samples from df1:")
for i in range(min(10, len(sample_ids_df1))):
    print(f"  {sample_ids_df1[i]}: {sample_labels_df1[i] if i < len(sample_labels_df1) else 'N/A'}")

print("\n" + "="*60)
print("Step 2: Extract labels from df2 (GSE63060)")
print("="*60)

# Read GSE63060 metadata
with open('GSE63060_series_matrix.txt', 'r') as f:
    lines = f.readlines()

sample_ids_df2 = []
sample_labels_df2 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        parts = line.strip().split('\t')[1:]
        sample_ids_df2 = [p.strip('"') for p in parts]
        print(f"Found {len(sample_ids_df2)} samples from df2")

    if line.startswith('!Sample_title') or line.startswith('!Sample_source_name_ch1'):
        parts = line.strip().split('\t')[1:]
        sample_labels_df2 = [p.strip('"') for p in parts]
        print(f"Found labels: {line[:80]}...")
        break

# Show first few samples
print("\nFirst 10 samples from df2:")
for i in range(min(10, len(sample_ids_df2))):
    print(f"  {sample_ids_df2[i]}: {sample_labels_df2[i] if i < len(sample_labels_df2) else 'N/A'}")

Found labels: !Sample_title	"CP_ALZ_015"	"CP_ALZ_017"	"CP_ALZ_018"	"CP_ALZ_019"	"CP_ALZ_020"	"...

First 10 samples from df1:

Step 2: Extract labels from df2 (GSE63060)
Found labels: !Sample_title	"4856050008_I"	"4856050047_D"	"4856076009_D"	"4856076040_F"	"48560...

First 10 samples from df2:


In [105]:
print("="*60)
print("EXTRACTING DIAGNOSIS FROM CHARACTERISTICS")
print("="*60)

# Extract from df1 (GSE110226)
print("\nSearching df1 (GSE110226) for diagnosis...")
with open('GSE110226_series_matrix.txt', 'r') as f:
    lines = f.readlines()

sample_ids_df1 = []
diagnosis_df1 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        parts = line.strip().split('\t')[1:]
        sample_ids_df1 = [p.strip('"') for p in parts]

    # Look for characteristics that contain diagnosis
    if line.startswith('!Sample_characteristics_ch1'):
        if 'diagnosis' in line.lower() or 'disease' in line.lower() or 'condition' in line.lower():
            parts = line.strip().split('\t')[1:]
            diagnosis_df1 = [p.strip('"') for p in parts]
            print(f"Found diagnosis line: {line[:150]}...")
            break

print(f"\ndf1 samples: {len(sample_ids_df1)}")
print(f"df1 diagnoses: {len(diagnosis_df1)}")

if diagnosis_df1:
    print("\nFirst 10 diagnoses from df1:")
    for i in range(min(10, len(diagnosis_df1))):
        print(f"  {sample_ids_df1[i]}: {diagnosis_df1[i]}")

# Extract from df2 (GSE63060)
print("\n" + "="*60)
print("Searching df2 (GSE63060) for diagnosis...")
with open('GSE63060_series_matrix.txt', 'r') as f:
    lines = f.readlines()

sample_ids_df2 = []
diagnosis_df2 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        parts = line.strip().split('\t')[1:]
        sample_ids_df2 = [p.strip('"') for p in parts]

    if line.startswith('!Sample_characteristics_ch1'):
        if 'diagnosis' in line.lower() or 'disease' in line.lower() or 'condition' in line.lower():
            parts = line.strip().split('\t')[1:]
            diagnosis_df2 = [p.strip('"') for p in parts]
            print(f"Found diagnosis line: {line[:150]}...")
            break

print(f"\ndf2 samples: {len(sample_ids_df2)}")
print(f"df2 diagnoses: {len(diagnosis_df2)}")

if diagnosis_df2:
    print("\nFirst 10 diagnoses from df2:")
    for i in range(min(10, len(diagnosis_df2))):
        print(f"  {sample_ids_df2[i]}: {diagnosis_df2[i]}")

# If no diagnosis found, show ALL characteristics lines
if not diagnosis_df1:
    print("\n⚠ No diagnosis found in df1. Showing all characteristic lines:")
    with open('GSE110226_series_matrix.txt', 'r') as f:
        for line in f:
            if line.startswith('!Sample_characteristics_ch1'):
                print(line[:200])

if not diagnosis_df2:
    print("\n⚠ No diagnosis found in df2. Showing all characteristic lines:")
    with open('GSE63060_series_matrix.txt', 'r') as f:
        for line in f:
            if line.startswith('!Sample_characteristics_ch1'):
                print(line[:200])

EXTRACTING DIAGNOSIS FROM CHARACTERISTICS

Searching df1 (GSE110226) for diagnosis...
Found diagnosis line: !Sample_characteristics_ch1	"disease state: AD (Braak III-IV)"	"disease state: AD (severe Braak V-VI)"	"disease state: AD (severe Braak V-VI)"	"diseas...

df1 samples: 20
df1 diagnoses: 20

First 10 diagnoses from df1:
  GSM2982966: disease state: AD (Braak III-IV)
  GSM2982967: disease state: AD (severe Braak V-VI)
  GSM2982968: disease state: AD (severe Braak V-VI)
  GSM2982969: disease state: AD (severe + Lewy body disease)
  GSM2982970: disease state: AD (severe Braak V-VI)
  GSM2982971: disease state: AD (severe Braak V-VI)
  GSM2982972: disease state: AD (severe Braak V-VI)
  GSM2982973: disease state: normal control
  GSM2982974: disease state: normal control
  GSM2982975: disease state: normal control

Searching df2 (GSE63060) for diagnosis...

df2 samples: 329
df2 diagnoses: 0

⚠ No diagnosis found in df2. Showing all characteristic lines:
!Sample_characteristics_ch1	"st

df1: Has "disease state: AD" or "disease state: normal control" df2: Has "status: MCI" (but we need to find AD and Control labels)

In [106]:
# df1 patients start with GSM2982...
# df2 patients start with GSM1539...

df1_patients_in_merged = [p for p in merged_data_transposed.index if p.startswith('GSM2982')]
df2_patients_in_merged = [p for p in merged_data_transposed.index if p.startswith('GSM1539')]

print(f"Patients from df1: {len(df1_patients_in_merged)}")
print(f"Patients from df2: {len(df2_patients_in_merged)}")
print(f"Total: {len(df1_patients_in_merged) + len(df2_patients_in_merged)}")

Patients from df1: 20
Patients from df2: 329
Total: 349


In [107]:
# Create df1 labels
labels_df1 = {}
for i, sample_id in enumerate(sample_ids_df1):
    diagnosis = diagnosis_df1[i]
    # Simplify to binary: AD vs Control
    if 'AD' in diagnosis:
        labels_df1[sample_id] = 'AD'
    elif 'normal control' in diagnosis or 'control' in diagnosis.lower():
        labels_df1[sample_id] = 'Control'
    else:
        labels_df1[sample_id] = diagnosis

print("df1 labels:")
print(pd.Series(labels_df1).value_counts())

df1 labels:
AD                                           7
Control                                      6
disease state: HD (grade IV)                 3
disease state: FTD                           2
disease state: FTD & motor neuron disease    1
disease state: FTD Pick's disease            1
Name: count, dtype: int64


In [108]:
with open('GSE63060_series_matrix.txt', 'r') as f:
    lines = f.readlines()

sample_ids_df2 = []
status_df2 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        parts = line.strip().split('\t')[1:]
        sample_ids_df2 = [p.strip('"') for p in parts]

    if line.startswith('!Sample_characteristics_ch1') and 'status:' in line:
        parts = line.strip().split('\t')[1:]
        status_df2 = [p.strip('"') for p in parts]
        break

# Create df2 labels
labels_df2 = {}
for i, sample_id in enumerate(sample_ids_df2):
    status = status_df2[i]
    status_value = status.split('status:')[1].strip()
    labels_df2[sample_id] = status_value

print("\ndf2 labels:")
print(pd.Series(labels_df2).value_counts())



df2 labels:
AD     145
CTL    104
MCI     80
Name: count, dtype: int64


In [112]:
print("="*60)
print("CREATING MULTI-CLASS LABELS FOR MERGED DATASET")
print("="*60)

# ============================================================
# Step 1: Extract ADNI labels from phenotype
# ============================================================

print("\nStep 1: Creating ADNI labels...")

labels_adni = {}
for i, row in adni_pheno.iterrows():
    patient_id = row['PTID']
    diagnosis = row['Diagnosis']

    if pd.notna(diagnosis):
        labels_adni[patient_id] = diagnosis  # Keep as is: AD, Control, MCI
    else:
        labels_adni[patient_id] = 'Unknown'

print(f"✓ ADNI labels created: {len(labels_adni)} patients")
print("\nADNI diagnosis distribution:")
print(pd.Series(list(labels_adni.values())).value_counts())

# ============================================================
# Step 2: Combine all labels (df1 + df2 + ADNI)
# ============================================================

print("\n" + "="*60)
print("Step 2: Combining all labels")
print("="*60)

# Combine all three
all_labels_complete = {**labels_df1, **labels_df2, **labels_adni}

print(f"Total labeled patients: {len(all_labels_complete)}")
print(f"  From df1: {len(labels_df1)}")
print(f"  From df2: {len(labels_df2)}")
print(f"  From ADNI: {len(labels_adni)}")

print("\nCombined diagnosis distribution:")
combined_diag = pd.Series(list(all_labels_complete.values()))
print(combined_diag.value_counts())

# ============================================================
# Step 3: Match labels to merged_data_transposed
# ============================================================

print("\n" + "="*60)
print("Step 3: Matching labels to merged dataset")
print("="*60)

all_patient_ids = merged_data_transposed.index.tolist()
print(f"Patients in merged dataset: {len(all_patient_ids)}")

diagnosis_list = []
unknown_count = 0

for patient_id in all_patient_ids:
    if patient_id in all_labels_complete:
        diagnosis_list.append(all_labels_complete[patient_id])
    else:
        diagnosis_list.append('Unknown')
        unknown_count += 1

print(f"✓ Matched labels for all patients")
print(f"Unknown labels: {unknown_count}")

CREATING MULTI-CLASS LABELS FOR MERGED DATASET

Step 1: Creating ADNI labels...
✓ ADNI labels created: 744 patients

ADNI diagnosis distribution:
MCI        439
Control    261
Unknown     44
Name: count, dtype: int64

Step 2: Combining all labels
Total labeled patients: 1093
  From df1: 20
  From df2: 329
  From ADNI: 744

Combined diagnosis distribution:
MCI                                          519
Control                                      267
AD                                           152
CTL                                          104
Unknown                                       44
disease state: HD (grade IV)                   3
disease state: FTD                             2
disease state: FTD Pick's disease              1
disease state: FTD & motor neuron disease      1
Name: count, dtype: int64

Step 3: Matching labels to merged dataset
Patients in merged dataset: 1093
✓ Matched labels for all patients
Unknown labels: 0


Merging Control and CTL → "Control"

Marking HD and FTD as "Unknown" (too few samples)

In [113]:
# ============================================================
# Step 1: Standardize diagnosis labels
# ============================================================

def clean_diagnosis(diagnosis):
    """Standardize and clean diagnosis labels"""

    # Convert to string and strip whitespace
    diag = str(diagnosis).strip()

    # Standardize Control/CTL
    if diag in ['Control', 'CTL']:
        return 'Control'

    # Keep AD as is
    elif diag == 'AD':
        return 'AD'

    # Keep MCI as is
    elif diag == 'MCI':
        return 'MCI'

    # Mark HD and FTD as Unknown (too few samples)
    elif 'HD' in diag or 'FTD' in diag:
        return 'Unknown'

    # Already unknown
    elif diag == 'Unknown' or diag == 'nan':
        return 'Unknown'

    # Any other diagnosis → Unknown
    else:
        return 'Unknown'

# Apply cleaning to all labels
all_labels_cleaned = {pid: clean_diagnosis(diag)
                      for pid, diag in all_labels_complete.items()}

print("After cleaning:")
print(pd.Series(list(all_labels_cleaned.values())).value_counts())

# ============================================================
# Step 2: Match to merged dataset
# ============================================================

print("\n" + "="*60)
print("MATCHING TO MERGED DATASET")
print("="*60)

all_patient_ids = merged_data_transposed.index.tolist()
diagnosis_list_clean = []

for patient_id in all_patient_ids:
    if patient_id in all_labels_cleaned:
        diagnosis_list_clean.append(all_labels_cleaned[patient_id])
    else:
        diagnosis_list_clean.append('Unknown')

diagnosis_series_clean = pd.Series(diagnosis_list_clean)

print("Final diagnosis distribution:")
print(diagnosis_series_clean.value_counts())

After cleaning:
MCI        519
Control    371
AD         152
Unknown     51
Name: count, dtype: int64

MATCHING TO MERGED DATASET
Final diagnosis distribution:
MCI        519
Control    371
AD         152
Unknown     51
Name: count, dtype: int64


In [118]:
# Create final dataset with categorical Diagnosis column
final_dataset_categorical = merged_data_transposed.copy()
final_dataset_categorical['Diagnosis'] = diagnosis_list_clean

print("\n✓ Creating dataset with categorical labels...")
print(f"Shape: {final_dataset_categorical.shape}")
print(f"  Patients: {final_dataset_categorical.shape[0]}")
print(f"  Genes: {final_dataset_categorical.shape[1] - 1:,}")
print(f"  Diagnosis column: Categorical (AD, Control, MCI, Unknown)")

print("\nSample:")
print(final_dataset_categorical[['Diagnosis']].head(20))


✓ Creating dataset with categorical labels...
Shape: (1093, 14908)
  Patients: 1093
  Genes: 14,907
  Diagnosis column: Categorical (AD, Control, MCI, Unknown)

Sample:
Gene_Symbol Diagnosis
GSM2982966         AD
GSM2982967         AD
GSM2982968         AD
GSM2982969         AD
GSM2982970         AD
GSM2982971         AD
GSM2982972         AD
GSM2982973    Control
GSM2982974    Control
GSM2982975    Control
GSM2982976    Control
GSM2982977    Control
GSM2982978    Control
GSM2982979    Unknown
GSM2982980    Unknown
GSM2982981    Unknown
GSM2982982    Unknown
GSM2982983    Unknown
GSM2982984    Unknown
GSM2982985    Unknown


In [119]:
final_dataset_categorical.head()

Gene_Symbol,PPP1R14C,ZNF713,PPIA,PRELP,CMTM2,ODF4,SNX11,ZIM3,CKAP2L,CNST,...,ZNF43,SPINK2,ATF2,PSMD10,CSGALNACT1,PRSS23,ZNF630,SFT2D2,TMEM175,Diagnosis
GSM2982966,9.8348,4.0247,6.8479,11.119767,3.7212,5.1035,9.7325,2.44705,4.74545,8.636275,...,6.0461,9.6186,5.6665,9.4669,6.2479,7.628733,8.9419,7.8428,8.3567,AD
GSM2982967,10.2328,3.2438,6.8912,11.902267,4.8252,5.5432,9.6834,2.7035,4.91425,8.848375,...,6.113733,9.4342,5.6871,9.19605,8.7756,8.1483,8.0731,7.6957,8.6026,AD
GSM2982968,9.4136,2.6882,6.8627,11.1806,6.0325,5.3749,9.4165,2.68285,4.5491,8.7817,...,5.779767,8.6129,5.5993,9.6729,8.5262,8.018533,8.3812,8.0462,8.185,AD
GSM2982969,9.5476,3.1142,6.8061,11.2342,5.686,5.823,9.7334,2.5605,5.4174,8.74295,...,6.2799,8.9918,5.7373,9.47105,8.312,8.385367,8.0559,8.491,8.0209,AD
GSM2982970,9.5162,2.8545,6.679,11.684333,5.5814,5.3972,9.3995,2.3826,4.66085,8.6541,...,5.682767,9.1384,5.4833,9.12835,8.1771,7.607267,8.2137,7.9665,8.1095,AD


Extract clinical features from df1 & df2

In [123]:
# ============================================================
# 1. Extract from df1 (GSE110226)
# ============================================================

print("\n1. EXTRACTING FROM DF1 (GSE110226)")
print("="*60)

with open('GSE110226_series_matrix.txt', 'r') as f:
    lines = f.readlines()

sample_ids_df1 = []
age_df1 = []
sex_df1 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        parts = line.strip().split('\t')[1:]
        sample_ids_df1 = [p.strip('"') for p in parts]

    if line.startswith('!Sample_characteristics_ch1') and 'age:' in line:
        parts = line.strip().split('\t')[1:]
        age_df1 = [p.strip('"').split('age:')[1].strip() if 'age:' in p else np.nan for p in parts]

    if line.startswith('!Sample_characteristics_ch1') and 'Sex:' in line:
        parts = line.strip().split('\t')[1:]
        sex_df1 = [p.strip('"').split('Sex:')[1].strip() if 'Sex:' in p else np.nan for p in parts]

clinical_df1 = pd.DataFrame({
    'Patient_ID': sample_ids_df1,
    'Age': age_df1,
    'Sex': sex_df1
})

print(f"✓ df1: {len(clinical_df1)} patients")
print(clinical_df1.head())

# ============================================================
# 2. Extract from df2 (GSE63060)
# ============================================================

print("\n2. EXTRACTING FROM DF2 (GSE63060)")
print("="*60)

with open('GSE63060_series_matrix.txt', 'r') as f:
    lines = f.readlines()

sample_ids_df2 = []
age_df2 = []
gender_df2 = []

for line in lines:
    if line.startswith('!Sample_geo_accession'):
        parts = line.strip().split('\t')[1:]
        sample_ids_df2 = [p.strip('"') for p in parts]

    if line.startswith('!Sample_characteristics_ch1') and 'age:' in line:
        parts = line.strip().split('\t')[1:]
        age_df2 = [p.strip('"').split('age:')[1].strip() if 'age:' in p else np.nan for p in parts]

    if line.startswith('!Sample_characteristics_ch1') and 'gender:' in line:
        parts = line.strip().split('\t')[1:]
        gender_df2 = [p.strip('"').split('gender:')[1].strip() if 'gender:' in p else np.nan for p in parts]

clinical_df2 = pd.DataFrame({
    'Patient_ID': sample_ids_df2,
    'Age': age_df2,
    'Sex': gender_df2
})

print(f"✓ df2: {len(clinical_df2)} patients")
print(clinical_df2.head())


1. EXTRACTING FROM DF1 (GSE110226)
✓ df1: 20 patients
   Patient_ID Age     Sex
0  GSM2982966  74  female
1  GSM2982967  84  female
2  GSM2982968  84    male
3  GSM2982969  84  female
4  GSM2982970  89    male

2. EXTRACTING FROM DF2 (GSE63060)
✓ df2: 329 patients
   Patient_ID Age     Sex
0  GSM1539080  65  Female
1  GSM1539081  66  Female
2  GSM1539082  67  Female
3  GSM1539083  67  Female
4  GSM1539084  67  Female


In [124]:
# ============================================================
# 3. Extract from ADNI (already have it!)
# ============================================================

print("\n3. EXTRACTING FROM ADNI")
print("="*60)

clinical_adni = adni_pheno[['PTID', 'AGE', 'PTGENDER']].copy()
clinical_adni.columns = ['Patient_ID', 'Age', 'Sex']

print(f"✓ ADNI: {len(clinical_adni)} patients")
print(clinical_adni.head())


3. EXTRACTING FROM ADNI
✓ ADNI: 744 patients
   Patient_ID   Age     Sex
0  002_S_0413  76.3  Female
1  002_S_0685  89.6  Female
2  002_S_0729  65.1  Female
3  002_S_1155  57.8    Male
4  002_S_1261  71.1  Female


In [125]:
# ============================================================
# 4. Combine all clinical features
# ============================================================

print("\n" + "="*60)
print("COMBINING ALL CLINICAL FEATURES")
print("="*60)

clinical_all = pd.concat([clinical_df1, clinical_df2, clinical_adni], ignore_index=True)
clinical_all.set_index('Patient_ID', inplace=True)

print(f"Total patients with clinical data: {len(clinical_all)}")
print("\nSample:")
print(clinical_all.head(10))


COMBINING ALL CLINICAL FEATURES
Total patients with clinical data: 1093

Sample:
           Age     Sex
Patient_ID            
GSM2982966  74  female
GSM2982967  84  female
GSM2982968  84    male
GSM2982969  84  female
GSM2982970  89    male
GSM2982971  73    male
GSM2982972  70    male
GSM2982973  62    male
GSM2982974  55  female
GSM2982975  37    male


Process Age (z-score standardization)

In [129]:
# Convert to numeric
clinical_all['Age'] = pd.to_numeric(clinical_all['Age'], errors='coerce')

print(f"Age statistics:")
print(f"  Mean: {clinical_all['Age'].mean():.1f}")
print(f"  Std: {clinical_all['Age'].std():.1f}")
print(f"  Min: {clinical_all['Age'].min():.0f}")
print(f"  Max: {clinical_all['Age'].max():.0f}")
print(f"  Missing: {clinical_all['Age'].isna().sum()}")

# Z-score standardization
scaler = StandardScaler()
age_standardized = scaler.fit_transform(clinical_all[['Age']])
clinical_all['Age_Zscore'] = age_standardized

print(f"\n✓ Age standardized")
print(f"  New mean: {clinical_all['Age_Zscore'].mean():.4f}")
print(f"  New std: {clinical_all['Age_Zscore'].std():.4f}")


Age statistics:
  Mean: 73.4
  Std: 7.0
  Min: 37
  Max: 91
  Missing: 0

✓ Age standardized
  New mean: -0.0000
  New std: 1.0005


Process Sex (binary encoding)

In [130]:
# Standardize sex values (Male/male, Female/female)
clinical_all['Sex'] = clinical_all['Sex'].str.lower()

print(f"Sex distribution:")
print(clinical_all['Sex'].value_counts())

# Binary encode: Male=1, Female=0
clinical_all['Sex_Male'] = (clinical_all['Sex'] == 'male').astype(int)

print(f"\n✓ Sex encoded (Male=1, Female=0)")

Sex distribution:
Sex
male      549
female    544
Name: count, dtype: int64

✓ Sex encoded (Male=1, Female=0)


In [131]:
#Match to merged dataset and add
# Get patient IDs from merged dataset
patient_ids = merged_data_transposed.index.tolist()

# Match clinical features
clinical_matched = clinical_all.loc[patient_ids][['Age_Zscore', 'Sex_Male']]

print(f"✓ Matched clinical features")
print(f"Shape: {clinical_matched.shape}")
print("\nSample:")
print(clinical_matched.head(10))

# Add to dataset (with categorical Diagnosis)
final_dataset_with_clinical = merged_data_transposed.copy()
final_dataset_with_clinical['Age_Zscore'] = clinical_matched['Age_Zscore'].values
final_dataset_with_clinical['Sex_Male'] = clinical_matched['Sex_Male'].values
final_dataset_with_clinical['Diagnosis'] = diagnosis_list_clean

print("\n" + "="*60)
print("FINAL DATASET WITH CLINICAL FEATURES")
print("="*60)

print(f"Shape: {final_dataset_with_clinical.shape}")
print(f"  Patients: {final_dataset_with_clinical.shape[0]}")
print(f"  Genes: {final_dataset_with_clinical.shape[1] - 3:,}")
print(f"  Clinical features: 2 (Age_Zscore, Sex_Male)")
print(f"  Label: 1 (Diagnosis - categorical)")

print("\nColumn breakdown:")
print(f"  - Genes: columns 0 to {final_dataset_with_clinical.shape[1] - 4}")
print(f"  - Age_Zscore: column {final_dataset_with_clinical.shape[1] - 3}")
print(f"  - Sex_Male: column {final_dataset_with_clinical.shape[1] - 2}")
print(f"  - Diagnosis: column {final_dataset_with_clinical.shape[1] - 1}")

✓ Matched clinical features
Shape: (1093, 2)

Sample:
            Age_Zscore  Sex_Male
Patient_ID                      
GSM2982966    0.082439         0
GSM2982967    1.509525         0
GSM2982968    1.509525         1
GSM2982969    1.509525         0
GSM2982970    2.223068         1
GSM2982971   -0.060269         1
GSM2982972   -0.488395         1
GSM2982973   -1.630064         1
GSM2982974   -2.629024         0
GSM2982975   -5.197779         1

FINAL DATASET WITH CLINICAL FEATURES
Shape: (1093, 14910)
  Patients: 1093
  Genes: 14,907
  Clinical features: 2 (Age_Zscore, Sex_Male)
  Label: 1 (Diagnosis - categorical)

Column breakdown:
  - Genes: columns 0 to 14906
  - Age_Zscore: column 14907
  - Sex_Male: column 14908
  - Diagnosis: column 14909


In [132]:
final_dataset_with_clinical.head()

Gene_Symbol,PPP1R14C,ZNF713,PPIA,PRELP,CMTM2,ODF4,SNX11,ZIM3,CKAP2L,CNST,...,ATF2,PSMD10,CSGALNACT1,PRSS23,ZNF630,SFT2D2,TMEM175,Age_Zscore,Sex_Male,Diagnosis
GSM2982966,9.8348,4.0247,6.8479,11.119767,3.7212,5.1035,9.7325,2.44705,4.74545,8.636275,...,5.6665,9.4669,6.2479,7.628733,8.9419,7.8428,8.3567,0.082439,0,AD
GSM2982967,10.2328,3.2438,6.8912,11.902267,4.8252,5.5432,9.6834,2.7035,4.91425,8.848375,...,5.6871,9.19605,8.7756,8.1483,8.0731,7.6957,8.6026,1.509525,0,AD
GSM2982968,9.4136,2.6882,6.8627,11.1806,6.0325,5.3749,9.4165,2.68285,4.5491,8.7817,...,5.5993,9.6729,8.5262,8.018533,8.3812,8.0462,8.185,1.509525,1,AD
GSM2982969,9.5476,3.1142,6.8061,11.2342,5.686,5.823,9.7334,2.5605,5.4174,8.74295,...,5.7373,9.47105,8.312,8.385367,8.0559,8.491,8.0209,1.509525,0,AD
GSM2982970,9.5162,2.8545,6.679,11.684333,5.5814,5.3972,9.3995,2.3826,4.66085,8.6541,...,5.4833,9.12835,8.1771,7.607267,8.2137,7.9665,8.1095,2.223068,1,AD


In [148]:
print(f"  Total patients: {len(final_dataset_with_clinical)}")
# Filter out Unknown patients
final_dataset_clean = final_dataset_with_clinical[final_dataset_with_clinical['Diagnosis'] != 'Unknown'].copy()
print(f"\nAfter removal:")
print(f"  Total patients: {len(final_dataset_clean)}")

  Total patients: 1093

After removal:
  Total patients: 1042


Data transfortation.

In [149]:
from sklearn.preprocessing import quantile_transform

# Separate features and target
X = final_dataset_clean.drop(['Diagnosis','Age_Zscore','Sex_Male'], axis=1)
clinical_features = final_dataset_clean[['Age_Zscore', 'Sex_Male']]  # Double brackets!
y = final_dataset_clean['Diagnosis']  # Single bracket for one column

In [150]:
X.shape

(1042, 14907)

In [151]:
#LOG2 TRANSFORMATION CHECK"


print(f"Data range: {X.min().min():.2f} to {X.max().max():.2f}")

if X.min().min() < 0:
    print("✓ Data contains negative values - likely already log-transformed")
    print("  Skipping log2 transformation")
    X_log = X.copy()
else:
    print("⚠ Data is all positive - applying log2 transformation")
    X_log = np.log2(X + 1)  # +1 to avoid log(0)
    print(f"  After log2: {X_log.min().min():.2f} to {X_log.max().max():.2f}")

print(f"\nX_log shape: {X_log.shape}")

Data range: 1.33 to 15.05
⚠ Data is all positive - applying log2 transformation
  After log2: 1.22 to 4.00

X_log shape: (1042, 14907)


In [152]:
# Apply quantile normalization
X_quantile = pd.DataFrame(
    quantile_transform(X_log, n_quantiles=min(X_log.shape[0], 1000), random_state=42),
    columns=X_log.columns,
    index=X_log.index
)

In [153]:
print(f"  New range: {X_quantile.min().min():.2f} to {X_quantile.max().max():.2f}")
print(f"  Shape: {X_quantile.shape}")

  New range: 0.00 to 1.00
  Shape: (1042, 14907)


Dimension Reduction

In [154]:
# Calculate variance for each gene
gene_variances = X_quantile.var(axis=0)

print(f"Gene variance statistics:")
print(f"  Mean variance: {gene_variances.mean():.4f}")
print(f"  Median variance: {gene_variances.median():.4f}")
print(f"  Min variance: {gene_variances.min():.4f}")
print(f"  Max variance: {gene_variances.max():.4f}")

Gene variance statistics:
  Mean variance: 0.0836
  Median variance: 0.0836
  Min variance: 0.0836
  Max variance: 0.0914


In [155]:
#Low-Variance Filter (Remove bottom 20%)
# Calculate variance for each gene
gene_variances = X_quantile.var(axis=0)

print(f"Gene variance statistics:")
print(f"  Mean variance: {gene_variances.mean():.4f}")
print(f"  Median variance: {gene_variances.median():.4f}")
print(f"  Min variance: {gene_variances.min():.4f}")
print(f"  Max variance: {gene_variances.max():.4f}")

# Calculate 20th percentile threshold
variance_threshold = gene_variances.quantile(0.20)
print(f"\n20th percentile variance: {variance_threshold:.4f}")

# Keep genes with variance > 20th percentile (remove bottom 20%)
genes_to_keep = gene_variances[gene_variances > variance_threshold].index
X_filtered = X_quantile[genes_to_keep]

Gene variance statistics:
  Mean variance: 0.0836
  Median variance: 0.0836
  Min variance: 0.0836
  Max variance: 0.0914

20th percentile variance: 0.0836


In [156]:
X_filtered.shape

(1042, 11925)

In [157]:
## XGBOOST FEATURE SELECTION
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

In [158]:
y.head()

Unnamed: 0,Diagnosis
GSM2982966,AD
GSM2982967,AD
GSM2982968,AD
GSM2982969,AD
GSM2982970,AD


Convert Diagnosis categorical column to numeric column

In [159]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("✓ Encoding complete!")

# Show the mapping
print("\nEncoding mapping:")
encoding_map = dict(zip(le.classes_, le.transform(le.classes_)))

for diagnosis, code in sorted(encoding_map.items(), key=lambda x: x[1]):
    count = (y == diagnosis).sum()
    print(f"  {code} = {diagnosis}: {count} patients")

print(f"\ny_encoded sample (first 20):")
print(y_encoded[:20])

print(f"\nOriginal y sample (first 20):")
print(y.head(20).tolist())

# Save encoding reference
encoding_ref = pd.DataFrame({
    'Numeric_Code': list(encoding_map.values()),
    'Diagnosis': list(encoding_map.keys()),
    'Patient_Count': [int((y_encoded == code).sum()) for code in encoding_map.values()]
})

✓ Encoding complete!

Encoding mapping:
  0 = AD: 152 patients
  1 = Control: 371 patients
  2 = MCI: 519 patients

y_encoded sample (first 20):
[0 0 0 0 0 0 0 1 1 1 1 1 1 2 2 2 2 2 2 2]

Original y sample (first 20):
['AD', 'AD', 'AD', 'AD', 'AD', 'AD', 'AD', 'Control', 'Control', 'Control', 'Control', 'Control', 'Control', 'MCI', 'MCI', 'MCI', 'MCI', 'MCI', 'MCI', 'MCI']


In [161]:
# Split data for validation
# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X_filtered,
    y_encoded,  # ← Use y_encoded, not y
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # ← Also use y_encoded here
)
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # For multi-class classification
    num_class=len(encoding_map),  # Number of classes (3: AD, Control, MCI)
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss'  # Multi-class log loss
)

xgb_model.fit(X_train, y_train)

print(f"\n✓ Training complete!")
print(f"  Train accuracy: {xgb_model.score(X_train, y_train):.3f}")
print(f"  Val accuracy: {xgb_model.score(X_val, y_val):.3f}")


✓ Training complete!
  Train accuracy: 1.000
  Val accuracy: 0.550


In [162]:
# Get feature importance
feature_importance = pd.DataFrame({
    'Gene': X_filtered.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

In [163]:
feature_importance.head(10)

Unnamed: 0,Gene,Importance
10859,FAM160A2,0.014049
4500,RHOT2,0.010731
11235,NDUFAF7,0.007214
3274,STXBP1,0.004792
9028,ARHGAP11A,0.004037
5890,SETX,0.003935
5260,ILKAP,0.003432
11831,GDF10,0.00322
4127,SLC35E1,0.003155
4568,APOO,0.003147


In [164]:
# Select top 1000 genes by XGBoost importance
n_features = 1000
top_genes = feature_importance.head(n_features)['Gene'].tolist()

X_selected = X_filtered[top_genes]

X_selected.shape

(1042, 1000)

In [165]:
# Important: Make sure all are aligned (same patients)

# They should all have same index (patient IDs)
print(f"X_selected index length: {len(X_selected.index)}")
print(f"clinical_features index length: {len(clinical_features.index)}")
print(f"y index length: {len(y.index)}")

# Check if they're the same patients
if X_selected.index.equals(clinical_features.index) and X_selected.index.equals(y.index):
    print("✓ All aligned! Same patient order")
else:
    print("⚠ Different patient orders - need to align first")
    # Align them to X_selected
    clinical_features = clinical_features.loc[X_selected.index]
    y = y.loc[X_selected.index]
    print("✓ Aligned to X_selected patient order")

X_selected index length: 1042
clinical_features index length: 1042
y index length: 1042
✓ All aligned! Same patient order


In [166]:
# Merge all components
# Combine: Genes + Clinical + Diagnosis
final_complete_dataset = pd.concat([X_selected, clinical_features, y], axis=1)

print(f"✓ Merged successfully!")
print(f"\nFinal dataset shape: {final_complete_dataset.shape}")
print(f"  Patients (rows): {final_complete_dataset.shape[0]}")
print(f"  Genes: {X_selected.shape[1]:,}")
print(f"  Clinical features: {clinical_features.shape[1]}")
print(f"  Diagnosis column: 1")
print(f"  Total columns: {final_complete_dataset.shape[1]}")

✓ Merged successfully!

Final dataset shape: (1042, 1003)
  Patients (rows): 1042
  Genes: 1,000
  Clinical features: 2
  Diagnosis column: 1
  Total columns: 1003


In [167]:
final_complete_dataset.head()

Unnamed: 0,FAM160A2,RHOT2,NDUFAF7,STXBP1,ARHGAP11A,SETX,ILKAP,GDF10,SLC35E1,APOO,...,DBF4B,DENND2D,ATP6V1B1,GATA2,EIF4ENIF1,KDELR2,LOXL3,Age_Zscore,Sex_Male,Diagnosis
GSM2982966,0.97991,0.744904,0.991405,0.99901,0.672667,0.940844,0.995654,0.673113,0.991132,0.998003,...,0.675902,0.011022,0.679418,0.945079,1.0,0.988076,0.943005,0.082439,0,AD
GSM2982967,0.894483,0.777923,0.989839,0.993379,0.683552,0.996013,0.999142,0.672619,0.952954,0.992014,...,0.678733,0.009352,0.683779,0.922055,0.999012,0.991215,0.998009,1.509525,0,AD
GSM2982968,0.947621,0.712432,0.996389,0.997018,0.673129,1.0,0.996018,0.675398,0.971699,0.993897,...,0.672664,0.010831,0.682122,0.904878,0.997155,0.992876,0.996477,1.509525,1,AD
GSM2982969,0.999021,0.695687,0.997023,0.998043,0.683687,0.976953,1.0,0.676675,0.913708,0.989651,...,0.683691,0.005792,0.678439,0.938832,0.995481,0.999035,0.994199,1.509525,0,AD
GSM2982970,0.888728,0.763736,1.0,0.995479,0.675271,0.998027,0.994035,0.679106,0.990422,0.99058,...,0.673317,0.007632,0.682803,0.781802,0.996034,0.996169,0.993026,2.223068,1,AD


Our gene expression preprocessing pipeline consisted of three sequential steps: log2 transformation, quantile normalization, and low-variance filtering.

Clinical features (age and sex) were processed differently from gene expression data due to their distinct measurement scales and statistical properties.

Features of the same measurement type (all genes) receive identical normalization to ensure comparability. Features of different types (clinical vs. genetic) require type-specific transformations to place them on compatible scales for machine learning.

Class Imbalance

In [168]:
# Check diagnosis distribution
print("Diagnosis Distribution:")
print(final_complete_dataset['Diagnosis'].value_counts())

# Calculate percentages
diagnosis_counts = final_complete_dataset['Diagnosis'].value_counts()
total_patients = len(final_complete_dataset)

print("\n" + "="*60)
print("CLASS DISTRIBUTION (Percentage)")
print("="*60)

for diagnosis, count in diagnosis_counts.items():
    percentage = (count / total_patients) * 100
    print(f"  {diagnosis}: {count} patients ({percentage:.1f}%)")

Diagnosis Distribution:
Diagnosis
MCI        519
Control    371
AD         152
Name: count, dtype: int64

CLASS DISTRIBUTION (Percentage)
  MCI: 519 patients (49.8%)
  Control: 371 patients (35.6%)
  AD: 152 patients (14.6%)


In [169]:
# Calculate imbalance ratios
# Get majority and minority classes
max_class = diagnosis_counts.idxmax()
min_class = diagnosis_counts.idxmin()
max_count = diagnosis_counts.max()
min_count = diagnosis_counts.min()

imbalance_ratio = max_count / min_count

print(f"Majority class: {max_class} ({max_count} patients)")
print(f"Minority class: {min_class} ({min_count} patients)")
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

Majority class: MCI (519 patients)
Minority class: AD (152 patients)
Imbalance ratio: 3.41:1


In [170]:
from imblearn.over_sampling import SMOTE

# Separate features and diagnosis
X_all_features = final_complete_dataset.drop('Diagnosis', axis=1)
y_categorical = final_complete_dataset['Diagnosis']

le = LabelEncoder()
y_numeric = le.fit_transform(y_categorical)

print(f"✓ Encoded to numeric")
print(f"Encoding mapping:")
for cls, code in zip(le.classes_, le.transform(le.classes_)):
    count = (y_numeric == code).sum()
    print(f"  {code} = {cls}: {count} patients")


# Step 3: Apply SMOTE

smote = SMOTE(random_state=42)
X_balanced, y_balanced_numeric = smote.fit_resample(X_all_features, y_numeric)

print(f"\n✓ SMOTE complete!")

print(f"\nBefore SMOTE:")
print(f"  Samples: {len(X_all_features)}")
for cls, code in zip(le.classes_, le.transform(le.classes_)):
    count = (y_numeric == code).sum()
    print(f"    {cls}: {count}")

print(f"\nAfter SMOTE:")
print(f"  Samples: {len(X_balanced)}")
for cls, code in zip(le.classes_, le.transform(le.classes_)):
    count = (y_balanced_numeric == code).sum()
    print(f"    {cls}: {count}")


# Step 4: Convert back to DataFrame with categorical labels

print("\n" + "="*60)
print("Step 4: Creating final balanced dataset")
print("="*60)

# Convert to DataFrame
X_balanced_df = pd.DataFrame(X_balanced, columns=X_all_features.columns)

# Decode y back to categorical
y_balanced_categorical = le.inverse_transform(y_balanced_numeric)

# Combine
final_balanced_dataset = X_balanced_df.copy()
final_balanced_dataset['Diagnosis'] = y_balanced_categorical

print(f"✓ Final balanced dataset created!")
print(f"  Shape: {final_balanced_dataset.shape}")
print(f"  Patients: {final_balanced_dataset.shape[0]}")
print(f"  Features: {final_balanced_dataset.shape[1] - 1:,}")

print("\nFinal diagnosis distribution:")
print(final_balanced_dataset['Diagnosis'].value_counts())

# Show sample
print("\nSample (last 5 columns including Diagnosis):")
print(final_balanced_dataset.iloc[:10, -5:])

✓ Encoded to numeric
Encoding mapping:
  0 = AD: 152 patients
  1 = Control: 371 patients
  2 = MCI: 519 patients

✓ SMOTE complete!

Before SMOTE:
  Samples: 1042
    AD: 152
    Control: 371
    MCI: 519

After SMOTE:
  Samples: 1557
    AD: 519
    Control: 519
    MCI: 519

Step 4: Creating final balanced dataset
✓ Final balanced dataset created!
  Shape: (1557, 1003)
  Patients: 1557
  Features: 1,002

Final diagnosis distribution:
Diagnosis
AD         519
Control    519
MCI        519
Name: count, dtype: int64

Sample (last 5 columns including Diagnosis):
     KDELR2     LOXL3  Age_Zscore  Sex_Male Diagnosis
0  0.988076  0.943005    0.082439         0        AD
1  0.991215  0.998009    1.509525         0        AD
2  0.992876  0.996477    1.509525         1        AD
3  0.999035  0.994199    1.509525         0        AD
4  0.996169  0.993026    2.223068         1        AD
5  0.989865  1.000000   -0.060269         1        AD
6  0.994059  0.995085   -0.488395         1        AD


In [171]:
final_balanced_dataset.shape

(1557, 1003)

In [172]:
final_balanced_dataset.head()

Unnamed: 0,FAM160A2,RHOT2,NDUFAF7,STXBP1,ARHGAP11A,SETX,ILKAP,GDF10,SLC35E1,APOO,...,DBF4B,DENND2D,ATP6V1B1,GATA2,EIF4ENIF1,KDELR2,LOXL3,Age_Zscore,Sex_Male,Diagnosis
0,0.97991,0.744904,0.991405,0.99901,0.672667,0.940844,0.995654,0.673113,0.991132,0.998003,...,0.675902,0.011022,0.679418,0.945079,1.0,0.988076,0.943005,0.082439,0,AD
1,0.894483,0.777923,0.989839,0.993379,0.683552,0.996013,0.999142,0.672619,0.952954,0.992014,...,0.678733,0.009352,0.683779,0.922055,0.999012,0.991215,0.998009,1.509525,0,AD
2,0.947621,0.712432,0.996389,0.997018,0.673129,1.0,0.996018,0.675398,0.971699,0.993897,...,0.672664,0.010831,0.682122,0.904878,0.997155,0.992876,0.996477,1.509525,1,AD
3,0.999021,0.695687,0.997023,0.998043,0.683687,0.976953,1.0,0.676675,0.913708,0.989651,...,0.683691,0.005792,0.678439,0.938832,0.995481,0.999035,0.994199,1.509525,0,AD
4,0.888728,0.763736,1.0,0.995479,0.675271,0.998027,0.994035,0.679106,0.990422,0.99058,...,0.673317,0.007632,0.682803,0.781802,0.996034,0.996169,0.993026,2.223068,1,AD


In [173]:
# Save
final_balanced_dataset.to_csv('Final_Balanced_Dataset_SMOTE.csv', index=False)