In [8]:
import GEOparse
import pandas as pd

In [None]:
# Fetch the GEO series record; GEOparse stores the downloaded file under the
# directory given as `destdir`.
GEO_ACCESSION = "GSE183947"
GEO_CACHE_DIR = "GEO"
gse = GEOparse.get_GEO(geo=GEO_ACCESSION, destdir=GEO_CACHE_DIR)

In [5]:
# Show the series-level metadata dictionary (title, summary, design, ...)
series_metadata = gse.metadata
print(series_metadata)

{'title': ['Identification of five cytotoxicity-related genes involved in the progression of breast cancer'], 'geo_accession': ['GSE183947'], 'status': ['Public on Sep 15 2021'], 'submission_date': ['Sep 11 2021'], 'last_update_date': ['Jan 31 2022'], 'pubmed_id': ['35046993'], 'summary': ['Breast cancer is one of the deadly tumors in women, and its incidence continues to increase. This study aimed to identify novel therapeutic molecules using RNA sequencing (RNAseq) data of breast cancer from our hospitals.'], 'overall_design': ['30 pairs of normal and cancerous tissues from the same excision were collected from the Affiliated Cancer Hospital of Guangzhou Medical University, the Affiliated Cancer Hospital of Sun Yat-sen University and Guangzhou Army General Hospital. RNA sequencing was performed by Guangzhou Huayin Health medical Group. Original reads of RNA sequencing data were normalized as FPKM data.'], 'type': ['Expression profiling by high throughput sequencing'], 'contributor': 

In [None]:
# Dump the metadata dictionary of every sample (GSM) record in the series.
# Only the sample objects are needed here, so iterate over the values directly.
for sample in gse.gsms.values():
    print(sample.metadata)

In [13]:
# Number of sample records attached to the series (expected: 60)
sample_count = len(gse.gsms)
print(sample_count)

# Available series-level metadata fields, then the supplementary file entry
# (None is printed if the series has no such field)
print(gse.metadata.keys())
supplementary_files = gse.metadata.get("supplementary_file")
print(supplementary_files)


60
dict_keys(['title', 'geo_accession', 'status', 'submission_date', 'last_update_date', 'pubmed_id', 'summary', 'overall_design', 'type', 'contributor', 'sample_id', 'contact_name', 'contact_institute', 'contact_address', 'contact_city', 'contact_state', 'contact_zip/postal_code', 'contact_country', 'supplementary_file', 'platform_id', 'platform_taxid', 'sample_taxid', 'relation'])
['ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE183nnn/GSE183947/suppl/GSE183947_fpkm.csv.gz']


In [55]:
import pandas as pd
import requests
import gzip
from io import BytesIO

# Supplementary FPKM matrix of GSE183947. The series metadata lists this file
# under an ftp:// URL; the same path is requested over HTTPS here.
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE183nnn/GSE183947/suppl/GSE183947_fpkm.csv.gz"

# Download the archive. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (404, 5xx)
# right away instead of handing an error page to the gzip decoder.
r = requests.get(url, timeout=60)
r.raise_for_status()

# Decompress in memory and parse; the first CSV column becomes the row index
df = pd.read_csv(BytesIO(gzip.decompress(r.content)), index_col=0)

print("✅ Expression data loaded successfully!")
print("Shape:", df.shape)
print(df.head())


✅ Expression data loaded successfully!
Shape: (20246, 60)
          CA.102548  CA.104338  CA.105094  CA.109745  CA.1906415  CA.1912627  \
TSPAN6         0.93       1.97       0.00       5.45        4.52        4.75   
TNMD           0.00       0.00       0.00       0.00        0.00        0.00   
DPM1           0.00       0.43       0.00       3.43        8.45        8.53   
SCYL3          5.78       5.17       8.76       4.58        7.20        6.03   
C1orf112       2.83       6.26       3.37       6.24        5.16       13.69   

          CA.1924346  CA.1926760  CA.1927842  CA.1933414  ...  CAP.2040686  \
TSPAN6          3.96        3.58        6.41       11.89  ...         6.66   
TNMD            0.00        0.23        0.39        0.44  ...         0.12   
DPM1            7.80        7.62        6.40        6.09  ...         4.93   
SCYL3           9.05        5.37        5.92       12.45  ...         8.02   
C1orf112        6.69        5.28        7.65       13.71  ...         7

In [59]:
import GEOparse
import pandas as pd

# Parse the series record (GEOparse reuses the file in ./data when it is
# already present there)
gse = GEOparse.get_GEO("GSE183947", destdir="./data")

# One record per sample: the GSM accession plus the first entry of its
# "title" field (empty string when the sample has no title)
meta = [
    {"gsm": accession, "title": sample.metadata.get("title", [""])[0]}
    for accession, sample in gse.gsms.items()
]

# Phenotype table with columns `gsm` and `title`
pheno = pd.DataFrame(meta)
print("Pheno shape:", pheno.shape)
print(pheno.head(10))



07-Oct-2025 11:46:40 DEBUG utils - Directory ./data already exists. Skipping.
07-Oct-2025 11:46:40 INFO GEOparse - File already exist: using local version.
07-Oct-2025 11:46:40 INFO GEOparse - Parsing ./data/GSE183947_family.soft.gz: 
07-Oct-2025 11:46:40 DEBUG GEOparse - DATABASE: GeoMiame
07-Oct-2025 11:46:40 DEBUG GEOparse - SERIES: GSE183947
07-Oct-2025 11:46:40 DEBUG GEOparse - PLATFORM: GPL11154
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574685
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574686
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574687
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574688
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574689
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574690
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574691
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574692
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574693
07-Oct-2025 11:46:40 DEBUG GEOparse - SAMPLE: GSM5574694
07-Oct-2025 11:46:40 DEBU

Pheno shape: (60, 2)
          gsm        title
0  GSM5574685   tumor rep1
1  GSM5574686   tumor rep2
2  GSM5574687   tumor rep3
3  GSM5574688   tumor rep4
4  GSM5574689   tumor rep5
5  GSM5574690   tumor rep6
6  GSM5574691   tumor rep7
7  GSM5574692   tumor rep8
8  GSM5574693   tumor rep9
9  GSM5574694  tumor rep10


In [62]:
# Put the expression-matrix column labels next to the phenotype table so the
# two identifier schemes can be compared by eye
leading_columns = df.columns[:10]
print(leading_columns)
print(pheno.columns)
print(pheno.head())



Index(['CA.102548', 'CA.104338', 'CA.105094', 'CA.109745', 'CA.1906415',
       'CA.1912627', 'CA.1924346', 'CA.1926760', 'CA.1927842', 'CA.1933414'],
      dtype='object')
Index(['gsm', 'title'], dtype='object')
          gsm       title
0  GSM5574685  tumor rep1
1  GSM5574686  tumor rep2
2  GSM5574687  tumor rep3
3  GSM5574688  tumor rep4
4  GSM5574689  tumor rep5


In [63]:
# Check whether any GSM accession occurs as a substring of the first ten
# expression column names; a column is printed only when it has a hit
gsm_ids = pheno['gsm'].tolist()
for col in df.columns[:10]:
    matches = [accession for accession in gsm_ids if accession in col]
    if not matches:
        continue
    print(col, matches)


In [64]:
# Example mapping (must match your experiment).
# NOTE: only three placeholder entries are filled in. Every expression column
# without an entry here is removed from `df` by the alignment step below.
sample_map = {
    'CA.102548': {'condition': 'tumor', 'replicate': 1},
    'CA.104338': {'condition': 'tumor', 'replicate': 2},
    'CA.105094': {'condition': 'tumor', 'replicate': 3},
    # ... continue for all 60 samples
}

# Convert to a DataFrame with one row per mapped sample; the dict keys become
# the `sample_id` column. Built by chaining rather than in-place mutation so
# re-running the cell always starts from `sample_map`.
pheno_new = (
    pd.DataFrame.from_dict(sample_map, orient='index')
    .rename_axis('sample_id')
    .reset_index()
)

# Fail with an explicit message if the mapping names a column the matrix lacks
missing = [sample_id for sample_id in pheno_new['sample_id'] if sample_id not in df.columns]
if missing:
    raise KeyError(f"sample_map keys not found in expression matrix: {missing}")

# Make the truncation visible instead of silently discarding samples
unmapped = df.columns.difference(pheno_new['sample_id'])
if len(unmapped) > 0:
    print(f"WARNING: dropping {len(unmapped)} expression column(s) that have no entry in sample_map")

# Align expression data: keep only the mapped samples, in metadata order
df = df[pheno_new['sample_id']]


In [65]:
# Report the dimensions of the aligned matrix and its annotation table,
# then preview the sample labels on both sides
for label, frame in (("Expression matrix shape:", df), ("Metadata shape:", pheno_new)):
    print(label, frame.shape)
print(df.columns[:5])
print(pheno_new.head())


Expression matrix shape: (20246, 3)
Metadata shape: (3, 3)
Index(['CA.102548', 'CA.104338', 'CA.105094'], dtype='object')
   sample_id condition  replicate
0  CA.102548     tumor          1
1  CA.104338     tumor          2
2  CA.105094     tumor          3


In [68]:
import pandas as pd
import numpy as np
import requests, gzip
from io import BytesIO

from pathlib import Path

# Download FPKM CSV. The timeout prevents an indefinite hang and
# raise_for_status() reports HTTP errors before any decompression is attempted.
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE183nnn/GSE183947/suppl/GSE183947_fpkm.csv.gz"
r = requests.get(url, timeout=60)
r.raise_for_status()
with gzip.GzipFile(fileobj=BytesIO(r.content)) as f:
    expr_df = pd.read_csv(f, index_col=0)

print("Expression matrix shape:", expr_df.shape)
print("First 10 columns:", expr_df.columns[:10])

# Step 1: Create metadata from the column names rather than from hard-coded
# counts, so the table always has exactly one row per expression column.
sample_ids = expr_df.columns.tolist()
num_samples = len(sample_ids)

# The text before the first "." of a column name selects the condition.
# NOTE(review): CA = tumor and CAP = normal is inferred from the column order
# the previous version of this cell relied on (tumor columns first); confirm
# against the GEO sample records.
prefix_to_condition = {'CA': 'tumor', 'CAP': 'normal'}
prefixes = [sample_id.split('.', 1)[0] for sample_id in sample_ids]
unknown_prefixes = sorted(set(prefixes) - set(prefix_to_condition))
if unknown_prefixes:
    raise ValueError(f"Unrecognised sample prefixes in expression matrix: {unknown_prefixes}")
conditions = [prefix_to_condition[prefix] for prefix in prefixes]

# Number the samples 1..k within each condition in column order, so that
# (condition, replicate) identifies exactly one sample.
replicates = []
seen_per_condition = {}
for condition in conditions:
    seen_per_condition[condition] = seen_per_condition.get(condition, 0) + 1
    replicates.append(seen_per_condition[condition])

metadata = pd.DataFrame({
    'sample_id': sample_ids,
    'condition': conditions,
    'replicate': replicates
})
assert len(metadata) == num_samples, "metadata must have one row per sample"

# Step 2: Expression matrix columns match metadata
expr_df = expr_df[metadata['sample_id']]
assert list(expr_df.columns) == list(metadata['sample_id']), "column/metadata order mismatch"

print("Metadata shape:", metadata.shape)
print("Expression matrix aligned shape:", expr_df.shape)

# Optional: Save CSV (create the output directory first; to_csv does not)
output_dir = Path("GEO")
output_dir.mkdir(parents=True, exist_ok=True)
expr_df.to_csv(output_dir / "expression_matrix.csv")
metadata.to_csv(output_dir / "sample_metadata.csv", index=False)


Expression matrix shape: (20246, 60)
First 10 columns: Index(['CA.102548', 'CA.104338', 'CA.105094', 'CA.109745', 'CA.1906415',
       'CA.1912627', 'CA.1924346', 'CA.1926760', 'CA.1927842', 'CA.1933414'],
      dtype='object')
Metadata shape: (60, 3)
Expression matrix aligned shape: (20246, 60)


In [70]:
import numpy as np
# log2(FPKM + 1): the +1 pseudocount maps an FPKM of 0 to 0 instead of -inf
expr_log = np.log2(expr_df.add(1))

# Write the transformed matrix (with its row index) and the sample table
# (without an index column) to the GEO/ directory
expr_log.to_csv("GEO/GSE183947_fpkm_log2.csv")
metadata.to_csv("GEO/GSE183947_metadata.csv", index=False)

