In [8]:
import GEOparse
import pandas as pd

In [None]:
# Retrieve the GSE183947 series through GEOparse, with "GEO" as the
# destination directory passed to the loader.
gse = GEOparse.get_GEO(
    geo="GSE183947",
    destdir="GEO",
)

In [5]:
# Explore series-level metadata: print the raw metadata mapping of the GEO
# series loaded above (title, summary, overall design, contributors, ...).
print(gse.metadata)

{'title': ['Identification of five cytotoxicity-related genes involved in the progression of breast cancer'], 'geo_accession': ['GSE183947'], 'status': ['Public on Sep 15 2021'], 'submission_date': ['Sep 11 2021'], 'last_update_date': ['Jan 31 2022'], 'pubmed_id': ['35046993'], 'summary': ['Breast cancer is one of the deadly tumors in women, and its incidence continues to increase. This study aimed to identify novel therapeutic molecules using RNA sequencing (RNAseq) data of breast cancer from our hospitals.'], 'overall_design': ['30 pairs of normal and cancerous tissues from the same excision were collected from the Affiliated Cancer Hospital of Guangzhou Medical University, the Affiliated Cancer Hospital of Sun Yat-sen University and Guangzhou Army General Hospital. RNA sequencing was performed by Guangzhou Huayin Health medical Group. Original reads of RNA sequencing data were normalized as FPKM data.'], 'type': ['Expression profiling by high throughput sequencing'], 'contributor': 

In [6]:
# Walk over every sample (GSM) of the series and print its metadata mapping.
for gsm_name, gsm in gse.gsms.items():
    sample_metadata = gsm.metadata
    print(sample_metadata)

{'title': ['tumor rep1'], 'geo_accession': ['GSM5574685'], 'status': ['Public on Sep 15 2021'], 'submission_date': ['Sep 11 2021'], 'last_update_date': ['Sep 15 2021'], 'type': ['SRA'], 'channel_count': ['1'], 'source_name_ch1': ['breast'], 'organism_ch1': ['Homo sapiens'], 'taxid_ch1': ['9606'], 'characteristics_ch1': ['tissue: breast tumor', 'metastasis: yes', 'donor: 102548'], 'molecule_ch1': ['total RNA'], 'extract_protocol_ch1': ["Total RNA was isolated and purified using\xa0TRIzol (Life, cat.265709, CA, USA) following the manufacturer's procedure.", 'After the quality inspection of Agilent 2100 Bioanalyzer (Agilent, cat.G2939AA, CA, USA) and NanoPhotometer (Implen, cat.N60, Munich, Germany), mRNA\xa0with poly(A)\xa0is purified from 1μg total RNA using VAHTS mRNA Capture Beads with Oligo (dT) (Vazyme, cat.N401-01, Nanjing, China) through\xa0two rounds of purification. Subsequently, mRNA fragment was interrupted using VAHTS Universal V6 RNA-seq Library Prep Kit (Vazyme, cat.NR604, 

In [13]:
# Number of samples in the series; 60 is the expected count.
print(len(gse.gsms))

# Available series-level metadata fields, then the supplementary file entry.
series_metadata = gse.metadata
print(series_metadata.keys())
print(series_metadata.get("supplementary_file"))


60
dict_keys(['title', 'geo_accession', 'status', 'submission_date', 'last_update_date', 'pubmed_id', 'summary', 'overall_design', 'type', 'contributor', 'sample_id', 'contact_name', 'contact_institute', 'contact_address', 'contact_city', 'contact_state', 'contact_zip/postal_code', 'contact_country', 'supplementary_file', 'platform_id', 'platform_taxid', 'sample_taxid', 'relation'])
['ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE183nnn/GSE183947/suppl/GSE183947_fpkm.csv.gz']


In [17]:
import pandas as pd
import requests
import gzip
from io import BytesIO

# Supplementary FPKM matrix of GSE183947. Same path as the series metadata's
# "supplementary_file" entry, with the ftp:// scheme replaced by https://.
url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE183nnn/GSE183947/suppl/GSE183947_fpkm.csv.gz"

# Download the gzipped CSV. The timeout makes a stalled connection raise
# instead of hanging the kernel, and raise_for_status() turns an HTTP error
# response into an immediate HTTPError rather than letting an error page reach
# gzip.decompress() and fail with a misleading "not a gzipped file" message.
r = requests.get(url, timeout=60)
r.raise_for_status()

# Decompress in memory and parse the CSV; the first column becomes the index
# (row labels such as TSPAN6 in the output below).
df = pd.read_csv(BytesIO(gzip.decompress(r.content)), index_col=0)

# Only reached when the download and the parse both succeeded.
print("✅ Expression data loaded successfully!")
print("Shape:", df.shape)
print(df.head())


✅ Expression data loaded successfully!
Shape: (20246, 60)
          CA.102548  CA.104338  CA.105094  CA.109745  CA.1906415  CA.1912627  \
TSPAN6         0.93       1.97       0.00       5.45        4.52        4.75   
TNMD           0.00       0.00       0.00       0.00        0.00        0.00   
DPM1           0.00       0.43       0.00       3.43        8.45        8.53   
SCYL3          5.78       5.17       8.76       4.58        7.20        6.03   
C1orf112       2.83       6.26       3.37       6.24        5.16       13.69   

          CA.1924346  CA.1926760  CA.1927842  CA.1933414  ...  CAP.2040686  \
TSPAN6          3.96        3.58        6.41       11.89  ...         6.66   
TNMD            0.00        0.23        0.39        0.44  ...         0.12   
DPM1            7.80        7.62        6.40        6.09  ...         4.93   
SCYL3           9.05        5.37        5.92       12.45  ...         8.02   
C1orf112        6.69        5.28        7.65       13.71  ...         7

In [18]:
# Print the expression table; pandas abbreviates the text output, eliding the
# middle rows with "..." as seen in the output below.
print(df)

               CA.102548  CA.104338  CA.105094  CA.109745  CA.1906415  \
TSPAN6              0.93       1.97       0.00       5.45        4.52   
TNMD                0.00       0.00       0.00       0.00        0.00   
DPM1                0.00       0.43       0.00       3.43        8.45   
SCYL3               5.78       5.17       8.76       4.58        7.20   
C1orf112            2.83       6.26       3.37       6.24        5.16   
...                  ...        ...        ...        ...         ...   
RP11-1084J3.4       0.00       0.16       0.00       0.00        0.00   
RP11-944L7.5        0.00       0.00       0.00       0.00        0.00   
FLJ00388            0.00       0.00       0.00       0.83        0.00   
RP11-474G23.1       0.00       0.00       0.00       0.10        0.64   
AC005358.1          0.00       0.00       0.00       0.00        0.00   

               CA.1912627  CA.1924346  CA.1926760  CA.1927842  CA.1933414  \
TSPAN6               4.75        3.96        3

In [25]:
# Use the first sample of the series as a representative and show which
# metadata fields it carries, plus its platform identifier.
gsm = [*gse.gsms.values()][0]
first_sample_metadata = gsm.metadata
print(first_sample_metadata.keys())
print(first_sample_metadata.get("platform_id"))


dict_keys(['title', 'geo_accession', 'status', 'submission_date', 'last_update_date', 'type', 'channel_count', 'source_name_ch1', 'organism_ch1', 'taxid_ch1', 'characteristics_ch1', 'molecule_ch1', 'extract_protocol_ch1', 'description', 'data_processing', 'platform_id', 'contact_name', 'contact_institute', 'contact_address', 'contact_city', 'contact_state', 'contact_zip/postal_code', 'contact_country', 'instrument_model', 'library_selection', 'library_source', 'library_strategy', 'relation', 'supplementary_file_1', 'series_id', 'data_row_count'])
['GPL11154']


In [26]:
import pandas as pd

def parse_characteristics(characteristics):
    """Convert ``"key: value"`` strings into a ``{key: value}`` dictionary.

    Parameters
    ----------
    characteristics : iterable of str
        Entries such as ``"tissue: breast tumor"``.

    Returns
    -------
    dict
        Keys and values with surrounding whitespace stripped. Each entry is
        split on its first colon only, so a value may itself contain colons.
        Entries without a colon are skipped; if a key occurs more than once,
        the last occurrence wins.
    """
    parsed = {}
    for entry in characteristics:
        key, sep, val = entry.partition(":")
        if sep:  # empty separator means the entry had no colon
            parsed[key.strip()] = val.strip()
    return parsed


# One record per sample: accession, title, and the parsed characteristics.
meta = []

for gsm_name, gsm in gse.gsms.items():
    # `or [""]` covers both a missing title and an empty title list, which
    # would otherwise raise IndexError on the [0] lookup.
    title = (gsm.metadata.get("title") or [""])[0]

    meta.append({
        "GSM": gsm_name,
        "title": title,
        # Unpacked last: a characteristic literally named "GSM" or "title"
        # would override the fixed columns above.
        **parse_characteristics(gsm.metadata.get("characteristics_ch1") or []),
    })

pheno = pd.DataFrame(meta)
print("✅ Phenotype data shape:", pheno.shape)
print(pheno.head(10))


✅ Phenotype data shape: (60, 5)
          GSM        title        tissue metastasis    donor
0  GSM5574685   tumor rep1  breast tumor        yes   102548
1  GSM5574686   tumor rep2  breast tumor        yes   104338
2  GSM5574687   tumor rep3  breast tumor        yes   105094
3  GSM5574688   tumor rep4  breast tumor         no   109745
4  GSM5574689   tumor rep5  breast tumor         no  1906415
5  GSM5574690   tumor rep6  breast tumor        yes  1912627
6  GSM5574691   tumor rep7  breast tumor         no  1924346
7  GSM5574692   tumor rep8  breast tumor         no  1926760
8  GSM5574693   tumor rep9  breast tumor        yes  1927842
9  GSM5574694  tumor rep10  breast tumor         no  1933414
