In [129]:
# imports
import os
import GEOparse
import pandas as pd
import numpy as np

In [130]:
# load/download GEO data
# Each SOFT archive is looked up by its own file name: if it is already present
# in `path` it is parsed from disk, otherwise it is requested from GEO by
# accession. Checking the specific files (instead of counting the entries of
# the directory) keeps the cell working when the directory does not exist yet,
# when it holds unrelated files, or when only one of the two archives is there.
path = '../data/'
os.makedirs(path, exist_ok=True)  # os.listdir/os.path checks need the folder to exist

gse_file = os.path.join(path, 'GSE4115_family.soft.gz')
gds_file = os.path.join(path, 'GDS2771.soft.gz')

# series record (GSE4115)
if os.path.exists(gse_file):
  gse = GEOparse.get_GEO(filepath=gse_file)
else:
  gse = GEOparse.get_GEO(geo="GSE4115", destdir=path)

# curated dataset record (GDS2771); the name `gsd` is kept for later cells
if os.path.exists(gds_file):
  gsd = GEOparse.get_GEO(filepath=gds_file)
else:
  gsd = GEOparse.get_GEO(geo="GDS2771", destdir=path)

22-Jul-2021 00:43:27 INFO GEOparse - Parsing ../data/GSE4115_family.soft.gz: 
22-Jul-2021 00:43:27 DEBUG GEOparse - DATABASE: GeoMiame
22-Jul-2021 00:43:27 DEBUG GEOparse - SERIES: GSE4115
22-Jul-2021 00:43:27 DEBUG GEOparse - PLATFORM: GPL96
22-Jul-2021 00:43:28 DEBUG GEOparse - SAMPLE: GSM93997
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94019
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94020
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94021
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94022
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94023
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94024
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94025
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94026
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94027
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94028
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94029
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMPLE: GSM94030
22-Jul-2021 00:43:29 DEBUG GEOparse - SAMP

In [131]:
# dataset description
# Each entry is (label to print, GEO object to read from, metadata key);
# the entries are printed in the order listed.
metadata_fields = [
  ('Description', gsd, 'description'),
  ('Title', gsd, 'title'),
  ('Value Type', gsd, 'value_type'),
  ('Type', gse, 'type'),
  ('Summary', gse, 'summary'),
  ('Overall Design', gse, 'overall_design'),
]

for label, geo_record, key in metadata_fields:
  # the label goes on its own line, followed by the raw metadata value
  print(label + ':\n', geo_record.metadata[key])

Description:
 ['Analysis of large airway epithelial cells from cigarette smokers without cancer, with cancer, and with suspect lung cancer. Results provide insight into the feasibility of using gene expression to detect early stage lung cancer in smokers.']
Title:
 ['Large airway epithelial cells from cigarette smokers with suspect lung cancer']
Value Type:
 ['transformed count']
Type:
 ['Expression profiling by array']
Summary:
 ['RNA was obtained from histologically normal bronchial epithelium of smokers during time of clinical bronchoscopy from relatively accessible airway tissue. Gene expression data from smokers with lung cancer was compared with samples from smokers without lung cancer. This allowed us to generate a diagnostic gene expression profile that could distinguish the two classes. This profile could provide additional clinical benefit in diagnosing cancer amongst smokers with suspect lung cancer.', 'Keywords: Disease state analysis']
Overall Design:
 ['79 total arrays ru

In [132]:
# biomarkers: gene symbols used to select rows of the dataset table
# (order is preserved; six symbols per line)
genes = [
  'IL8', 'CD55', 'RGS1', 'PLA2G4A', 'C6', 'DEFB1',
  'TPD52', 'CD164', 'CXCL2', 'SERPINA1', 'FCGR3A', 'SPN',
  'TOB1', 'DUSP6', 'CCT2', 'PPBP', 'PPP2CA', 'ATP6AP2',
  'PTK9', 'ANXA3', 'FGF14', 'NUCKS1', 'HDGF2', 'DMD',
  'NELL2', 'ACTR2', 'CPNE3', 'FOS', 'SOX9', 'UBXD2',
  'ZC3H7B', 'CCDC81', 'ZNF354A', 'ZNF160', 'ZNF611', 'LMO4',
  'YWHAE', 'TMED2', 'DNAJC12', 'RAB1A', 'TRAM1', 'LOC653471',
  'RPL35A', 'GLT28D1', 'TSR1', 'BACH2', 'LOC153561', 'COX5B',
  'DUOX1', 'UBE2D2', 'SENP6', 'FBXW12', 'GTF2H3', 'DCLRE1C',
  'SLC39A8', 'SLC4A4', 'TMEM47', 'UBE2J1', 'FXR1', 'ARL6IP5',
  'C1orf80', 'ATP8B1', 'AD7C-NTP', 'STARD7', 'FTO', 'DKFZP434A0131',
  'FTL', 'FLJ14346', 'PRR11', 'KIAA0738', 'ALMS1', 'LOC152719',
]

# number of symbols in the list (shown as the cell output)
len(genes)

72

In [133]:
# selected data for analysis
# 1) keep only the first table row for every IDENTIFIER value
unique_rows = gsd.table.drop_duplicates(subset=['IDENTIFIER'])
# 2) keep the rows whose IDENTIFIER is one of the symbols in `genes`
selected_rows = unique_rows[unique_rows['IDENTIFIER'].isin(genes)]

# the identifiers of the kept rows become the column labels after transposing
cols = selected_rows['IDENTIFIER']

# 3) drop the two annotation columns and flip the table so that the remaining
#    columns of the GEO table (GSM... ids in the output below) become rows
data = selected_rows.drop(columns=['ID_REF', 'IDENTIFIER']).transpose()
data.columns = cols

# 4) attach the 'disease state' annotation as the CLASS column; pandas aligns
#    it on the row index (presumably the sample ids; verify against gsd.columns)
data['CLASS'] = gsd.columns['disease state']

data.head()

IDENTIFIER,STARD7,TMED2,ACTR2,ARL6IP5,UBE2D2,TRAM1,ATP6AP2,FXR1,TPD52,CD55,...,UBE2J1,TSR1,DNAJC12,PRR11,DCLRE1C,CCDC81,BACH2,FGF14,GTF2H3,CLASS
GSM93997,9.83023,9.97877,6.35497,6.92793,7.69779,9.40571,3.28228,4.45694,5.4287,7.88733,...,6.00876,5.91865,6.23475,7.44096,5.6784,8.80719,3.99275,5.97948,5.96075,no cancer
GSM94077,9.52652,9.45514,7.20277,6.81656,6.69856,8.49452,3.69789,5.77118,4.13438,5.58063,...,5.39663,7.56681,5.2515,9.25598,6.97001,7.14709,3.81177,4.437,7.75606,no cancer
GSM94078,9.46638,9.49856,5.46486,5.69999,6.6491,9.08392,3.57176,4.33171,4.10273,5.80711,...,4.9469,7.38292,5.26488,8.32976,6.84481,7.26304,3.73244,4.88644,6.74153,no cancer
GSM94079,9.42326,9.54988,6.44251,7.38279,6.87374,8.64206,3.59308,4.94478,5.40378,5.93021,...,5.54001,7.31449,5.15691,8.46424,6.58894,7.81053,3.80185,5.36702,6.75565,no cancer
GSM94080,8.8717,8.69152,6.51238,4.82235,6.20103,7.85745,4.38852,4.42237,4.10417,6.01965,...,5.0619,8.20394,5.86241,9.25034,7.17475,6.5577,4.32888,5.0631,7.75842,no cancer
