In [1]:
import os
import re
from Bio import Entrez
import GEOparse
import gzip
import pandas as pd
import re
import json
import xml.etree.ElementTree as ET
from xml.etree import cElementTree as ElementTree
import matplotlib.pyplot as plt
Entrez.email = 'mondrolan@gmail.com'

In [2]:
nb_retmax = 5000
handle = Entrez.esearch(db="gds", term="androgen receptor AND Homo Sapiens[porgn]", retmax = nb_retmax)
record = Entrez.read(handle)

In [3]:
record.keys()

dict_keys(['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'TranslationStack', 'QueryTranslation'])

In [4]:
def download_entry_GEO(GEO_id, directory): #télécharge Les résumés des entrées de La recherche par mots clés dans GEODataSets
    """"Crée un dossier contenant un dossier par identifiant et au sein de ce dossier
    crée le résumé (.txt) concernant l‘identifiant"""
    for entry in GEO_id:
        handle = Entrez.efetch(db="gds", id=[entry], retmode="text") #gds = banque de données GEO DataSets
        record = handle.read()
        record.encode("utf-8")
        path = directory+entry
        if not os.path.isdir(path):
            os.makedirs(path) #crée Le dossier Liés a L‘identifiant GEO
        fname = path+'/GEOentry'+entry+'.txt'
        with open(fname,'a', encoding="utf-8") as f : #rédaction du fichier texte résumant Les données de GEO datasets
            f.write(record)

In [5]:
dentry_geo = download_entry_GEO(GEO_id = record['IdList'], directory = 'scrapping/entry')

In [None]:
dentry_geo

In [6]:
def build_GEO_GSE(directory): #Lien entre Les identifiants GEO et Les numéros d'accés GSE
    """"Retourne le dictionnaire dict_GEO_GSE qui donne le lien entre GEO_id et GSEXXX.
    Prend en argument le chemin d'accés du dossier contenant les fichiers textes
    contenant les données GEO"""
    files = os.listdir(directory) #donne la Liste des fichiers
    dict_GEO_GSE = {}
    for entry in files:
        if entry != '.':
            fname = directory+'/'+entry+'/GEO'+entry+'.txt'
            # print (fname)
            resume = open(fname, "r", encoding="utf-8")
            all_lines = resume.readlines()
            resume.close()
            for line in all_lines:
                acc_nb = re.search('(GSE\d+)\s+', line) #recherche du numéros d'accés de La série GSE
                if acc_nb != None:
                    acc_nb = acc_nb.group(0)
                    acc = acc_nb.split("\t")
                    dict_GEO_GSE[entry] = acc[0].split()[0] #un unique numéro GSE par GEO_id
    return dict_GEO_GSE

In [7]:
bgeo_gse = build_GEO_GSE(directory = 'scrapping')

In [15]:
bgeo_gse

{'entry1329': 'GSE1561',
 'entry1836': 'GSE3871',
 'entry200000846': 'GSE846',
 'entry200001561': 'GSE1561',
 'entry200003871': 'GSE3871',
 'entry200004027': 'GSE4027',
 'entry200004084': 'GSE4084',
 'entry200004353': 'GSE4353',
 'entry200004399': 'GSE4399',
 'entry200004453': 'GSE4453',
 'entry200004454': 'GSE4454',
 'entry200004456': 'GSE4456',
 'entry200004636': 'GSE4636',
 'entry200005345': 'GSE5345',
 'entry200005850': 'GSE5850',
 'entry200006796': 'GSE6796',
 'entry200006797': 'GSE6797',
 'entry200007585': 'GSE7585',
 'entry200007708': 'GSE7708',
 'entry200007868': 'GSE7868',
 'entry200008402': 'GSE8402',
 'entry200008466': 'GSE8466',
 'entry200008533': 'GSE8533',
 'entry200008534': 'GSE8534',
 'entry200008860': 'GSE8860',
 'entry200009000': 'GSE9000',
 'entry200011155': 'GSE11155',
 'entry200011428': 'GSE11428',
 'entry200011847': 'GSE11847',
 'entry200012200': 'GSE12200',
 'entry200012348': 'GSE12348',
 'entry200012438': 'GSE12438',
 'entry200013332': 'GSE13332',
 'entry2000139

In [8]:
print(len(bgeo_gse.keys()))
print(len(bgeo_gse.values()))

941
941


In [9]:
def build_GSE_GEO(AR_GEO_GSE): #Lien entre Les numéros d'accés GSE et tes identifiants GEO
    """Retourne le dictionnaire dict_GSE_GEO en ayant pour argument le dictionnaire
    de lien entre GEO_id et GSEXXX (build_GEO_GSE())"""
    dict_GSE_GEO = {}
    for ID in AR_GEO_GSE:
        GSE_id = AR_GEO_GSE[ID]
        if GSE_id not in dict_GSE_GEO:
            dict_GSE_GEO[GSE_id] = [ID]
        else:
            dict_GSE_GEO[GSE_id].append(ID)
    return dict_GSE_GEO

In [10]:
bgse_geo = build_GSE_GEO(AR_GEO_GSE = bgeo_gse)

In [11]:
print(len(bgse_geo.keys()))
print(len(bgse_geo.values()), end = '\n')

for key, value in bgse_geo.items():
    print(key, len([item for item in value]))

531
531
GSE1561 2
GSE3871 2
GSE846 2
GSE4027 2
GSE4084 52
GSE4353 1
GSE4399 1
GSE4453 1
GSE4454 1
GSE4456 1
GSE4636 3
GSE5345 1
GSE5850 1
GSE6796 1
GSE6797 1
GSE7585 1
GSE7708 2
GSE7868 2
GSE8402 1
GSE8466 1
GSE8533 1
GSE8534 1
GSE8860 1
GSE9000 1
GSE11155 1
GSE11428 1
GSE11847 1
GSE12200 1
GSE12348 3
GSE12438 1
GSE13332 1
GSE13919 1
GSE14028 1
GSE14043 1
GSE14092 1
GSE14097 1
GSE14464 1
GSE14575 1
GSE15091 1
GSE17044 7
GSE17461 1
GSE17466 16
GSE18022 1
GSE18122 1
GSE18146 1
GSE18338 1
GSE18401 1
GSE18402 1
GSE18684 1
GSE19445 1
GSE19561 2
GSE21034 1
GSE21035 1
GSE21036 1
GSE21245 1
GSE21887 2
GSE22010 1
GSE22076 1
GSE22483 2
GSE22606 2
GSE22914 1
GSE23814 1
GSE23815 1
GSE26483 1
GSE27636 1
GSE27682 1
GSE27823 1
GSE28126 1
GSE28219 1
GSE28264 1
GSE28596 1
GSE28788 1
GSE28857 1
GSE28948 1
GSE28950 1
GSE29232 1
GSE29650 1
GSE30622 1
GSE30623 1
GSE30624 1
GSE31362 1
GSE31410 1
GSE31528 1
GSE31978 1
GSE32345 3
GSE32356 1
GSE32875 1
GSE32892 1
GSE32982 1
GSE34042 1
GSE34589 1
GSE34780 1
GSE

In [12]:
def download_SOFT(AR_GSE_GEO, directory): #télécharge Les fichiers soft pour chacun des GSEXXX
    """Télécharge les fichiers .soft pour chacun des GSEXXX en ayant pour arguments le dictionnaire des
    liens entre GSE et GEO_id (build_GSE_GEO(AR_GEO_GSE)) et le chemin d'accés voulu
    gse_id_fails est une liste qui contient les entrées gse n'ayant pas été téléchargées
    gse_id_success est une liste qui contient les entrées gse téléchargées"""
    gse_id_fails = []
    gse_id_success = []
    
    for gse_id in AR_GSE_GEO.keys():
        path = directory+gse_id
        gse_id = gse_id.strip()
        try:
        # gse_handle = GEOparse.get_GEO(geo = gse_id, destdir = path)
            print (" Downloading {0} \n".format(gse_id))
            gse_handle = GEOparse.get_GEO(geo = gse_id, destdir = path, silent = True)
            gse_id_success.append(gse_id)
        except :
            print (" !!! Download error with {0} \n".format(gse_id))
            gse_id_fails.append(gse_id)
    return (gse_id_success)

In [13]:
soft = download_SOFT(AR_GSE_GEO = bgse_geo, directory = 'softfiles/soft')

 Downloading GSE1561 

 Downloading GSE3871 

 Downloading GSE846 

 Downloading GSE4027 

 Downloading GSE4084 

 Downloading GSE4353 

 Downloading GSE4399 

 Downloading GSE4453 

 Downloading GSE4454 

 Downloading GSE4456 

 Downloading GSE4636 

 Downloading GSE5345 

 Downloading GSE5850 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE6796 

 Downloading GSE6797 

 Downloading GSE7585 

 Downloading GSE7708 

 Downloading GSE7868 

 Downloading GSE8402 

 Downloading GSE8466 

 Downloading GSE8533 

 Downloading GSE8534 

 Downloading GSE8860 

 Downloading GSE9000 

 Downloading GSE11155 

 Downloading GSE11428 

 Downloading GSE11847 

 Downloading GSE12200 

 Downloading GSE12348 

 Downloading GSE12438 

 Downloading GSE13332 

 Downloading GSE13919 

 Downloading GSE14028 

 Downloading GSE14043 

 Downloading GSE14092 

 Downloading GSE14097 

 Downloading GSE14464 

 Downloading GSE14575 

 Downloading GSE15091 

 Downloading GSE17044 

 Downloading GSE17461 

 Downloading GSE17466 

 Downloading GSE18022 

 Downloading GSE18122 

 Downloading GSE18146 

 Downloading GSE18338 

 Downloading GSE18401 

 Downloading GSE18402 

 Downloading GSE18684 

 Downloading GSE19445 

 Downloading GSE19561 

 Downloading GSE21034 

 Downloading GSE21035 

 Downloading GSE21036 

 Downloading GSE21245 

 Do

  table_data = parse_table_data(gpl_soft)


 Downloading GSE45620 

 Downloading GSE45902 

 Downloading GSE47203 

 Downloading GSE47220 

 Downloading GSE47804 

 Downloading GSE47805 

 Downloading GSE47806 

 Downloading GSE47807 

 Downloading GSE47987 

 Downloading GSE48056 

 Downloading GSE48308 

 Downloading GSE48667 

 Downloading GSE49083 

 Downloading GSE49091 

 Downloading GSE49153 

 Downloading GSE49196 

 Downloading GSE49295 

 Downloading GSE49832 

 Downloading GSE50605 

 Downloading GSE50618 

 Downloading GSE50936 

 Downloading GSE51002 

 Downloading GSE51063 

 Downloading GSE51384 

 Downloading GSE51497 

 Downloading GSE51524 

 Downloading GSE52169 

 Downloading GSE52201 

 Downloading GSE52627 

 Downloading GSE53115 

 Downloading GSE54104 

 Downloading GSE54109 

 Downloading GSE54110 

 Downloading GSE54137 

 Downloading GSE54202 

 Downloading GSE54946 

 Downloading GSE54991 

 Downloading GSE55007 

 Downloading GSE55030 

 Downloading GSE55031 

 Downloading GSE55032 

 Downloading GSE

  table_data = parse_table_data(gpl_soft)


 Downloading GSE63693 

 Downloading GSE64656 

 Downloading GSE64885 

 Downloading GSE65066 

 Downloading GSE65478 

 Downloading GSE65562 

 Downloading GSE65738 

 Downloading GSE66037 

 Downloading GSE66187 

 Downloading GSE66722 

 Downloading GSE67537 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE67809 

 Downloading GSE67980 

 Downloading GSE69489 

 Downloading GSE69613 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE69712 

 Downloading GSE69896 

 Downloading GSE70078 

 Downloading GSE70079 

 Downloading GSE70161 

 Downloading GSE70162 

 Downloading GSE70163 

 Downloading GSE70679 

 Downloading GSE71334 

 Downloading GSE71335 

 Downloading GSE71336 

 Downloading GSE71704 

 Downloading GSE72438 

 Downloading GSE72483 

 Downloading GSE72714 

 Downloading GSE72920 

 Downloading GSE73917 

 Downloading GSE73930 

 Downloading GSE73988 

 Downloading GSE73994 

 Downloading GSE73995 

 Downloading GSE74069 

 Downloading GSE75035 

 Downloading GSE76141 

 Downloading GSE76334 

 Downloading GSE76335 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE76336 

 Downloading GSE77770 

 Downloading GSE77771 

 Downloading GSE77883 

 Downloading GSE77928 

 Downloading GSE78201 

 Downloading GSE79356 

 Downloading GSE79357 

 Downloading GSE79402 

 Downloading GSE79689 

 Downloading GSE80256 

 Downloading GSE80450 

 Downloading GSE80452 

 Downloading GSE80741 

 Downloading GSE80742 

 Downloading GSE80743 

 Downloading GSE80979 

 Downloading GSE81859 

 Downloading GSE82179 

 Downloading GSE82202 

 Downloading GSE82223 

 Downloading GSE83649 

 Downloading GSE83652 

 Downloading GSE83860 

 Downloading GSE84432 

 Downloading GSE85541 

 Downloading GSE86245 

 Downloading GSE86454 

 Downloading GSE86457 

 Downloading GSE86503 

 Downloading GSE86547 

 Downloading GSE86978 

 Downloading GSE88752 

 Downloading GSE89226 

 Downloading GSE89917 

 Downloading GSE89938 

 Downloading GSE89939 

 Downloading GSE90922 

 Downloading GSE93603 

 Downloading GSE93845 

 Downloading GSE93928 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE94013 

 Downloading GSE94243 

 Downloading GSE94520 

 Downloading GSE94577 

 Downloading GSE94580 

 Downloading GSE94682 

 Downloading GSE94783 

 Downloading GSE96084 

 Downloading GSE97204 

 Downloading GSE97549 

 Downloading GSE98009 

 Downloading GSE98069 

 Downloading GSE98809 

 Downloading GSE99378 

 Downloading GSE99381 

 Downloading GSE99820 

 Downloading GSE99857 

 Downloading GSE100224 

 Downloading GSE100710 

 Downloading GSE101607 

 Downloading GSE101635 

 Downloading GSE101897 

 Downloading GSE102124 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE102164 

 Downloading GSE102183 

 Downloading GSE103082 

 Downloading GSE103449 

 Downloading GSE103637 

 Downloading GSE104399 

 Downloading GSE104935 

 Downloading GSE105088 

 Downloading GSE106559 

 Downloading GSE106560 

 Downloading GSE107319 

 Downloading GSE107320 

 Downloading GSE107321 

 Downloading GSE107438 

 Downloading GSE108687 

 Downloading GSE109021 

 Downloading GSE109061 

 Downloading GSE109062 

 Downloading GSE109063 

 Downloading GSE109748 

 Downloading GSE109751 

 Downloading GSE109752 

 Downloading GSE109758 

 Downloading GSE109763 

 Downloading GSE110252 

 Downloading GSE110802 

 Downloading GSE110903 

 Downloading GSE110905 

 Downloading GSE113308 

 Downloading GSE114052 

 Downloading GSE114273 

 Downloading GSE114274 

 Downloading GSE114275 

 Downloading GSE114326 

 Downloading GSE114385 

 Downloading GSE114500 

 Downloading GSE114732 

 Downloading GSE115270 

 Downloading GSE116189 

 Downloading GSE116191 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE150807 

 Downloading GSE151064 

 Downloading GSE151083 

 Downloading GSE151113 

 Downloading GSE151429 

 Downloading GSE151492 

 Downloading GSE151581 

 Downloading GSE151619 

 Downloading GSE152230 

 Downloading GSE152246 

 Downloading GSE152254 

 Downloading GSE152315 

 Downloading GSE152318 

 Downloading GSE154632 

 Downloading GSE154633 

 Downloading GSE155272 

 Downloading GSE156218 

 Downloading GSE156223 

 Downloading GSE156280 

 Downloading GSE156881 

 Downloading GSE156882 

 Downloading GSE156884 

 Downloading GSE156885 

 Downloading GSE157104 

 Downloading GSE157105 

 Downloading GSE157106 

 Downloading GSE157107 

 Downloading GSE157862 

 Downloading GSE157974 

 Downloading GSE158218 

 Downloading GSE158556 

 Downloading GSE158557 

 Downloading GSE158593 

 Downloading GSE158598 



  table_data = parse_table_data(gpl_soft)


 Downloading GSE159606 

 Downloading GSE160393 

 Downloading GSE160399 

 Downloading GSE160723 

 Downloading GSE161167 

 Downloading GSE161189 

 Downloading GSE161301 

 Downloading GSE161302 

 Downloading GSE161303 

 Downloading GSE162319 

 Downloading GSE162761 

 Downloading GSE163539 

 Downloading GSE165247 

 Downloading GSE165562 

 Downloading GSE166192 

 Downloading GSE167213 

 Downloading GSE168651 

 Downloading GSE168663 

 Downloading GSE168665 

 Downloading GSE168670 

 Downloading GSE169139 

 Downloading GSE171045 

 Downloading GSE173331 

 Downloading GSE173886 

 Downloading GSE174295 

 Downloading GSE176128 

 Downloading GSE180628 

 Downloading GSE181224 

 Downloading GSE181226 

 Downloading GSE181229 

 Downloading GSE181462 

 Downloading GSE182064 

 Downloading GSE182852 

 Downloading GSE184168 

 Downloading GSE185223 



In [14]:
len(soft)

530

In [16]:
len(soft)

530

In [5]:
def read_file_SOFT(directory): #ouvre le fichier SOFT zippé et rend accessible ses données en chaines de caractéres
    """"Reatourne le fichier soft en liste de chaines de caractére ol chaque élément
    de la liste correspond & une ligne du fichier [‘lignei1', 'ligne2',...]. Prend
    en argument le chemin d'accés du fichier SOFT à analyser"""
    f = gzip.open(directory)
    file_content = f.read()
    file = str(file_content)
    all_lines = file.split('\\n')
    f.close()
    return all_lines

In [6]:
readsolf = read_file_SOFT(directory = "C:/Users/mondr/Downloads/softFiles/softGSE43146/GSE43146_family.soft.gz")
readsolf

["b'^DATABASE = GeoMiame",
 '!Database_name = Gene Expression Omnibus (GEO)',
 '!Database_institute = NCBI NLM NIH',
 '!Database_web_link = http://www.ncbi.nlm.nih.gov/geo',
 '!Database_email = geo@ncbi.nlm.nih.gov',
 '^SERIES = GSE43146',
 '!Series_title = ER\\xce\\xb2 activation by genistein promotes metastatic progression in prostate cancer',
 '!Series_geo_accession = GSE43146',
 '!Series_status = Public on Dec 27 2012',
 '!Series_submission_date = Dec 26 2012',
 '!Series_last_update_date = Jan 23 2019',
 '!Series_summary = Androgen signalling through the androgen receptor (AR) plays a critical role in prostate cancer (PCa) initiation and progression. Estrogen and the estrogen receptor, in synergy with androgen, are essential for cell growth of the normal and malignant prostate. However, the exact role that estrogen plays in prostate carcinogenesis, and the precise mechanisms involved, remain unclear.   We have previously demonstrated the metastasis-promoting effect of an estrogen r

Series_contributor : nombre de contributeurs
Series_sample_id : nombre de d'échantillons
Series_contact_city : ville de contact de la serie 
Series_contact_country : pays de contact de la serie
Series_status : date de publication de la serie

Platform_technology : technologie utilisée
Platform_distribution : mode de distribution
Platform_manufacturer : fabricant
Platform_contact_name : nom de la plateforme
Platform_contact_country : pays de la plateforme
Platform_submission_date = date de publication de la plateforme


In [21]:
import re
pattern=re.compile(r'contri') 
str_match = [x for x in readsolf if re.search('contri', x)]
print(str_match)
print('Series_contributor :', len(str_match))

['!Series_contributor = Hisae,,Nakamura', '!Series_contributor = Yuwei,,Wang', '!Series_contributor = Hui,,Xue', '!Series_contributor = Mark,T,Romanish', '!Series_contributor = Dixie,L,Mager', '!Series_contributor = Cheryl,D,Helgason', '!Series_contributor = Yuzhuo,,Wang']
Series_contributor : 7


In [7]:
def info_sample(all_lines): #récupere Les informations Liees aux echantillons
    """"Retourne le dictionnaire contenant les informations des échantillons
    { "'GSMXxx" : [{titre : données},...], ‘GSMYYY' : [{titre : données},...]}.
    Prend en argument la lecture du fichier a analyser (fonction read_file_SOFT())"""
    dict_sample = {} #initialise Les dictionnaires qui vont contenir les infos {numéros d'accés : [infos]}
    l_sample = [] #initialise les Listes [infos] qui vont contenir Les dictionnaires {titre_info : données}
    entity_id = None #initialise L'identifiant de L'entité
    state = 'STATE_START' #initialise l'état du systéme
    kvp = re.compile(r'\!(.*) = (.*)')
    entity = re.compile(r'\^(.*) = (.*)')
    for line in all_lines: #on parcoure Les différentes Lignes du fichier
        if line.startswith('^'): #identifie La sous-structure associée a une entité (SERIES, PLATFORM ou SAMPLE)
            m = entity.match(line) #recherche L'expression réguliére
            if m:
                entity_type = m.group(1) #récupére Le titre de L’entité (SERIES, PLATFORM ou SAMPLE)
                entity_id = m.group(2) #récupere Le numéros d’accés associé @ L‘entité (GSEXXX, GPLXXX Ou GSMXXX)
                if entity_type == 'SAMPLE': #identifie le type SAMPLE
                    state = 'STATE_IN_SAMPLE'
                    sample_id = entity_id
                    if sample_id not in dict_sample: #crée une nouvelle clé pour un autre identifiant s'il y en a plusieurs
                        l_sample = []
                        dict_sample[sample_id] = l_sample
                        title = []
            else:
                state = 'STATE_START'
        elif line.startswith('!'): #identifie les Lignes correspondant aux données
            m = kvp.match(line) #recherche l’expression régulière
            if m:
                key = m.group(1) #récupére le titre de L'information de la ligne (ex : Serie_...)
                value = m.group(2) #récupére les données associées au titre
                if state == 'STATE_IN_SAMPLE': #lie les données Sample_... associées a L’entité SAMPLE et Les groupe dict_sample
                    if key not in title: #vérifie qu'il n'y a pas deux fois une clé avec Le méme nom
                        cpt = 1
                        title.append(key)
                        dict_sample[sample_id].append({key : value})
                    else:
                        cpt = cpt +1
                        topic = key + "_{0}".format(cpt) #renomme La clé si elle existe déja
                        dict_sample[sample_id].append({topic : value})
    return dict_sample

In [8]:
info  = info_sample(readsolf)

In [23]:
info

{'GSM1057363': [{'Sample_title': 'LTL313h_control_rep1'},
  {'Sample_geo_accession': 'GSM1057363'},
  {'Sample_status': 'Public on Dec 27 2012'},
  {'Sample_submission_date': 'Dec 26 2012'},
  {'Sample_last_update_date': 'Feb 05 2013'},
  {'Sample_type': 'RNA'},
  {'Sample_channel_count': '1'},
  {'Sample_source_name_ch1': 'LTL313 prostate cancer xenograft, control (not-treated)'},
  {'Sample_organism_ch1': 'Homo sapiens'},
  {'Sample_taxid_ch1': '9606'},
  {'Sample_characteristics_ch1': 'tissue: LTL313 prostate cancer xenograft'},
  {'Sample_characteristics_ch1_2': 'host: NOD-SCID male mice'},
  {'Sample_growth_protocol_ch1': 'The establishment of two distinct human tumour lines,  LTL313, has been described previously (Andersen RJ,Mawji NR, et al. 2010, Watahiki A,Wang Y, et al. 2011, Hu P,Chu GC, et al. 2011).Briefly, they were derived from primary biopsy PCa specimen, which was grafted under the renal capsule of NOD-SCID male mice, where the tumour tissue receives sufficient amounts

In [25]:
info['GSM1057364']

[{'Sample_title': 'LTL313h_control_rep2'},
 {'Sample_geo_accession': 'GSM1057364'},
 {'Sample_status': 'Public on Dec 27 2012'},
 {'Sample_submission_date': 'Dec 26 2012'},
 {'Sample_last_update_date': 'Feb 05 2013'},
 {'Sample_type': 'RNA'},
 {'Sample_channel_count': '1'},
 {'Sample_source_name_ch1': 'LTL313 prostate cancer xenograft, control (not-treated)'},
 {'Sample_organism_ch1': 'Homo sapiens'},
 {'Sample_taxid_ch1': '9606'},
 {'Sample_characteristics_ch1': 'tissue: LTL313 prostate cancer xenograft'},
 {'Sample_characteristics_ch1_2': 'host: NOD-SCID male mice'},
 {'Sample_growth_protocol_ch1': 'The establishment of two distinct human tumour lines,  LTL313, has been described previously (Andersen RJ,Mawji NR, et al. 2010, Watahiki A,Wang Y, et al. 2011, Hu P,Chu GC, et al. 2011).Briefly, they were derived from primary biopsy PCa specimen, which was grafted under the renal capsule of NOD-SCID male mice, where the tumour tissue receives sufficient amounts of oxygen and nutrients (W

In [10]:
def table_info(dict_data):
    """Retourne le tableau Pandas contenant les informations contenues dans le dictionnaire en argument.
    Présente en ligne les numéros d’acceés (GSEXXX, GPLXXX ou GSMXXX) et en colonne le titre des lignes
    du fichier SOFT"""
    row = []
    info = []
    data = {}
    for ID in dict_data.keys():
        row.append(ID)
        info.append(dict_data[ID])
    for l_info in info:
        for dict_info in l_info:
            for title in dict_info.keys():
                if title not in data:
                    value = [dict_info[title]]
                    data[title] = value
                else:
                    data[title].append(dict_info[title])
    df = pd.DataFrame(data, index = row)
    return df

In [13]:
df = table_info(dict_data = info)
df 

Unnamed: 0,Sample_title,Sample_geo_accession,Sample_status,Sample_submission_date,Sample_last_update_date,Sample_type,Sample_channel_count,Sample_source_name_ch1,Sample_organism_ch1,Sample_taxid_ch1,...,Sample_platform_id,Sample_contact_name,Sample_contact_institute,Sample_contact_address,Sample_contact_city,Sample_contact_zip/postal_code,Sample_contact_country,Sample_supplementary_file,Sample_series_id,Sample_data_row_count
GSM1057363,LTL313h_control_rep1,GSM1057363,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, control (not...",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057364,LTL313h_control_rep2,GSM1057364,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, control (not...",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057365,LTL313h_control_rep3,GSM1057365,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, control (not...",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057366,LTL313h_genistein_rep1,GSM1057366,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, genistein-tr...",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057367,LTL313h_genistein_rep2,GSM1057367,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, genistein-tr...",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057368,LTL313h_genistein_rep3,GSM1057368,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, genistein-tr...",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057369,LTL313h_ICI_rep1,GSM1057369,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, ICI-treated",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057370,LTL313h_ICI_rep2,GSM1057370,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, ICI-treated",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000
GSM1057371,LTL313h_ICI_rep3,GSM1057371,Public on Dec 27 2012,Dec 26 2012,Feb 05 2013,RNA,1,"LTL313 prostate cancer xenograft, ICI-treated",Homo sapiens,9606,...,GPL6480,"Hisae,,Nakamura",BC Cancer Agency,675 West 10th Ave,Vancouver,V5Z1L3,Canada,NONE,GSE43146,41000


In [18]:
df.columns

Index(['Sample_title', 'Sample_geo_accession', 'Sample_status',
       'Sample_submission_date', 'Sample_last_update_date', 'Sample_type',
       'Sample_channel_count', 'Sample_source_name_ch1', 'Sample_organism_ch1',
       'Sample_taxid_ch1', 'Sample_characteristics_ch1',
       'Sample_characteristics_ch1_2', 'Sample_growth_protocol_ch1',
       'Sample_molecule_ch1', 'Sample_extract_protocol_ch1',
       'Sample_label_ch1', 'Sample_label_protocol_ch1', 'Sample_hyb_protocol',
       'Sample_scan_protocol', 'Sample_description', 'Sample_data_processing',
       'Sample_platform_id', 'Sample_contact_name', 'Sample_contact_institute',
       'Sample_contact_address', 'Sample_contact_city',
       'Sample_contact_zip/postal_code', 'Sample_contact_country',
       'Sample_supplementary_file', 'Sample_series_id',
       'Sample_data_row_count'],
      dtype='object')

In [None]:
pubmed 309 publications