**Author:** Benoît BAILLIF

**Purpose:** Download the raw data required for downstream analysis (visualizations, models, ...)  

**Input:** URLs of the L1000 signature files and their metadata, plus annotation files for the profiled compounds  
**Output:** Downloaded (and, for `.gz` archives, uncompressed) files in the `./data/raw` directory

In [1]:
import requests
import gzip
import shutil
import os

In [2]:
# Root folder for everything this notebook writes to disk.
data_directory = './data/'
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(data_directory, exist_ok=True)

In [3]:
# Sub-folder that receives the downloaded files.
raw_data_directory = data_directory + 'raw/'
# makedirs also creates a missing parent folder, and exist_ok=True makes
# re-running the cell a no-op instead of an error.
os.makedirs(raw_data_directory, exist_ok=True)

In [4]:
def clean_url(url) :
    """Decode the percent-escapes %5F, %2D and %2E found in the download URLs.

    Only these three upper-case escapes are translated (to '_', '-' and '.');
    every other part of the string is returned unchanged.
    """
    escape_table = (('%5F', '_'), ('%2D', '-'), ('%2E', '.'))
    decoded = url
    for escape, character in escape_table :
        decoded = decoded.replace(escape, character)
    return decoded

In [5]:
def download_uncompressed_file(raw_url, save_path='data/raw/', verify=False, timeout=None):
    """Download a file from a URL and, if it is a .gz archive, also unpack it.

    The file name is the last '/'-separated segment of the cleaned URL; when
    that segment contains 'file=' (download-API style URLs), the text after
    it is used instead. A '.gz' download is kept on disk and an uncompressed
    copy, named without the '.gz' suffix, is written next to it.

    Parameters :
    str raw_url : URL to fetch; %5F / %2D / %2E escapes are decoded first
    str save_path : Directory where to save the file. It is created when
        missing, and a trailing path separator is optional.
    bool or str verify : Forwarded to requests.get as `verify`. The default
        (False) keeps the historical behaviour of skipping TLS certificate
        verification; pass True to enable it.
    float or tuple timeout : Forwarded to requests.get as `timeout`. The
        default (None) waits indefinitely, as before.

    Raises :
    requests.HTTPError : when the server answers with an error status
    """
    cleaned_url = clean_url(raw_url)

    filename = cleaned_url.split('/')[-1]
    if 'file=' in filename:
        filename = filename.split('file=')[-1]

    # Join with os.path so that a save_path without a trailing separator
    # still writes *inside* the directory rather than next to it.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    filepath = os.path.join(save_path, filename)

    # NOTE(review): verify defaults to False, i.e. the server certificate is
    # not checked. Kept for backward compatibility; consider passing True.
    with requests.get(cleaned_url, stream=True, verify=verify, timeout=timeout) as r:
        r.raise_for_status()
        with open(filepath, 'wb') as f:
            # Stream in chunks so large files never sit fully in memory.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    if filename.endswith('.gz'):
        uncompressed_filepath = filepath[:-len('.gz')]
        with gzip.open(filepath, 'rb') as f_in, open(uncompressed_filepath, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [6]:
# Base URL of the GSE70138 supplementary-file folder on the NCBI server.
# The listing there shows percent-encoded links such as:
# https://ftp.ncbi.nlm.nih.gov/geo/series/GSE70nnn/GSE70138/suppl/GSE70138%5FBroad%5FLINCS%5Fcell%5Finfo%5F2017%2D04%2D28%2Etxt%2Egz
gse70138_url = (
    "https://ftp.ncbi.nlm.nih.gov"
    "/geo/series/GSE70nnn/GSE70138/suppl/"
)

In [16]:
# GSE70138 cell_info file; the .gz archive is unpacked after download.
gse70138_cell_info_url = f"{gse70138_url}GSE70138_Broad_LINCS_cell_info_2017-04-28.txt.gz"
download_uncompressed_file(raw_url=gse70138_cell_info_url, save_path=raw_data_directory)

In [7]:
# GSE70138 gene_info file; the .gz archive is unpacked after download.
gse70138_gene_info_url = f"{gse70138_url}GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz"
download_uncompressed_file(raw_url=gse70138_gene_info_url, save_path=raw_data_directory)

In [8]:
# GSE70138 pert_info file; the .gz archive is unpacked after download.
gse70138_pert_info_url = f"{gse70138_url}GSE70138_Broad_LINCS_pert_info.txt.gz"
download_uncompressed_file(raw_url=gse70138_pert_info_url, save_path=raw_data_directory)

In [9]:
# GSE70138 sig_info file; the .gz archive is unpacked after download.
gse70138_sig_info_url = f"{gse70138_url}GSE70138_Broad_LINCS_sig_info_2017-03-06.txt.gz"
download_uncompressed_file(raw_url=gse70138_sig_info_url, save_path=raw_data_directory)

In [10]:
# GSE70138 Level5 signature file (.gctx); the .gz archive is unpacked after download.
gse70138_signatures_url = f"{gse70138_url}GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017-03-06.gctx.gz"
download_uncompressed_file(raw_url=gse70138_signatures_url, save_path=raw_data_directory)

In [11]:
# Base URL of the GSE92742 supplementary-file folder on the NCBI server.
# The series directories are written in upper case, matching the GSE70138
# base URL used in this notebook; the lower-case spelling used previously
# only resolves if the server treats paths case-insensitively.
gse92742_url = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE92nnn/GSE92742/suppl/'

In [12]:
# GSE92742 cell_info file; the .gz archive is unpacked after download.
gse92742_cell_info_url = f"{gse92742_url}GSE92742_Broad_LINCS_cell_info.txt.gz"
download_uncompressed_file(raw_url=gse92742_cell_info_url, save_path=raw_data_directory)

In [13]:
# GSE92742 gene_info file; the .gz archive is unpacked after download.
gse92742_gene_info_url = f"{gse92742_url}GSE92742_Broad_LINCS_gene_info.txt.gz"
download_uncompressed_file(raw_url=gse92742_gene_info_url, save_path=raw_data_directory)

In [14]:
# GSE92742 pert_info file; the .gz archive is unpacked after download.
gse92742_pert_info_url = f"{gse92742_url}GSE92742_Broad_LINCS_pert_info.txt.gz"
download_uncompressed_file(raw_url=gse92742_pert_info_url, save_path=raw_data_directory)

In [15]:
# GSE92742 sig_info file; the .gz archive is unpacked after download.
gse92742_sig_info_url = f"{gse92742_url}GSE92742_Broad_LINCS_sig_info.txt.gz"
download_uncompressed_file(raw_url=gse92742_sig_info_url, save_path=raw_data_directory)

In [16]:
# GSE92742 Level5 signature file (.gctx); the .gz archive is unpacked after download.
gse92742_signatures_url = f"{gse92742_url}GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx.gz"
download_uncompressed_file(raw_url=gse92742_signatures_url, save_path=raw_data_directory)

In [6]:
# SQLite database file (pubchem_protein_only.sqlite); it is not a .gz
# archive, so it is saved exactly as downloaded.
pubchem_sqlite_url = (
    "https://cluster.hpcc.ucr.edu/~tbackman/bioassayR/"
    "pubchem_protein_only.sqlite"
)
download_uncompressed_file(raw_url=pubchem_sqlite_url, save_path=raw_data_directory)



In [31]:
# Compound table from the LINCS data portal. The file name is taken from the
# 'file=' query parameter by download_uncompressed_file.
lincs_compound_info_url = "http://lincsportal.ccs.miami.edu/dcic/api/download?path=LINCS_Data/Metadata/Small_Molecules/2018_02_20&file=CompoundTable_LINCS_StandardizedCmpds_LSMIDs.txt"
# Pass the variable defined above; the previous upper-case spelling
# (LINCS_compound_info_url) was never defined and raised a NameError.
download_uncompressed_file(lincs_compound_info_url, raw_data_directory)

In [32]:
# LincsID2FacilityID mapping table from the LINCS data portal. The file name
# is taken from the 'file=' query parameter by download_uncompressed_file.
cmap_to_lincs_mapping_url = (
    "http://lincsportal.ccs.miami.edu/dcic/api/download"
    "?path=LINCS_Data/Metadata/Small_Molecules/2018_02_20"
    "&file=LincsID2FacilityID_LINCS_StandardizedCmpds_LSMIDs.txt"
)
download_uncompressed_file(raw_url=cmap_to_lincs_mapping_url, save_path=raw_data_directory)

In [33]:
# Drug list repurposing_drugs_20180907.txt; plain text, saved as downloaded.
drug_repurposing_hub_url = (
    "https://s3.amazonaws.com/data.clue.io/repurposing/downloads/"
    "repurposing_drugs_20180907.txt"
)
download_uncompressed_file(raw_url=drug_repurposing_hub_url, save_path=raw_data_directory)