In [5]:

import requests
import time
import json

from tqdm import tqdm


# Prerequisite: download the UniProt ID mapping (link below) and save it to data/uniprot_id_mapping.json.
# The gene names in L1000.txt were used as the input to the UniProt ID mapper.

# https://www.uniprot.org/id-mapping/uniprotkb/b9d6a74dbbc83b9228e0bbc46a6e94ca732cb379/overview?facets=reviewed%3Atrue&query=*&fields=accession%2Cid%2Cgene_names


# Read the L1000 gene names: one name per line, trailing whitespace stripped.
with open('../data/L1000.txt', 'r') as genes_fh:
    l1000 = [row.rstrip() for row in genes_fh]

# Map each queried gene name ('from') to the primary accession of the entry it
# was mapped to ('to'); if a name occurs more than once, the last result wins.
with open('../data/uniprot_id_mapping.json', 'r') as mapping_fh:
    gene_to_uniprot = {
        entry['from']: entry['to']['primaryAccession']
        for entry in json.load(mapping_fh)['results']
    }

# Emit one "<gene>\t<accession>" line per L1000 gene, in input order.
# Genes that are absent from the mapping are written with '?' as the accession.
with open('../data/L1000_to_uniprot.txt', 'w') as out_fh:
    out_fh.writelines(
        name + "\t" + gene_to_uniprot.get(name, '?') + "\n"
        for name in l1000
    )



# Download the UniProtKB entry of every mapped L1000 gene and store it as
# ../data/uniprot_data/by_id/<accession>.json (one request per gene).
with open('../data/L1000_to_uniprot.txt', 'r') as mapping_file:

    for line in tqdm(mapping_file):
        l1000_name, uniprotkb_assension = line.rstrip().split('\t')

        # '?' is the placeholder written for genes without a UniProt mapping.
        if uniprotkb_assension == '?':
            continue

        response = requests.get(
            f'https://rest.uniprot.org/uniprotkb/{uniprotkb_assension}.json',
            timeout=30,  # do not let a single stalled connection hang the whole run
        )
        # Stop on HTTP errors instead of saving an error payload as protein data.
        response.raise_for_status()

        # Decode before opening the output file, so that a malformed body
        # cannot leave an empty/truncated JSON file behind.
        entry = response.json()

        # Use a distinct handle name: the mapping file is still being iterated.
        with open(f'../data/uniprot_data/by_id/{uniprotkb_assension}.json', 'w') as out_file:
            json.dump(entry, out_file)

        # Short pause between requests to go easy on the UniProt REST API.
        time.sleep(0.1)


1058it [17:16,  1.02it/s]


In [9]:
# Store the UniProt data in a CSV file with columns (L1000_name, uniprotkb_assension, function_description, subunit_interactions)
import pandas as pd

# Collect one (gene, accession, function text, subunit text) record per mapped
# L1000 gene from the per-accession JSON files, then write them to a CSV file.
uniprot_data = []

# NOTE(review): the download cell reads/writes under '../data/', while this cell
# uses 'data/' — confirm the working directory these paths are relative to.
with open('data/L1000_to_uniprot.txt', 'r') as file:
    for line in file:
        l1000_name, uniprotkb_assension = line.rstrip().split('\t')

        # '?' is the placeholder for genes without a UniProt mapping.
        if uniprotkb_assension == '?':
            continue

        with open(f'data/uniprot_data/by_id/{uniprotkb_assension}.json', 'r') as f:
            protein_data = json.load(f)

        # Reset for every protein: otherwise an entry lacking a FUNCTION or
        # SUBUNIT comment would silently inherit the previous protein's text
        # (or raise NameError if it is the first entry).
        function_description = None
        subunit_interactions = None

        # Entries without any comments are tolerated and yield empty fields.
        for comment in protein_data.get('comments', []):
            if comment['commentType'] == 'FUNCTION':
                function_description = comment['texts'][0]['value']

            if comment['commentType'] == 'SUBUNIT':
                subunit_interactions = comment['texts'][0]['value']

        uniprot_data.append((l1000_name, uniprotkb_assension, function_description, subunit_interactions))

df = pd.DataFrame(uniprot_data, columns=['L1000_name', 'uniprotkb_assension', 'function_description', 'subunit_interactions'])
df.to_csv('data/uniprot_data/uniprot_data.csv', index=False)