### Install necessary packages

In [0]:
!pip install cptac
!pip install uniprot
!pip install xmltodict
!pip3 install tqdm

### Import libraries

In [0]:
# Imports all the standard libraries we will need
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cptac
import cptac.utils as ut
import uniprot
import pprint
import urllib.parse
import urllib.request
import json
import xmltodict
import urllib3
from tqdm import tqdm
import warnings

### Mount drive

In [0]:
# Mount Google Drive under /content/drive so the notebook can read and write
# files that persist between Colab sessions.
from google.colab import drive
drive.mount('/content/drive')

# Project folder on the mounted drive. Later cells build file paths by plain
# string concatenation with this value, so the trailing slash is required.
current_dir = '/content/drive/My Drive/BIO 465 Group Project/'

### Get representative dataset

In [0]:
# Download the CPTAC endometrial dataset and instantiate its accessor object.
cptac.download(dataset="endometrial")
en = cptac.Endometrial()
# CNV table; its column labels are what later cells pass to get_gene_data()
# as gene names.
dat = en.get_CNV()
# Preview the first 10 rows.
dat.head(10)

### Functions

In [0]:
def find_in_dict(dict, key, addtl_key='', addtl_key2=''):
  """Collect values from the first contiguous run of entries whose '@type' is `key`.

  Scanning stops at the first non-matching entry that follows a match, so only
  the first group of adjacent matching entries is considered.

  Args:
    dict: Iterable of mapping entries, each carrying an '@type' key (for
      example a 'dbReference' list parsed by xmltodict). A single mapping is
      also accepted and treated as a one-element list. NOTE: the parameter
      name shadows the builtin `dict`; it is kept so existing callers
      (including keyword callers) keep working.
    key: Value compared against each entry's '@type'.
    addtl_key: Optional key to read from each matching entry. When empty, the
      matching entry itself is collected.
    addtl_key2: Optional key to read from `entry[addtl_key]`. When
      `entry[addtl_key]` is a list, `addtl_key2` is read from every element.

  Returns:
    list: The collected values, in input order. Empty when nothing matches or
    when `dict` is None or empty.
  """
  # Imported locally because the `dict` parameter hides the builtin type here.
  from collections.abc import Mapping

  if dict is None:
    return []
  # A lone child element is parsed to a bare mapping rather than a list of
  # mappings; iterating it directly would walk its keys, so wrap it.
  if isinstance(dict, Mapping):
    dict = [dict]

  found = False
  entries = []
  for d in dict:
    if d['@type'] == key:
      found = True
      if addtl_key == '':
        entries.append(d)
      elif addtl_key2 == '':
        # The former `type(d) is list` branch was unreachable: `d['@type']`
        # above only succeeds when `d` is a mapping.
        entries.append(d[addtl_key])
      else:
        nested = d[addtl_key]
        if type(nested) is list:
          for item in nested:
            entries.append(item[addtl_key2])
        else:
          entries.append(nested[addtl_key2])
    elif found:
      # End of the contiguous run of matching entries.
      break
  return entries

In [0]:
def get_gene_data(gene):
  """Look up a gene name in UniProt and summarise the top-scoring entry.

  Two requests are made: a POSTed query that returns a list of accessions
  (the first line is used), then a GET of that accession's XML record.

  Args:
    gene: Gene name to search for; it is interpolated into the query string
      together with the filters `organism:human` and `reviewed:yes`.

  Returns:
    dict with keys:
      'uniprot'    - the accession taken from the first line of the query result
      'protein'    - entry['protein']['recommendedName']['fullName'] as parsed
      'function'   - '@id' values of the 'Reactome' dbReference entries
      'chromosome' - property '@value's of the 'Proteomes' dbReference entries
    or the string "No data" when the query returns no accession, the entry
    request does not return HTTP 200, or the parsed document has no 'uniprot'
    root.

  Raises:
    urllib.error.URLError: if the initial query request fails.
    KeyError: if the entry lacks one of the expected XML elements.
  """
  # NOTE(review): this looks like the legacy UniProt endpoint and parameter
  # set; confirm it is still served before relying on this function.
  url = 'https://www.uniprot.org/uniprot/'

  params = {
  'from': 'GENENAME',
  'to': 'ACC',
  'format': 'list',
  'query': '{}+organism:human+reviewed:yes'.format(gene),
  'sort': 'score'
  }

  # Passing a body to Request makes urllib send the query as a POST.
  post_body = urllib.parse.urlencode(params).encode('utf-8')
  req = urllib.request.Request(url, post_body)
  with urllib.request.urlopen(req) as f:
    response = f.read()
  this_id = response.decode('utf-8').split('\n')[0].strip()

  # No hit: without this guard the next request would target '<url>.xml' and
  # hand a non-entry response to the XML parser.
  if not this_id:
    return "No data"

  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    http = urllib3.PoolManager()
    response2 = http.request('GET', url + this_id + '.xml')

  # urllib3 does not raise on HTTP error statuses, so check explicitly.
  if response2.status != 200:
    return "No data"

  parsed = xmltodict.parse(response2.data)

  if 'uniprot' not in parsed.keys():
    return "No data"

  entry = parsed['uniprot']['entry']
  db_refs = entry['dbReference']

  # Summary record for this gene.
  this_gene = {
      'uniprot': this_id,
      'protein': entry['protein']['recommendedName']['fullName'],
      'function': find_in_dict(db_refs, 'Reactome', '@id'),
      'chromosome': find_in_dict(db_refs, 'Proteomes', 'property', '@value')
      # Fields that are drafted but not enabled:
      #'location': find_in_dict(entry['comment'], 'subcellular location', 'subcellularLocation', 'location'),
      #'tissue': find_in_dict(entry['comment'], 'tissue specificity', 'text'),
      #'isoforms': find_in_dict(entry['comment'], 'alternative products', 'isoform', 'id')
  }

  return this_gene

In [0]:
# Load the gene cache written by the "Generate file" cell so an interrupted
# run can resume. On the very first run no cache file exists yet, so start
# from an empty dictionary instead of failing with FileNotFoundError.
try:
    with open(current_dir + 'uniprot_dict.json') as json_file:
        all_genes = json.load(json_file)
except FileNotFoundError:
    all_genes = {}

### Generate file

In [0]:
# Resume point: index of the first column that is not in the cache yet.
# Assumes cached genes form a prefix of dat.columns, which holds when the
# cache was produced by this same loop. If every column is cached, start is
# len(dat.columns) and nothing is fetched again.
start = len(dat.columns)
for i in range(len(dat.columns)):
  if dat.columns[i] not in all_genes:
    start = i
    break

loop = tqdm(total=len(dat.columns), position=0, leave=False)
loop.update(start)  # genes already cached count as done

try:
  for i in range(start, len(dat.columns)):
    all_genes[dat.columns[i]] = get_gene_data(dat.columns[i])

    # Periodic checkpoint so a long run can be resumed after an interruption.
    if i % 100 == 0:
      with open(current_dir + 'uniprot_dict.json', 'w') as f:
        json.dump(all_genes, f)

    loop.set_description('gene:{}'.format(dat.columns[i]))
    loop.update(1)
finally:
  # Always persist what has been fetched: this covers the genes after the
  # last checkpoint and any progress made before an exception.
  with open(current_dir + 'uniprot_dict.json', 'w') as f:
    json.dump(all_genes, f)
  loop.close()
