# Test Python functionality

In [1]:
print('hey', 9+6)
import os
print(os.uname())

hey 15
posix.uname_result(sysname='Linux', nodename='cn3121', release='3.10.0-862.14.4.el7.x86_64', version='#1 SMP Wed Sep 26 15:12:11 UTC 2018', machine='x86_64')


# Main workflow

## (1) Set the project directory and load the main Python library

In [1]:
project_dir = '/data/BIDS-HPC/private/projects/dmi'
import os, sys
checkout_dir = os.path.join(project_dir, 'checkout')
if checkout_dir not in sys.path:
    sys.path.append(checkout_dir)
import target_class_lib as tc

## (2) Generate or read in the initial lookup lists that can be calculated by querying the Ensembl REST API

In [20]:
symbol_lookup_list, id_lookup_list = tc.get_initial_lookup_lists(project_dir)

Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/initial_lookup_lists.pkl...


## (3) Go through symbol_lookup_list, split into chunks all the names that don't have Ensembl IDs, and try to determine them if the pickle file corresponding to each chunk doesn't yet exist

In [47]:
# Takes about 20 hours if the pickle files don't already exist
missing_lookup_chunks = tc.get_missing_lookups_in_chunks(symbol_lookup_list, project_dir, chunk_size=1000, pickle_dir_single='missing_lookup_lists')

Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_000.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_001.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_002.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_003.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_004.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_005.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_006.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup_list_007.pkl...
Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists/missing_lookup

## (4) From the two initial lookup lists and the set of chunks of "missing" lookups, create the full lookup table as a Pandas dataframe

In [48]:
# Should be 37,719 null in the 128,610 total
data_lookup = tc.get_data_lookup_table(id_lookup_list, symbol_lookup_list, missing_lookup_chunks)

Out of 128610 entries in the initial lookup table, 40233 are null
2514 entries have been added to the lookup table, so now 37719 are null


## (5) Create a lookup table using the output from the Biomart website

In [6]:
# Should be 77,912 of these
hgnc_lookup = tc.get_hgnc_lookup_table(project_dir)

## (6) Incorporate some additional synonyms into the data lookup table using the HGNC database

In [7]:
data_lookup = tc.incorporate_hgnc_lookups(data_lookup, hgnc_lookup)

There are 79 names in the data lookup table lacking Ensembl IDs that DO have Ensembl IDs in the HGNC lookup table
However, not all of the corresponding Ensembl IDs in the HGNC lookup table are necessarily in the Ensembl database; that's what we're about to find out
{'ENSG00000186354', 'ENSG00000198384', 'ENSG00000234449', 'PWAR6', 'MT1IP', 'ATP8A2P1', 'ENSG00000266960', 'ENSG00000260760', 'BRD7P3', 'ENSG00000204365', 'ENSG00000233864', 'ENSG00000224813', 'ENSG00000256927', 'TTTY16', 'LINC00904', 'ENSG00000212290', 'ENSG00000240800', 'PGBD3', 'TOP1P1', 'ENSG00000254671', 'ENSG00000256304', 'ENSG00000241670', 'CBX3P2', 'ENSG00000273585', 'ENSG00000236850', 'ENSG00000233265', 'SIK3-IT1', 'ENSG00000223111', 'OR4A14P', 'ENSG00000248686', 'ENSG00000220548', 'MRS2P2', 'ENSG00000236269', 'NPPA-AS1', 'CDR1', 'ENSG00000254962', 'ENSG00000205850', 'ENSG00000221972', 'ENSG00000243251', 'ENSG00000223929', 'ENSG00000265592', 'ENSG00000279355', 'TPTE2P3', 'SKP1P2', 'ENSG00000251317', 'ENSG00000270024

# Supplementary lookup method and exploration

In [2]:
# Import relevant module
import pandas as pd

# Create an empty dataframe to hold the lookup data
lookup = pd.DataFrame(columns=['id', 'id_version', 'biotype', 'tsv_file'])

# Potential ways (using add_tsv_file_to_lookup()) to add to the lookup table using the GTF (and, for the xref files, TSV) files from the Ensembl FTP site after processing them in Bash to better-formatted TSV files

# These TSV files were generated using the make_tsv_files_for_lookup_table() function in the .sh library
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_38.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-entrez.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-refseq.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-ena.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-uniprot.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_37.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_38-80.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_37-ena.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_37-uniprot.tsv')

# # After downloading the corresponding GTF files en masse using a Bash for loop, these files were generated using the process_all_gtf_files() .sh library function and the code below was generated using the write_python_lines() function
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_37-82.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_37-85.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_37-87.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-100.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-76.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-77.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-78.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-79.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-80.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-81.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-82.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-83.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-84.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-85.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-86.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-87.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-88.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-89.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-90.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-91.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-92.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-93.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-94.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-95.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-96.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-97.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-98.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-99.tsv')

# Read in the list of unique gene names generated using Bash from the 2261 datafiles (this should result in a set of size 128,612)
unique_gene_names = pd.read_csv('/data/BIDS-HPC/private/projects/dmi/data/unique_best_gene_names.txt', names=['name'])

# Identify whatever unique_gene_names that we can using the lookup table (look at the size of remainder to see how many names we still need to identify)
remainder = set(unique_gene_names['name']) - set(lookup.index)

# Now comparing the two lookup methods a bit

# This says that I'm starting with the same 128,610 unknowns
set([x[0] for x in symbol_lookup_list]+[x[0] for x in id_lookup_list]) == set(unique_gene_names['name'])

# Now asking: Which ones are identified?
# Note this does not include the names identified using xref or the HGNC lookup table
identified_before = []
for x in symbol_lookup_list+id_lookup_list:
    if x[1] is not None:
        identified_before.append(x[0])
identified_before = set(identified_before)

# known = set(['a','b','c','d','e'])
# test = set(['b','f','d','g'])
known = set(lookup.index)
test = set(unique_gene_names['name'])
len(known) - len(known-test) # this many are known
identified_after = known - (known-test) # these ones of the test set are known
# This code just confirms the way we just calculated identified_after above
# identified_after = []
# for item in test:
#     if item in known:
#         identified_after.append(item)

# See which ones were identified using the old method but not the new one
identified_before - identified_after

# test2 is the set of all identified names using the old method, depending on how many Main cells were actually run, less the set of the identified names using the old method excluding the xref or HGNC lookup steps
# Basically, test2 tells us what other names were identified after the initial POST methods were run using the REST API
test2 = set(data_lookup['id'][data_lookup['id'].notnull()].index) - identified_before # if these are all already identified by the new lookup method, then I can trust that xref is practically just looking at old gene names, not other DBs

# known gives us the set of names identified using the new method (using the lookup variable)
known = set(lookup.index)

# This tells us how many of the names in test2 we can identify using the new lookup table lookup
len(known) - len(known-test2) # this many are known... well it's 1819 not 2514, so xref is doing more than just looking at old gene names probably, unless I don't have all old names in my dataset still, i.e., PIs are using still older gene names

# Scratch

In [32]:
import json
import pandas as pd

with open('/data/BIDS-HPC/private/projects/dmi/data/metadata.json') as f:
    metadata = json.load(f)

dfs = []
for tsv_file in metadata['filedata']['tsv_files']:
    dfs.append(pd.read_csv(tsv_file, sep='\t'))

In [122]:
sums = []
for df in dfs:
    #sums.append(sum([ 1 if '|' in x else 0 for x in df['name'] ])) # This shows that there are no '|' characters in any name in any of the files
    if 'locus' in df.keys():# This shows that there are no '|' characters in any 'locus' field in any of the files that contain such fields
        sums.append(sum([ 1 if '|' in x else 0 for x in df['locus'] ]))
sum(sums)

0

In [121]:
len(sums)

174

0

In [163]:
unique_names = []
for df in dfs:
    unique_names.append(len(set(df['name'])) == len(df))

In [148]:
df = dfs[1022]
tmp = df['name'].value_counts()!=1
for dup_name in tmp[tmp].index:
    #print(df[df['name']==dup_name]['fpkm'].to_list())
    df_tmp = df[df['name']==dup_name][['name','locus']]
    df_tmp['name'] = df_tmp.apply(lambda x: '|'.join(x), axis=1)
    #df = df.replace(df_tmp['name'])
    df.loc[df_tmp['name'].index,'name'] = df_tmp['name']
    print(df_tmp['name'])
    print('-------------------')

In [156]:
#df = dfs[1022]
len(dfs[1022]['name'].unique())

61968

In [165]:
for iname, unique_name in enumerate(unique_names):
    #if (not unique_name) and ('locus' in dfs[iname].keys()):
    if (not unique_name):
        print(iname)
        # df = dfs[iname]
        # dupes = df['name'].value_counts()!=1
        # for dup_name in dupes[dupes].index:
        #     df_tmp = df[df['name']==dup_name][['name','locus']]
        #     df_tmp['name'] = df_tmp.apply(lambda x: '|'.join(x), axis=1)
        #     df.loc[df_tmp['name'].index,'name'] = df_tmp['name']

279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528


In [171]:
df = dfs[405]
df['name'].value_counts()

C2ORF15       2
C6ORF165      2
CANX          1
COL19A1       1
AC112715.2    1
             ..
HADHA         1
LMO7          1
PCDHB8        1
ANKFN1        1
PLD5          1
Name: name, Length: 19462, dtype: int64

In [97]:
print(df[df['name']==dup_name])

name  fpkm            name2 name3 class_code nearest_ref_id  \
61738  ENSG00000237333   0.0  ENSG00000237333  MSH5          -              -   
61752  ENSG00000237333   0.0  ENSG00000237333  MSH5          -              -   

      tss_id                              locus length coverage  fpkm_conf_lo  \
61738      -  HSCHR6_MHC_SSTO:31697727-31699023      -        -           0.0   
61752      -  HSCHR6_MHC_SSTO:31714323-31720408      -        -           0.0   

       fpkm_conf_hi fpkm_status  
61738           0.0          OK  
61752           0.0          OK  


In [94]:
print(df[df['name']=='ENSG00000269324']['fpkm'].to_list())

[0.0, 0.0, 0.0, 0.0]


In [166]:
for ifile, unique_name in enumerate(unique_names):
    if not unique_name:
        print(ifile, unique_name, metadata['filedata']['filenames'][ifile])

BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASYAJ-09A-01R.expression.txt
405 False /data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASYCN-09A-01R.expression.txt
406 False /data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASYHN-09A-01R.expression.txt
407 False /data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASYIS-09A-01R.expression.txt
408 False /data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASYSJ-09A-01R.expression.txt
409 False /data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASYWF-03A-01R.expression.txt
410 False /data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase2/L3/expression/StJude/TARGET-10-PASZEW-09A-01R.expression.txt
411 Fals

In [40]:
metadata['filedata']['filenames'][0]

'/data/BIDS-HPC/private/projects/dmi/data/tree/Public/ALL/mRNA-seq/Phase1/L3/expression/BCCA/HS0825.gene.quantification.txt'

In [42]:
dfs[0]
# for df in dfs:
#     print(len(df))

Unnamed: 0,name,rpkm,name2,raw_counts,median_length_normalized
0,ENSG00000000457,3.586679,SCYL3,513,6.500000
1,ENSG00000000460,2.103487,C1ORF112,468,3.812060
2,ENSG00000000938,1.759050,FGR,230,3.187850
3,ENSG00000000971,0.067245,CFH,21,0.121865
4,ENSG00000001460,0.170594,C1ORF201,55,0.309160
...,...,...,...,...,...
51732,ENSG00000252766,1.981214,U6,8,3.590480
51733,ENSG00000252855,0.000000,AC134878.2,0,0.000000
51734,ENSG00000252900,0.000000,U6,0,0.000000
51735,ENSG00000252939,0.000000,AC006156.6,0,0.000000


In [10]:
metadata['metadata']

{'base_url': 'https://target-data.nci.nih.gov/',
 'datadir': '/data/BIDS-HPC/private/projects/dmi/data/tree/',
 'file_index': '/data/BIDS-HPC/private/projects/dmi/data/all_files_in_tree.txt',
 'working_dir': '/home/weismanal/notebook/2020-05-28/dmi',
 'ndatafiles': 2261}

In [13]:
metadata['filedata'].keys()

dict_keys(['filenames', 'weblinks', 'idatafiles', 'tsv_files', 'format_nums'])

In [36]:
len(dfs)

2261

In [30]:
metadata['filedata']['tsv_files']

.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000737.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000738.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000739.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000740.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000741.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000742.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000743.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000744.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000745.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000746.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000747.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000748.tsv',
 '/data/BIDS-HPC/private/projects/dmi/data/tsv_files/tsv_file_0000749.tsv',
 '/da

In [84]:
import pandas as pd
lookup = pd.DataFrame(columns=['id', 'id_version', 'biotype', 'tsv_file'])

# These TSV files were generated using the make_tsv_files_for_lookup_table() function in the .sh library
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_38.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-entrez.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-refseq.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-ena.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_38-uniprot.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_37.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_38-80.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_37-ena.tsv')
# lookup = tc.add_tsv_file_to_lookup(lookup, '/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/xref_37-uniprot.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_37-82.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_37-85.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_37-87.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-100.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-76.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-77.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-78.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-79.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-80.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-81.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-82.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-83.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-84.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-85.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-86.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-87.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-88.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-89.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-90.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-91.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-92.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-93.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-94.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-95.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-96.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-97.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-98.tsv')
lookup = tc.add_tsv_file_to_lookup(lookup, '/home/weismanal/notebook/2020-06-01/dmi/processed/main_38-99.tsv')

import pandas as pd
unique_gene_names = pd.read_csv('/data/BIDS-HPC/private/projects/dmi/data/unique_best_gene_names_uppercase.txt', names=['name'])
remainder = set(unique_gene_names['name']) - set(lookup.index)

In [50]:
#set([x[0] for x in symbol_lookup_list]+[x[0] for x in id_lookup_list]) == set(unique_gene_names['name']) # this says that I'm starting with the same 128,610 unknowns
# Now: Which ones are identified?
identified_before = []
for x in symbol_lookup_list+id_lookup_list:
    if x[1] is not None:
        identified_before.append(x[0])
identified_before = set(identified_before)


# known = set(['a','b','c','d','e'])
# test = set(['b','f','d','g'])
known = set(lookup.index)
test = set(unique_gene_names['name'])
len(known) - len(known-test) # this many are known
identified_after = known - (known-test) # these ones of the test set are known
# identified_after = []
# for item in test:
#     if item in known:
#         identified_after.append(item)
# set(identified_after) == x

#identified_before - identified_after

In [85]:
test2 = set(data_lookup['id'][data_lookup['id'].notnull()].index) - identified_before # if these are all already identified, then I can trust that xref is practically just looking at old gene names, not other DBs
known = set(lookup.index)
len(known) - len(known-test2) # this many are known... well it's 1819 not 2514, so xref is doing more than just looking at old gene names probably, unless I don't have all old names in my dataset still, i.e., PIs are using still older gene names

# Next up: Run xref on the 15240 remainder gene names; see whether the old (in Main part of notebook) or the new (in Scratch part) methods are more comprehensive and choose that (or modified version of) which is the most comprehensive as possible, write it up, and move on!

1819

In [3]:
len(set(unique_gene_names['name']))

128610

In [19]:
set(lookup.index)

{'IGKV2D-19',
 'ENSG00000033011',
 'ENSG00000272494',
 'LINC01204',
 'PTDSS2',
 'ENSG00000285628',
 'ENSG00000168757',
 'TTTY5',
 'HNRNPA1P32',
 'PTCRA',
 'PSMD10',
 'ENSG00000266933',
 'ENSG00000225647',
 'ENSG00000245970',
 'ENSG00000280139',
 'ENSG00000110492',
 'PHF21B',
 'ENSG00000252218',
 'ENSG00000235029',
 'ENSG00000261642',
 'SLC46A1',
 'ENSG00000284704',
 'SPON2',
 'ING3',
 'ENSG00000254383',
 'MYO19',
 'ENSG00000260032',
 'ENSG00000268199',
 'ENSG00000222033',
 'ENSG00000231095',
 'ENSG00000272137',
 'ENSG00000224155',
 'ENSG00000224622',
 'ENSG00000287569',
 'ENSG00000160256',
 'ENSG00000274923',
 'ENSG00000234546',
 'ENSG00000228960',
 'ENSG00000280025',
 'ATP6V1FNB',
 'EEF1AKMT2',
 'AC010745.4',
 'ENSG00000211699',
 'ENSG00000222791',
 'ENSG00000276916',
 'ENSG00000266530',
 'ENSG00000127774',
 'ASAP1-IT2',
 'ENSG00000200029',
 'WIZ',
 'ENPP7P9',
 'ENSG00000225509',
 'ENSG00000197769',
 'ENSG00000237992',
 'ENSG00000204437',
 'AC009220.2',
 'RPSAP5',
 'AL353593.2',
 'ENS

In [43]:
lookup

Unnamed: 0,id,id_version,biotype,tsv_file
ENSG00000223972,ENSG00000223972,5,transcribed_unprocessed_pseudogene,/data/BIDS-HPC/private/projects/dmi/data/proce...
ENSG00000227232,ENSG00000227232,5,unprocessed_pseudogene,/data/BIDS-HPC/private/projects/dmi/data/proce...
ENSG00000278267,ENSG00000278267,1,miRNA,/data/BIDS-HPC/private/projects/dmi/data/proce...
ENSG00000243485,ENSG00000243485,5,lncRNA,/data/BIDS-HPC/private/projects/dmi/data/proce...
ENSG00000284332,ENSG00000284332,1,miRNA,/data/BIDS-HPC/private/projects/dmi/data/proce...
...,...,...,...,...
CU442762.4,ENSG00000271726,1,miRNA,/data/BIDS-HPC/private/projects/dmi/data/proce...
AC002321.2,ENSG00000215616,3,protein_coding,/data/BIDS-HPC/private/projects/dmi/data/proce...
AC002321.1,ENSG00000215611,3,protein_coding,/data/BIDS-HPC/private/projects/dmi/data/proce...
CT867977.1,ENSG00000265557,1,miRNA,/data/BIDS-HPC/private/projects/dmi/data/proce...


In [56]:
'biotypes' in lookup.keys()

False

In [6]:
import pandas as pd
lookup = pd.Series(name='id')

df = pd.read_csv('/data/BIDS-HPC/private/projects/dmi/data/processed_files_for_lookup/main_38.tsv', sep='\t')
id_list = df['id'].apply(lambda x: x.upper()).to_list()
name_list = df['name'].apply(lambda x: x.upper()).to_list()
lookup = pd.concat([lookup, pd.Series(index=id_list, data=id_list, name='id')])
lookup = pd.concat([lookup, pd.Series(index=name_list, data=id_list, name='id')])

len(lookup.index.unique())

121333

In [None]:
import pandas
'/data/BIDS-HPC/private/projects/dmi/data/ensembl_ftp_site/grch38/Homo_sapiens.GRCh38.100.gtf'

In [10]:
import json
with open('/data/BIDS-HPC/private/projects/dmi/data/metadata.json') as f:
    x = json.load(f)

In [29]:
x['filedata'].keys()
#x['filedata']['format_nums']

dict_keys(['filenames', 'weblinks', 'idatafiles', 'tsv_files', 'format_nums'])

In [9]:
sum(data_lookup['id'].isnull())

37708

In [18]:
with open('/home/weismanal/notebook/2020-05-22/dmi/data_lookup.html', 'w') as f:
    f.write(data_lookup.sort_values(by='id', axis='index').to_html())


In [15]:
len(data_lookup) - data_lookup.isnull().sum()

id    90902
dtype: int64

In [29]:
symbol_list == id_list2

True

In [84]:
pairs_to_test = []
for name in intxn:
    if name != hgnc_lookup.loc[name,'id']:
        #print(name)
        pairs_to_test.append([hgnc_lookup.loc[name,'id'], name])
#pairs_to_test
id_list2 = [x[0] for x in pairs_to_test]

# #print(tc.ensembl_name_request('id', id_list2))
# print(tc.ensembl_name_request('symbol', id_list2))
# for name in id_list2:
#     print(tc.ensembl_name_request('xref', [name]))

iiter = 0
lookup_list = []
nidentified = 0
#lookup_list, result_processed, iiter, nidentified = tc.run_and_process_query('symbol', id_list2, lookup_list, iiter, nidentified) # returned nothing as expected
lookup_list, result_processed, iiter, nidentified = tc.run_and_process_query('id', id_list2, lookup_list, iiter, nidentified) # returned 11 as expected
# for name in id_list2: # returned the same 11 as expected
#     lookup_list, result_processed, iiter, nidentified = tc.run_and_process_query('xref', [name], lookup_list, iiter, nidentified)
#lookup_list
tested_dict = dict(pairs_to_test)
print(tested_dict)

lookups_to_add = []
for item in lookup_list:
    if item[1] is not None:
        lookups_to_add.append([tested_dict[item[1]], item[1]])
lookups_to_add

symbols = [x[0] for x in lookups_to_add] # these don't show up in Ensembl
ids = [x[1] for x in lookups_to_add] # these do

print(data_lookup.loc[symbols,'id'])

data_lookup.loc[symbols,'id'] = ids

print(data_lookup.loc[symbols,'id'])

print('We added {} lookups to the data lookup table using the HGNC lookup table; now the number of null entries in the data lookup table is {}'.format(len(lookups_to_add), data_lookup.isnull()['id'].sum()))

counts = hgnc_lookup.loc[:,'id'].value_counts()
duplicates = list(counts.index[counts!=2])
data_lookup.loc[duplicates,'id']

[1] Given 28 names, we identified 11 of them
{'ENSG00000198384': 'TPTE2P3', 'ENSG00000240800': 'ATP8A2P1', 'ENSG00000257151': 'PWAR6', 'ENSG00000242349': 'NPPA-AS1', 'ENSG00000285665': 'TOP1P1', 'ENSG00000236269': 'ENO1-IT1', 'ENSG00000236850': 'BMS1P20', 'ENSG00000237456': 'ID2B', 'ENSG00000275691': 'MT1IP', 'ENSG00000257802': 'MRS2P2', 'ENSG00000231865': 'SIK3-IT1', 'ENSG00000180042': 'OR1R1P', 'ENSG00000169075': 'BRD7P3', 'ENSG00000233265': 'MICF', 'ENSG00000231643': 'YWHAQP8', 'ENSG00000271171': 'LINC00904', 'ENSG00000212290': 'RNA5SP424', 'ENSG00000243453': 'COX7BP1', 'ENSG00000233864': 'TTTY15', 'ENSG00000204365': 'C10ORF126', 'ENSG00000266405': 'CBX3P2', 'ENSG00000256211': 'SKP1P2', 'ENSG00000225520': 'TTTY16', 'ENSG00000184258': 'CDR1', 'ENSG00000279355': 'AGPAT4-IT1', 'ENSG00000254962': 'OR4A14P', 'ENSG00000270441': 'LAMB2P1', 'ENSG00000243251': 'PGBD3'}
name
ID2B         ENSG00000237456
LAMB2P1      ENSG00000270441
LINC00904    ENSG00000271171
SKP1P2       ENSG00000256211
BRD

In [94]:
counts = hgnc_lookup.loc[:,'id'].value_counts()
duplicates = list(counts.index[counts!=2])
data_lookup.loc[duplicates,'id']

name
ENSG00000230417    ENSG00000230417
ENSG00000255374    ENSG00000255374
ENSG00000230426    ENSG00000230426
Name: id, dtype: object

In [98]:
set(ids) & set(duplicates)

set()

In [76]:
#print(symbols)
#print(ids)
tc.ensembl_name_request('id', ids)

{'ENSG00000271171': {'object_type': 'Gene',
  'seq_region_name': '19',
  'version': 3,
  'strand': -1,
  'description': 'novel transcript',
  'db_type': 'core',
  'display_name': 'AC008555.5',
  'biotype': 'lncRNA',
  'assembly_name': 'GRCh38',
  'start': 34891605,
  'logic_name': 'havana_homo_sapiens',
  'source': 'havana',
  'species': 'homo_sapiens',
  'end': 34905139,
  'id': 'ENSG00000271171'},
 'ENSG00000237456': {'object_type': 'Gene',
  'version': 3,
  'seq_region_name': '3',
  'strand': -1,
  'description': 'inhibitor of DNA binding 2, dominant negative helix-loop-helix protein (ID2) pseudogene',
  'display_name': 'AC104849.1',
  'db_type': 'core',
  'biotype': 'processed_pseudogene',
  'assembly_name': 'GRCh38',
  'start': 62124204,
  'source': 'havana',
  'logic_name': 'havana_homo_sapiens',
  'species': 'homo_sapiens',
  'end': 62124608,
  'id': 'ENSG00000237456'},
 'ENSG00000270441': {'strand': -1,
  'seq_region_name': '3',
  'version': 1,
  'object_type': 'Gene',
  'descr

In [58]:
mylist.sort() == lookup_list.sort()

True

In [54]:
lookup_list[:len(id_list2)].sort() == lookup_list[len(id_list2):].sort()
mylist = lookup_list[len(id_list2):]

In [21]:
import re
ensembl_format = re.compile('^ENSG[0-9]{11}$')
id_list = []
symbol_list = []
for name in intxn:
    if ensembl_format.match(name) is not None:
        id_list.append(name)
    else:
        symbol_list.append(name)
#symbol_list
#print(symbol_list)
print(tc.ensembl_name_request('id', id_list))
print(tc.ensembl_name_request('symbol', symbol_list))
for name in intxn:
    print(tc.ensembl_name_request('xref', [name]))

{'ENSG00000233864': None, 'ENSG00000133808': None, 'ENSG00000205663': None, 'ENSG00000253264': None, 'ENSG00000186354': None, 'ENSG00000257151': None, 'ENSG00000236850': None, 'ENSG00000265119': None, 'ENSG00000256927': None, 'ENSG00000231865': None, 'ENSG00000223111': None, 'ENSG00000236269': None, 'ENSG00000224813': None, 'ENSG00000225520': None, 'ENSG00000220548': None, 'ENSG00000233265': None, 'ENSG00000241670': None, 'ENSG00000184258': None, 'ENSG00000223119': None, 'ENSG00000248686': None, 'ENSG00000251317': None, 'ENSG00000204365': None, 'ENSG00000283073': None, 'ENSG00000254671': None, 'ENSG00000270024': None, 'ENSG00000254462': None, 'ENSG00000279355': None, 'ENSG00000260760': None, 'ENSG00000221972': None, 'ENSG00000256763': None, 'ENSG00000234449': None, 'ENSG00000266960': None, 'ENSG00000242349': None, 'ENSG00000223929': None, 'ENSG00000243453': None, 'ENSG00000273585': None, 'ENSG00000250003': None, 'ENSG00000205662': None, 'ENSG00000213865': None, 'ENSG00000270898': None,

In [12]:
names = []
for name in intxn:
    print(hgnc_lookup.loc[name,'id'], data_lookup.loc[name,'id'])
    names.append(hgnc_lookup.loc[name,'id'])
tc.ensembl_name_request('id', names)

ENSG00000240800 None
ENSG00000223119 None
ENSG00000233864 None
ENSG00000269721 None
ENSG00000256927 None
ENSG00000251317 None
ENSG00000198384 None
ENSG00000240800 None
ENSG00000242349 None
ENSG00000257151 None
ENSG00000198384 None
ENSG00000221972 None
ENSG00000243453 None
ENSG00000242349 None
ENSG00000243251 None
ENSG00000223111 None
ENSG00000236850 None
ENSG00000285665 None
ENSG00000205850 None
ENSG00000236269 None
ENSG00000270898 None
ENSG00000205663 None
ENSG00000256304 None
ENSG00000235719 None
ENSG00000253264 None
ENSG00000236850 None
ENSG00000237456 None
ENSG00000256763 None
ENSG00000275691 None
ENSG00000257802 None
ENSG00000231865 None
ENSG00000212290 None
ENSG00000180042 None
ENSG00000254962 None
ENSG00000223929 None
ENSG00000169075 None
ENSG00000283073 None
ENSG00000257151 None
ENSG00000233265 None
ENSG00000241670 None
ENSG00000273585 None
ENSG00000184258 None
ENSG00000205662 None
ENSG00000213865 None
ENSG00000265592 None
ENSG00000231643 None
ENSG00000271171 None
ENSG000002669

{'ENSG00000221972': None,
 'ENSG00000257151': None,
 'ENSG00000186354': None,
 'ENSG00000265592': None,
 'ENSG00000285665': {'start': 171339299,
  'assembly_name': 'GRCh38',
  'biotype': 'processed_pseudogene',
  'db_type': 'core',
  'display_name': 'AL031274.1',
  'end': 171339959,
  'id': 'ENSG00000285665',
  'species': 'homo_sapiens',
  'source': 'havana',
  'logic_name': 'havana_homo_sapiens',
  'strand': 1,
  'seq_region_name': '1',
  'version': 1,
  'object_type': 'Gene',
  'description': 'DNA topoisomerase I pseudogene 1'},
 'ENSG00000205663': None,
 'ENSG00000231643': {'start': 101191115,
  'display_name': 'Z70280.1',
  'db_type': 'core',
  'biotype': 'processed_pseudogene',
  'assembly_name': 'GRCh38',
  'species': 'homo_sapiens',
  'id': 'ENSG00000231643',
  'end': 101191409,
  'logic_name': 'havana_homo_sapiens',
  'source': 'havana',
  'strand': -1,
  'object_type': 'Gene',
  'seq_region_name': 'X',
  'version': 4,
  'description': 'tyrosine 3-monooxygenase/tryptophan 5-mon

In [38]:
len(hgnc_lookup['id'].unique())

38953

In [10]:
tc.test_caps_equality(df)

AttributeError: 'int' object has no attribute 'upper'

In [37]:
#len(hgnc_lookup['id'].unique())
#len(hgnc_lookup.index.unique())
set1 = set(data_lookup.index[data_lookup['id'].isnull()]) # the question is which of these are in HGNC lookup
set2 = set(hgnc_lookup.index)
set1 & set2

{'AGPAT4-IT1',
 'ATP8A2P1',
 'BMS1P20',
 'BRD7P3',
 'C10ORF126',
 'CBX3P2',
 'CDR1',
 'COX7BP1',
 'ENO1-IT1',
 'ENSG00000069712',
 'ENSG00000133808',
 'ENSG00000184258',
 'ENSG00000186354',
 'ENSG00000198384',
 'ENSG00000204365',
 'ENSG00000205662',
 'ENSG00000205663',
 'ENSG00000205850',
 'ENSG00000212290',
 'ENSG00000213865',
 'ENSG00000220548',
 'ENSG00000221972',
 'ENSG00000223111',
 'ENSG00000223119',
 'ENSG00000223929',
 'ENSG00000224813',
 'ENSG00000225520',
 'ENSG00000231865',
 'ENSG00000233265',
 'ENSG00000233864',
 'ENSG00000234449',
 'ENSG00000235719',
 'ENSG00000236269',
 'ENSG00000236850',
 'ENSG00000240800',
 'ENSG00000241670',
 'ENSG00000242349',
 'ENSG00000243251',
 'ENSG00000243453',
 'ENSG00000248686',
 'ENSG00000250003',
 'ENSG00000251317',
 'ENSG00000253264',
 'ENSG00000254462',
 'ENSG00000254671',
 'ENSG00000254962',
 'ENSG00000256304',
 'ENSG00000256763',
 'ENSG00000256927',
 'ENSG00000257151',
 'ENSG00000260760',
 'ENSG00000265119',
 'ENSG00000265592',
 'ENSG0000

In [None]:
ENSG00000198384 TPTE2P3
ENSG00000266405 CBX3P2
ENSG00000256211 SKP1P2
ENSG00000271171 LINC00904
ENSG00000243251 PGBD3
ENSG00000212290 RNA5SP424
ENSG00000257151 PWAR6
ENSG00000257802 MRS2P2
ENSG00000180042 OR1R1P
ENSG00000270441 LAMB2P1
ENSG00000275691 MT1IP
ENSG00000231643 YWHAQP8
ENSG00000237456 ID2B
ENSG00000233864 TTTY15
ENSG00000285665 TOP1P1
ENSG00000236269 ENO1-IT1
ENSG00000184258 CDR1
ENSG00000236850 BMS1P20
ENSG00000279355 AGPAT4-IT1
ENSG00000240800 ATP8A2P1
ENSG00000225520 TTTY16
ENSG00000169075 BRD7P3
ENSG00000231865 SIK3-IT1
ENSG00000243453 COX7BP1
ENSG00000254962 OR4A14P
ENSG00000233265 MICF
ENSG00000242349 NPPA-AS1

In [66]:
def test_caps_equality(df):
    for icol, series in enumerate([df.index.to_series()] + [df.iloc[:,icol] for icol in range(df.shape[1])]):
        series2 = series[series.notnull()]
        series3 = series2.apply(lambda x: x.upper())
        print('Uppercase equality of column {}: {}'.format(icol, (series2==series3).sum(axis=0)==len(series2)))

test_caps_equality(data_lookup)

Uppercase equality of column 0: True
Uppercase equality of column 1: True


In [9]:
import pandas as pd
d = {'col1': ['andrew', 'weisman'], 'col2': [None, 'WEISMAN']}
df = pd.DataFrame(data=d)
# print((df['col1'] == df['col2']).sum(axis=0) == len(df))
# print(type(hgnc_lookup.index.to_series()), type(df['col1']))
# df['col2'][df['col2'].notnull()]
# type(df.iloc[:,1])



In [8]:
data_lookup

Unnamed: 0_level_0,id
name,Unnamed: 1_level_1
ENSG00000023902,ENSG00000023902
ENSG00000057757,ENSG00000057757
ENSG00000006652,ENSG00000006652
ENSG00000006327,ENSG00000006327
ENSG00000058272,ENSG00000058272
...,...
ZNF33AP1,ENSG00000235197
ZFYVE20,ENSG00000131381
ZCCHC16,ENSG00000187823
ZNF434,ENSG00000140987


In [9]:
hgnc_lookup

Unnamed: 0_level_0,id
name,Unnamed: 1_level_1
ENSG00000121410,ENSG00000121410
ENSG00000268895,ENSG00000268895
ENSG00000148584,ENSG00000148584
ENSG00000175899,ENSG00000175899
ENSG00000245105,ENSG00000245105
...,...
ZYG11B,ENSG00000162378
ZYX,ENSG00000159840
ZYXP1,ENSG00000274572
ZZEF1,ENSG00000074755


In [19]:
import random
x = random.sample([x[0] for x in id_lookup_list], k=1000)
x.sort()
y = tc.ensembl_name_request('id', x)

In [25]:
z = [i for i in y.keys()]
set(x)==set(z)

True

In [11]:
symbol_lookup_list

'ENSG00000198691'],
 ['AC005280.1', 'ENSG00000251393'],
 ['AC005086.1', 'ENSG00000252824'],
 ['ABHD11-AS1', 'ENSG00000225969'],
 ['A1CF', 'ENSG00000148584'],
 ['AADACL2', 'ENSG00000197953'],
 ['AC006427.1', 'ENSG00000248851'],
 ['AC007179.1', 'ENSG00000231815'],
 ['AASS', 'ENSG00000008311'],
 ['ABCC5-AS1', 'ENSG00000223882'],
 ['A2MP1', 'ENSG00000256069'],
 ['AATF', 'ENSG00000275700'],
 ['AC006427.2', 'ENSG00000286888'],
 ['ABCG4', 'ENSG00000172350'],
 ['AC005013.1', 'ENSG00000228421'],
 ['AC005086.3', 'ENSG00000279724'],
 ['ABCC11', 'ENSG00000121270'],
 ['AC002331.1', 'ENSG00000286712'],
 ['AC003070.1', 'ENSG00000267344'],
 ['ABHD17A', 'ENSG00000129968'],
 ['AC002553.1', 'ENSG00000227782'],
 ['ABHD11', 'ENSG00000106077'],
 ['AC005544.1', 'ENSG00000264491'],
 ['ABCC6P1', 'ENSG00000256340'],
 ['ABCA9', 'ENSG00000154258'],
 ['ABCC2', 'ENSG00000023839'],
 ['AC006076.1', 'ENSG00000225563'],
 ['AC006539.1', 'ENSG00000240673'],
 ['ABHD6', 'ENSG00000163686'],
 ['ABHD8', 'ENSG00000127220'],
 [

In [5]:
# Import relevant modules
import pandas as pd
import os

# Read the result.txt file created by the Biomart website
df = pd.read_csv(os.path.join(project_dir,'data','gene_lookup_table.txt'), sep='\t', names=['hgnc','symbol','id'], header=0)

# Make the contents of the symbol and id columns uppercase
df = df.dropna(axis='index') # this should result in 38,956 non-header rows --> it does
df['symbol'] = df['symbol'].apply(lambda x: x.upper())
df['id'] = df['id'].apply(lambda x: x.upper())

# Set the index to be the symbol column, delete that column and the HGNC ID column, delete rows without an Ensembl ID, and rename the index
df = df.set_index('symbol')
df = df.drop(columns='hgnc')
df = df.rename_axis(index='name')

# Duplicate the table but this time using the Ensembl IDs as the index labels
df2 = df.set_index(df['id'])
df2 = df2.rename_axis(index='name')

AttributeError: 'float' object has no attribute 'upper'

In [168]:
import pandas as pd
import os
df = pd.read_csv(os.path.join(project_dir,'data','gene_lookup_table.txt'), sep='\t', names=['hgnc','symbol','id'], header=0)
#df['Approved symbol'].apply(lambda x: x.lower())
df = df.set_index(df['symbol'].apply(lambda x: x.upper()))
df = df.drop(columns=['hgnc','symbol'])
df = df.dropna(axis='index') # this should result in 38,956 non-header rows --> it does
df = df.rename_axis(index='name')
#df
df2 = df.set_index(df['id'])
df2 = df2.rename_axis(index='name')
#df2
df_tot = pd.concat([df2,df])
#df_tot
lookup = lookup.rename_axis(index='name')
#lookup
df['id'] = df['id'].apply(lambda x: x.lower())
df

Unnamed: 0_level_0,id
name,Unnamed: 1_level_1
A1BG,ensg00000121410
A1BG-AS1,ensg00000268895
A1CF,ensg00000148584
A2M,ensg00000175899
A2M-AS1,ensg00000245105
...,...
ZYG11B,ensg00000162378
ZYX,ensg00000159840
ZYXP1,ensg00000274572
ZZEF1,ensg00000074755


In [115]:
#import numpy as np
#np.sum(lookup['id'] == None)
set1 = set([x.upper() for x in lookup.index[lookup.loc[:,'id'].isnull()].tolist()])
import pandas as pd
hgnc = pd.read_csv('/data/BIDS-HPC/private/projects/dmi/data/gene_lookup_table.txt', sep='\t')
set2 = set([x.upper() for x in hgnc['Approved symbol'].tolist()])
set1.intersection(set2)
#hgnc['Approved symbol'].lower()
hgnc['upper'] = hgnc['Approved symbol'].apply(lambda x: x.upper())
hgnc['lower'] = hgnc['Approved symbol'].apply(lambda x: x.lower())
intersection = set1.intersection(set2)
for name in intersection:
    #print(name)
    #print(hgnc[:,'Approved symbol'][np.sum(hgnc['upper'] == name)])
    mylist = hgnc.loc[hgnc.loc[:,'Approved symbol'] == name, 'Ensembl gene ID'].to_list()
    if len(mylist) != 0:
        if not isinstance(mylist[0],float):
            print(mylist[0], name)
        #print(type(mylist[0]))

ENSG00000198384 TPTE2P3
ENSG00000266405 CBX3P2
ENSG00000256211 SKP1P2
ENSG00000271171 LINC00904
ENSG00000243251 PGBD3
ENSG00000212290 RNA5SP424
ENSG00000257151 PWAR6
ENSG00000257802 MRS2P2
ENSG00000180042 OR1R1P
ENSG00000270441 LAMB2P1
ENSG00000275691 MT1IP
ENSG00000231643 YWHAQP8
ENSG00000237456 ID2B
ENSG00000233864 TTTY15
ENSG00000285665 TOP1P1
ENSG00000236269 ENO1-IT1
ENSG00000184258 CDR1
ENSG00000236850 BMS1P20
ENSG00000279355 AGPAT4-IT1
ENSG00000240800 ATP8A2P1
ENSG00000225520 TTTY16
ENSG00000169075 BRD7P3
ENSG00000231865 SIK3-IT1
ENSG00000243453 COX7BP1
ENSG00000254962 OR4A14P
ENSG00000233265 MICF
ENSG00000242349 NPPA-AS1


In [52]:
import pandas as pd
import numpy as np
data = id_lookup_list + symbol_lookup_list
df = pd.DataFrame(data=[x[1] for x in data], index=[x[0] for x in data], columns=['id'])
np.sum(df.isnull())
nnone = 0
for x in id_lookup_list:
    if x[1] is None:
        nnone = nnone + 1
for x in symbol_lookup_list:
    if x[1] is None:
        nnone = nnone + 1
print(nnone, np.sum(df.isnull())) # this is a good check for the number of Nones in the two inital lists (id_lookup_list and symbol_lookup_list)
#df.loc[['BRCA2','ENSG00000023902'],'id'] = [4,5]
df.loc[['BRCA2','ENSG00000023902']]
nnotnones = 0
indexes = []
values = []
for chunk in missing_lookup_chunks:
    for item in chunk:
        if item[1] is not None:
            nnotnones = nnotnones + 1 # this is a good check for how many we identified (based on the .out files)
            indexes.append(item[0])
            values.append(item[1])
print(nnotnones)
values
df.loc[indexes,'id'] = values
print(np.sum(df.isnull()))

40233 id    40233
dtype: int64
2514
id    37719
dtype: int64


In [57]:
assert(5+4==9)

In [53]:
len(df)

128610

In [16]:
import random
none_list = []
for item in id_lookup_list:
    if item[1] is None:
        none_list.append(item)
none_list2 = random.sample(none_list, k=1000)
test_chunk = tc.get_missing_lookups_in_chunks(none_list2, project_dir, chunk_size=1000, pickle_dir_single='missing_lookup_lists2')
#x = test_chunk[0]
x = missing_lookup_chunks[10]
for item in x:
    if item[1] is not None:
        print('hey it did something!')
print('andrew')

Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_lists2/missing_lookup_list_000.pkl...
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
hey it did something!
andrew


In [9]:
x = test_chunk[0]
for item in x:
    if item[1] is not None:
        print('hey it did something!')

In [49]:
test_chunk = tc.get_missing_lookups_in_chunks(none_list2, project_dir, chunk_size=1000, pickle_dir_single='missing_lookup_lists2')

Given 1 names, we identified 0 of them
[560] Given 1 names, we identified 0 of them
[561] Given 1 names, we identified 0 of them
[562] Given 1 names, we identified 0 of them
[563] Given 1 names, we identified 0 of them
[564] Given 1 names, we identified 0 of them
[565] Given 1 names, we identified 0 of them
[566] Given 1 names, we identified 0 of them
[567] Given 1 names, we identified 0 of them
[568] Given 1 names, we identified 0 of them
[569] Given 1 names, we identified 0 of them
[570] Given 1 names, we identified 0 of them
[571] Given 1 names, we identified 0 of them
[572] Given 1 names, we identified 0 of them
[573] Given 1 names, we identified 0 of them
[574] Given 1 names, we identified 0 of them
[575] Given 1 names, we identified 0 of them
[576] Given 1 names, we identified 0 of them
[577] Given 1 names, we identified 0 of them
[578] Given 1 names, we identified 0 of them
[579] Given 1 names, we identified 0 of them
[580] Given 1 names, we identified 0 of them
[581] Given 1 na

()

In [7]:
print(sum([ 1 if x[1] is None else 0 for x in symbol_lookup_list ]))
print(sum([ 1 if x[1] is None else 0 for x in id_lookup_list ]))

24782
15451


In [6]:
tc.ensembl_name_request('xref', ['OK/SW-CL.36'], wait_time=0)
#tc.ensembl_name_request('xref', ['andrew'], wait_time=0)
#tc.ensembl_name_request('xref', ['BRCA2'], wait_time=0)



[]

In [11]:
imissing = 0
for item in symbol_lookup_list:
    if item[0].find('/') != -1:
        print(iitem, item)

27976 ['OK/SW-CL.36', None]
27988 ['OK/SW-CL.58', None]


Go through the missing lookups in the symbols lookup list and try to determine them using the xref endpoint of the Ensembl REST API

In [5]:
missing_lookup_list = tc.get_missing_lookups(project_dir, symbol_lookup_list, max_num_names=5)

Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/missing_lookup_list.pkl...


In [20]:
chunk_size = 10

# Get the names in the symbol lookup list that do not have Ensembl IDs
to_xref = []
for item in symbol_lookup_list:
    if item[1] is None:
        to_xref.append(item[0])

# Split this list of names to try xref-ing into chunks of a certain size
to_xref_lists = list(tc.divide_chunks(to_xref, chunk_size))

# Set the directory to save the missing lists in and create it if it doesn't already exist
pickle_dir = os.path.join(project_dir,'data','missing_lookup_lists')
if not os.path.exists(pickle_dir):
    os.mkdir(pickle_dir)

# For each sub-list in to_xref, if the corresponding pickle file doesn't already exist, go through the names and try to determine their Ensembl IDs using the xref endpoint of the Ensembl REST API, and save the pickle file
for ilist, to_xref_single_list in enumerate(to_xref_lists):
    pickle_file = 'missing_lookup_list_{:03d}.pkl'.format(ilist)
    get_missing_lookups(pickle_dir, pickle_file, to_xref_single_list, max_num_names=-1)

    #print(pickle_file)

#print([ len(y) for y in to_xref_lists ])
#print(to_xref_lists[0])

missing_lookup_list_000.pkl doesn't exist
AC005391.3
missing_lookup_list_001.pkl doesn't exist
AC012314.20
missing_lookup_list_002.pkl doesn't exist
AC091654.7
missing_lookup_list_003.pkl doesn't exist
AL357932.1
missing_lookup_list_004.pkl doesn't exist
C1ORF111
missing_lookup_list_005.pkl doesn't exist
CTD-2218K11.3
missing_lookup_list_007.pkl doesn't exist
GSG2
missing_lookup_list_008.pkl doesn't exist
LOC100131860
missing_lookup_list_009.pkl doesn't exist
LOC57399
missing_lookup_list_010.pkl doesn't exist
RP1-101K10.4
missing_lookup_list_011.pkl doesn't exist
RP11-10N16.2
missing_lookup_list_012.pkl doesn't exist
RP11-181K3.4
missing_lookup_list_013.pkl doesn't exist
RP11-21C4.5
missing_lookup_list_014.pkl doesn't exist
RP11-27M9.1
missing_lookup_list_015.pkl doesn't exist
RP11-327I22.8
missing_lookup_list_016.pkl doesn't exist
RP11-413E6.1
missing_lookup_list_017.pkl doesn't exist
RP11-432M8.9
missing_lookup_list_018.pkl doesn't exist
RP11-51L5.1
missing_lookup_list_019.pkl doesn'

In [24]:
print(to_xref_lists[2])

['AC091654.7', 'AC096669.3', 'AC103863.1', 'AC092570.3', 'AC079834.3', 'AC074093.1', 'AC092174.1', 'AC092765.1', 'AC091969.1', 'AC093110.3', 'AC073900.4', 'AC074117.10', 'AC098614.3', 'AC104037.1', 'AC079150.2', 'AC074323.1', 'AC104438.1', 'AC092798.2', 'AC104651.1', 'AC093897.1', 'AC079807.4', 'AC078941.1', 'AC093106.4', 'AC092104.3', 'AC098971.2', 'AC083855.5', 'AC079235.1', 'AC092104.2', 'AC078988.1', 'AC079150.3', 'AC092669.2', 'AC083883.1', 'AC079586.1', 'AC097495.2', 'AC091736.11', 'AC093415.4', 'AC100852.2', 'AC092431.3', 'AC103965.1', 'AC092646.2', 'AC092662.6', 'AC092573.3', 'AC074389.7', 'AC099339.1', 'AC092619.1', 'AC098973.2', 'AC090519.7', 'AC092796.1', 'AC092641.2', 'AC078794.1', 'AC093847.1', 'AC094019.4', 'AC079117.3', 'AC096582.8', 'AC093899.3', 'AC092295.7', 'AC096649.4', 'AC079807.2', 'AC104076.3', 'AC099757.1', 'AC084010.1', 'AC093106.5', 'AC096655.2', 'AC098592.8', 'AC073987.2', 'AC092122.2', 'AC104389.32', 'AC079603.1', 'AC104308.2', 'AC100830.4', 'AC091813.2', 'A

In [4]:
my_list = ['geeks', 'for', 'geeks', 'like', 
           'geeky','nerdy', 'geek', 'love', 
               'questions','words', 'life'] 
# How many elements each list should have 
n = 5

x = list(tc.divide_chunks(my_list, n)) 
print(x)
print([ len(y) for y in x ])

[['geeks', 'for', 'geeks', 'like', 'geeky'], ['nerdy', 'geek', 'love', 'questions', 'words'], ['life']]
[5, 5, 1]


In [7]:
to_xref = []
for item in symbol_lookup_list:
    if item[1] is None:
        to_xref.append(item[0])

In [6]:
missing_lookup_list

[['AC005391.3', None],
 ['AC005330.2', None],
 ['AC005329.7', None],
 ['AC007040.10', None],
 ['AC006159.4', None]]

In [None]:
lookup_list = []
iiter = 0
nidentified = 0
for item in symbol_lookup_list:
    if item[1] is None:
        lookup_list, result_processed, iiter, nidentified = run_and_process_query('xref', [item[0]], lookup_list, iiter, nidentified)

In [49]:
#res = tc.ensembl_name_request('xref', [to_xref2[0]])
#res = tc.ensembl_name_request('xref', ['AC005086.1'])
#res
names_list = ['AC005086.1']
res = [{'type': 'gene', 'id': 'ENSG00000252824'}]
{names_list[0]: res[0]}

if endpoint == 'xref':
    if res:
        #print('not empty')
        res = {names_list[0]: res[0]}
    else:
        #print('empty')
        res = {}

# Symbol does not exist: {}
# Symbol exists:
# {'AC005086.1': {'version': 1,
#   'biotype': 'snoRNA',
#   'object_type': 'Gene',
#   'strand': -1,
#   'id': 'ENSG00000252824',
#   'db_type': 'core',
#   'assembly_name': 'GRCh38',
#   'logic_name': 'ncrna_homo_sapiens',
#   'source': 'ensembl',
#   'display_name': 'AC005086.1',
#   'seq_region_name': '7',
#   'end': 102194164,
#   'species': 'homo_sapiens',
#   'start': 102194076}}

# xref does not exist: [] --> {}
# xref exists: [{'type': 'gene', 'id': 'ENSG00000252824'}] --> {names_list[0]: res[0]}

ERROR: Names list is too long (2) for the xref endpoint (max length is 1)


NameError: name 'exit' is not defined

In [48]:
'andrew' == 'anddrew'

False

In [50]:
tmp_list = []
iiter = 0
nidentified = 0
for item in to_xref2:
    tmp_list, result_processed, iiter, nidentified = tc.run_and_process_query('xref', [item], tmp_list, iiter, nidentified)

AttributeError: 'list' object has no attribute 'keys'

In [13]:
to_xref2

['AC005391.3',
 'AC005330.2',
 'AC005329.7',
 'AC007040.10',
 'AC006159.4',
 'AC004893.11',
 'AC004692.4',
 'AC002310.11',
 'AC005392.13',
 'AC002069.6']

In [12]:
to_xref2 = [ to_xref[x] for x in range(10) ]

In [5]:
symbol_lookup_list

'ENSG00000198691'],
 ['AC005280.1', 'ENSG00000251393'],
 ['AC005086.1', 'ENSG00000252824'],
 ['ABHD11-AS1', 'ENSG00000225969'],
 ['A1CF', 'ENSG00000148584'],
 ['AADACL2', 'ENSG00000197953'],
 ['AC006427.1', 'ENSG00000248851'],
 ['AC007179.1', 'ENSG00000231815'],
 ['AASS', 'ENSG00000008311'],
 ['ABCC5-AS1', 'ENSG00000223882'],
 ['A2MP1', 'ENSG00000256069'],
 ['AATF', 'ENSG00000275700'],
 ['AC006427.2', 'ENSG00000286888'],
 ['ABCG4', 'ENSG00000172350'],
 ['AC005013.1', 'ENSG00000228421'],
 ['AC005086.3', 'ENSG00000279724'],
 ['ABCC11', 'ENSG00000121270'],
 ['AC002331.1', 'ENSG00000286712'],
 ['AC003070.1', 'ENSG00000267344'],
 ['ABHD17A', 'ENSG00000129968'],
 ['AC002553.1', 'ENSG00000227782'],
 ['ABHD11', 'ENSG00000106077'],
 ['AC005544.1', 'ENSG00000264491'],
 ['ABCC6P1', 'ENSG00000256340'],
 ['ABCA9', 'ENSG00000154258'],
 ['ABCC2', 'ENSG00000023839'],
 ['AC006076.1', 'ENSG00000225563'],
 ['AC006539.1', 'ENSG00000240673'],
 ['ABHD6', 'ENSG00000163686'],
 ['ABHD8', 'ENSG00000127220'],
 [

In [10]:
to_xref = []
for item in symbol_lookup_list:
    if item[1] is None:
        to_xref.append(item[0])

In [3]:
if not os.path.exists(os.path.join(project_dir,'data','primary_lookup_lists.pkl')):

    # Import the GMB library in order to get the make_pickle() function
    gmb_dir = '/data/BIDS-HPC/private/projects/gmb/checkout'
    if gmb_dir not in sys.path:
        sys.path.append(gmb_dir)
    import time_cell_interaction_lib as tci # we need this to get the make_pickle() function

    # Query the Ensembl REST API to see whether we can identify the gene "names"
    id_lookup_list, id_nidentified = tc.get_lookup_list('id', os.path.join(project_dir,'data','unique_ensembl_ids.txt'))
    symbol_lookup_list, symbol_nidentified = tc.get_lookup_list('symbol', os.path.join(project_dir,'data','unique_other_names.txt'))

    # Run a check on the number of Nones in the lists; otherwise, save the lookup lists to disk
    if (sum([ 0 if x[1] is None else 1 for x in symbol_lookup_list ]) != symbol_nidentified) or (sum([ 0 if x[1] is None else 1 for x in id_lookup_list ]) != id_nidentified):
        print('ERROR: Inconsistent number of Nones in the lookup lists')
        exit
    else:
        tci.make_pickle([symbol_lookup_list, id_lookup_list], os.path.join(project_dir,'data'), 'primary_lookup_lists.pkl')

else:
    [symbol_lookup_list, id_lookup_list] = tci.load_pickle(os.path.join(project_dir,'data'), 'primary_lookup_lists.pkl')

[1] Given 1000 names, we identified 987 of them
[2] Given 1000 names, we identified 989 of them
[3] Given 1000 names, we identified 990 of them
[4] Given 1000 names, we identified 984 of them


KeyboardInterrupt: 

In [4]:
id_lookup_list, id_nidentified = tc.get_lookup_list('id', os.path.join(project_dir,'data','unique_ensembl_ids.txt'), max_list_len=10, max_iter=15)

[1] Given 10 names, we identified 10 of them
[2] Given 10 names, we identified 10 of them
[3] Given 10 names, we identified 10 of them
[4] Given 10 names, we identified 10 of them
[5] Given 10 names, we identified 10 of them
[6] Given 10 names, we identified 10 of them
[7] Given 10 names, we identified 10 of them
[8] Given 10 names, we identified 10 of them
[9] Given 10 names, we identified 10 of them
[10] Given 10 names, we identified 10 of them
[11] Given 10 names, we identified 10 of them
[12] Given 10 names, we identified 9 of them
[13] Given 10 names, we identified 7 of them
[14] Given 10 names, we identified 10 of them
[15] Given 10 names, we identified 10 of them


In [22]:
sum([ 0 if x[1] is None else 1 for x in symbol_lookup_list ])

870

In [17]:
col.is

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [7]:
list1 = ["ENSG00000005483", "ENSG00000005486", "ENSG00000005513", "ENSG00000005700", "ENSG00000005801", "ENSG00000005810", "ENSG00000005812", "ENSG00000005844", "ENSG00000005882", "ENSG00000005884"]
list2 = ["ENSG00000005889", "ENSG00000005893", "ENSG00000005955", "ENSG00000005961", "ENSG00000005981", "ENSG00000006007", "ENSG00000006015", "ENSG00000006016", "ENSG00000006025", "ENSG00000006042"]
res1 = tc.ensembl_name_request('id', list1)
res2 = tc.ensembl_name_request('id', list2)

In [9]:
res2

{'ENSG00000005961': {'db_type': 'core',
  'assembly_name': 'GRCh38',
  'id': 'ENSG00000005961',
  'start': 44372180,
  'logic_name': 'ensembl_havana_gene_homo_sapiens',
  'source': 'ensembl_havana',
  'end': 44389649,
  'display_name': 'ITGA2B',
  'seq_region_name': '17',
  'species': 'homo_sapiens',
  'biotype': 'protein_coding',
  'version': 19,
  'strand': -1,
  'description': 'integrin subunit alpha 2b [Source:HGNC Symbol;Acc:HGNC:6138]',
  'object_type': 'Gene'},
 'ENSG00000006025': {'description': 'oxysterol binding protein like 7 [Source:HGNC Symbol;Acc:HGNC:16387]',
  'strand': -1,
  'object_type': 'Gene',
  'biotype': 'protein_coding',
  'version': 12,
  'start': 47807372,
  'source': 'ensembl_havana',
  'display_name': 'OSBPL7',
  'end': 47821834,
  'species': 'homo_sapiens',
  'seq_region_name': '17',
  'logic_name': 'ensembl_havana_gene_homo_sapiens',
  'assembly_name': 'GRCh38',
  'db_type': 'core',
  'id': 'ENSG00000006025'},
 'ENSG00000006042': {'id': 'ENSG00000006042',


In [5]:
symbol_lookup_list

ENSG00000287636'],
 ['AARD', 'ENSG00000205002'],
 ['AC002066.1', 'ENSG00000237813'],
 ['AC004148.1', 'ENSG00000263272'],
 ['ABCB6', 'ENSG00000115657'],
 ['AC006947.1', 'ENSG00000229330'],
 ['AC003984.1', 'ENSG00000235139'],
 ['AC006499.2', 'ENSG00000250074'],
 ['AC003006.1', 'ENSG00000269026'],
 ['AC005537.2', 'ENSG00000232006'],
 ['AANAT', 'ENSG00000129673'],
 ['ABL2', 'ENSG00000143322'],
 ['AC005625.1', 'ENSG00000266963'],
 ['AC004771.1', 'ENSG00000234203'],
 ['ABCC9', 'ENSG00000069431'],
 ['ABI2', 'ENSG00000138443'],
 ['AC000124.1', 'ENSG00000226770'],
 ['ABHD13', 'ENSG00000139826'],
 ['AC005943.2', 'ENSG00000279009'],
 ['ABI3BP', 'ENSG00000154175'],
 ['AC002511.1', 'ENSG00000232680'],
 ['AC004063.1', 'ENSG00000250670'],
 ['AC003043.2', 'ENSG00000267211'],
 ['AC004562.1', 'ENSG00000199801'],
 ['AC005307.1', 'ENSG00000260725'],
 ['AC005154.5', 'ENSG00000281039'],
 ['ABCD1', 'ENSG00000101986'],
 ['AC002480.2', 'ENSG00000232949'],
 ['AC006328.1', 'ENSG00000233619'],
 ['AC004079.1', 'EN

## (2) Return a list of symbols and corresponding IDs identified using the symbol lookup REST function of Ensembl given the symbols in a text file called to_lookup.txt

In [2]:
max_post_size = 1000
res = tc.get_lookup_list(project_dir, max_post_size=max_post_size)

[1] Given 1000 symbols, we identified 311 of them
[2] Given 1000 symbols, we identified 477 of them
[3] Given 1000 symbols, we identified 457 of them
[4] Given 1000 symbols, we identified 462 of them
[5] Given 1000 symbols, we identified 470 of them
[6] Given 1000 symbols, we identified 424 of them
[7] Given 1000 symbols, we identified 180 of them
[8] Given 1000 symbols, we identified 82 of them
[9] Given 1000 symbols, we identified 0 of them
[10] Given 1000 symbols, we identified 22 of them
[11] Given 1000 symbols, we identified 17 of them
[12] Given 1000 symbols, we identified 0 of them
[13] Given 1000 symbols, we identified 10 of them
[14] Given 1000 symbols, we identified 0 of them
[15] Given 1000 symbols, we identified 0 of them
[16] Given 1000 symbols, we identified 0 of them
[17] Given 1000 symbols, we identified 0 of them
[18] Given 1000 symbols, we identified 0 of them
[19] Given 1000 symbols, we identified 0 of them
[20] Given 1000 symbols, we identified 0 of them
[21] Given 

## (3) Load the lookup table, add the new entries to it, and save it to disk

In [34]:
# Import relevant modules
import pandas as pd
import numpy as np
gmb_dir = '/data/BIDS-HPC/private/projects/gmb/checkout'
if gmb_dir not in sys.path:
    sys.path.append(gmb_dir)
import time_cell_interaction_lib as tci # we need this to get the make_pickle() function

# Load the first-pass of the global lookup table using Pandas
lookup = pd.read_csv(os.path.join(project_dir,'data','partial_global_lookup_table.txt'), sep='\t', names=['status','name','id'], index_col='name')

# Count how many null IDs there are
nnull_before = np.sum(pd.isnull(lookup.loc[:,'id']))

# Use the list generated in Step (2) to add IDs to the lookup table
for pair in res:
    lookup.loc[pair[0],:] = ['lookup:', pair[1]]

# Count how many null IDs there are
nnull_after = np.sum(pd.isnull(lookup.loc[:,'id']))

# Print how we improved the lookup table
print('{} lookup pairs have been recorded; {} null before, {} null after'.format(len(res), nnull_before, nnull_after))

# Save the new lookup table in the data directory
tci.make_pickle(lookup, os.path.join(project_dir,'data'), 'lookup_table_after_symbol_lookup.pkl')

2948 lookup pairs have been recorded; 27695 null before, 24747 null after
Creating pickle file /data/BIDS-HPC/private/projects/dmi/data/lookup_table_after_symbol_lookup.pkl...


In [None]:
lookup = tci.load_pickle(os.path.join(project_dir,'data'), 'lookup_table_after_symbol_lookup.pkl')

In [7]:
set(['a','c','d','e','f']) - set(['a','b','d'])


{'c', 'e', 'f'}

In [35]:
bleh = tci.load_pickle(os.path.join(project_dir,'data'), 'lookup_table_after_symbol_lookup.pkl')

Reading pickle file /data/BIDS-HPC/private/projects/dmi/data/lookup_table_after_symbol_lookup.pkl...


In [41]:
lookup.equals(bleh)

True

In [31]:
np.sum(lookup.loc[:,'status'] == 'unmatched:')

24747

In [13]:
lookup

Unnamed: 0_level_0,status,id
name,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,lookup:,ENSG00000000003
ENSG00000000005,lookup:,ENSG00000000005
ENSG00000000419,lookup:,ENSG00000000419
ENSG00000000457,lookup:,ENSG00000000457
ENSG00000000460,lookup:,ENSG00000000460
...,...,...
ZNRD1,unmatched:,
ZNRD1-AS1,unmatched:,
ZRSR1,unmatched:,
ZSCAN5D,unmatched:,


In [27]:
print(np.sum(pd.isnull(lookup.loc[:,'id'])))

27695


In [32]:
#lookup.loc['ENSG00000000457','id'] = 'andrew'
lookup.loc['AC010376.1',:]

status            lookup:
id        ENSG00000253801
Name: AC010376.1, dtype: object

In [11]:
import numpy as np
#np.sum(lookup['id'] == '')
#np.sum(pd.isnull(lookup.loc[:,'id']))
lookup.loc['AC009161.1']

KeyError: 'AC009161.1'

In [3]:
print(len(res))
res

2948


05'],
 ['AC009474.2', 'ENSG00000230355'],
 ['AC013410.2', 'ENSG00000237617'],
 ['AC009161.1', 'ENSG00000237901'],
 ['AC011379.1', 'ENSG00000250069'],
 ['AC016727.1', 'ENSG00000270820'],
 ['AC012564.1', 'ENSG00000283157'],
 ['AC012314.7', 'ENSG00000274905'],
 ['AC009492.1', 'ENSG00000278004'],
 ['AC011551.1', 'ENSG00000278797'],
 ['AC008565.1', 'ENSG00000251135'],
 ['AC009065.2', 'ENSG00000259780'],
 ['AC011453.1', 'ENSG00000269842'],
 ['AC013268.5', 'ENSG00000283684'],
 ['AC012462.3', 'ENSG00000230695'],
 ['AC009800.1', 'ENSG00000248911'],
 ['AC007948.1', 'ENSG00000263863'],
 ['AC010970.1', 'ENSG00000225840'],
 ['AC010376.1', 'ENSG00000253801'],
 ['AC008132.1', 'ENSG00000161103'],
 ['AC015871.1', 'ENSG00000259770'],
 ['AC008746.5', 'ENSG00000276551'],
 ['AC008410.1', 'ENSG00000253261'],
 ['AC008734.1', 'ENSG00000269300'],
 ['AC009478.1', 'ENSG00000225258'],
 ['AC015987.1', 'ENSG00000224746'],
 ['AC008674.1', 'ENSG00000287003'],
 ['AC016734.1', 'ENSG00000228305'],
 ['AC011752.1', 'ENSG0

In [30]:
datadir = os.path.join(project_dir, 'data')
max_post_size = 20
max_iter = 5
iter = 0
lookup_list = []
with open(os.path.join(datadir, 'to_lookup.txt'), 'r') as f:
    iname = 0
    symbols_list = []
    for name in f:
        iname = iname + 1
        symbols_list.append(name.rstrip())
        if iname == max_post_size:
            res = tc.ensembl_symbol_lookup(symbols_list)
            for key in res.keys():
                lookup_list.append([key, res.get(key).get('id')])
            iter = iter + 1
            if iter == max_iter:
                break
            iname = 0
            symbols_list = []

if iname != max_post_size:
    res = tc.ensembl_symbol_lookup(symbols_list)
    for key in res.keys():
        lookup_list.append([key, res.get(key).get('id')])
    iter = iter + 1


print(iname)
print(symbols_list)
#tc.ensembl_symbol_lookup(symbols_list)

20
['AC002310.11', 'AC002310.12', 'AC002310.13', 'AC002310.17', 'AC002310.7', 'AC002314.2', 'AC002314.4', 'AC002321.1', 'AC002321.2', 'AC002331.1', 'AC002365.1', 'AC002366.3', 'AC002368.4', 'AC002383.2', 'AC002386.1', 'AC002395.1', 'AC002398.11', 'AC002398.12', 'AC002398.13', 'AC002398.5']


In [32]:
len(lookup_list)

17

In [6]:
res = tc.ensembl_symbol_lookup(symbols_list)
# # The maximum POST size (and therefore list size) is 1000
# if len(symbols_list) > 1000:
#     print('ERROR: POST lookup list is too long ({})'.format(len(symbols_list)))
#     exit

# # Import relevant modules
# import requests, sys

# # Set some constants
# server = "https://rest.ensembl.org"
# ext = "/lookup/symbol/homo_sapiens"
# headers={ "Content-Type" : "application/json", "Accept" : "application/json"}

# # Query the REST server
# #r = requests.post(server+ext, headers=headers, data='{ "symbols" : ["BRCA2", "BRAF" ] }')
# r = requests.post(server+ext, headers=headers, data=str({'symbols': symbols_list}).replace('\'','"'))

# # Ensure everything is okay
# if not r.ok:
#     r.raise_for_status()
#     sys.exit()

# # Return the result
# #return(repr(r.json()))
# print(r.json())


In [27]:
print([ [key, res.get(key).get('id')] for key in res.keys() ])
#res.get('AC000124.1')

[['AC002386.1', 'ENSG00000230941'], ['AC000362.1', 'ENSG00000230820'], ['AC002056.3', 'ENSG00000283252'], ['AC002306.1', 'ENSG00000259242'], ['AC000124.1', 'ENSG00000226770'], ['AC002066.1', 'ENSG00000237813'], ['AC000081.2', 'ENSG00000251940'], ['AC002070.1', 'ENSG00000248636'], ['AC000367.1', 'ENSG00000224981'], ['AC000374.1', 'ENSG00000227249'], ['AC002064.4', 'ENSG00000280440'], ['AC002395.1', 'ENSG00000272215'], ['AC000029.1', 'ENSG00000275635'], ['AC000067.1', 'ENSG00000225007'], ['AC002365.1', 'ENSG00000206844'], ['AC002331.1', 'ENSG00000286712'], ['AC000403.1', 'ENSG00000278727']]


In [39]:
# The maximum POST size (and therefore list size) is 1000
if len(symbols_list) > 1000:
    print('ERROR: POST lookup list is too long ({})'.format(len(symbols_list)))
    exit

# Import relevant modules
import requests, sys

# Set some constants
server = "https://rest.ensembl.org"
ext = "/lookup/symbol/homo_sapiens"
headers={ "Content-Type" : "application/json", "Accept" : "application/json"}

# Query the REST server
#r = requests.post(server+ext, headers=headers, data='{ "symbols" : ["BRCA2", "BRAF" ] }')
r = requests.post(server+ext, headers=headers, data=str({'symbols': symbols_list}).replace('\'','"'))

# Ensure everything is okay
if not r.ok:
    r.raise_for_status()
    sys.exit()

# Return the result
print(r.json())


{}


In [38]:
#str({'symbols': symbols_list}).replace('\'','"')
#
mystr = '{"symbols": ["1060P11.3", "7SK", "A2LD1", "A3GALT2P", "AA06", "AACP", "AAED1", "AARS", "AATK-AS1", "AB015752.3"]}'
r = requests.post(server+ext, headers=headers, data=mystr)
#[ repr(x) for x in r.iter_lines() ]
r.json()

{}

In [4]:
# The maximum POST size (and therefore list size) is 1000
if len(symbols_list) > 1000:
    print('ERROR: POST lookup list is too long ({})'.format(len(symbols_list)))
    exit

# Import relevant modules
import requests, sys

# Set some constants
server = "https://rest.ensembl.org"
ext = "/lookup/symbol/homo_sapiens"
headers={ "Content-Type" : "application/json", "Accept" : "application/json"}

# # Query the REST server
# #r = requests.post(server+ext, headers=headers, data='{ "symbols" : ["BRCA2", "BRAF" ] }')
r = requests.post(server+ext, headers=headers, data=str({'symbols': symbols_list}))


# # Ensure everything is okay
# if not r.ok:
#     r.raise_for_status()
#     sys.exit()

# # Return the result
# return(repr(r.json()))


In [17]:
tmp = { 'symbols' : ['BRCA2', 'BRAF' ] }
mydata = str(tmp).replace('\'','"')
r = requests.post(server+ext, data=mydata)

In [18]:
[ print(x) for x in r.iter_lines() ]

b'<html><title>EnsEMBL::REST</title><body><pre>--- '
b'BRAF: '
b'  assembly_name: GRCh38'
b'  biotype: protein_coding'
b'  db_type: core'
b'  description: B-Raf proto-oncogene, serine/threonine kinase [Source:HGNC Symbol;Acc:HGNC:1097]'
b'  display_name: BRAF'
b'  end: 140924928'
b'  id: ENSG00000157764'
b'  logic_name: ensembl_havana_gene_homo_sapiens'
b'  object_type: Gene'
b'  seq_region_name: 7'
b'  source: ensembl_havana'
b'  species: homo_sapiens'
b'  start: 140719327'
b'  strand: -1'
b'  version: 13'
b'BRCA2: '
b'  assembly_name: GRCh38'
b'  biotype: protein_coding'
b'  db_type: core'
b'  description: BRCA2 DNA repair associated [Source:HGNC Symbol;Acc:HGNC:1101]'
b'  display_name: BRCA2'
b'  end: 32400266'
b'  id: ENSG00000139618'
b'  logic_name: ensembl_havana_gene_homo_sapiens'
b'  object_type: Gene'
b'  seq_region_name: 13'
b'  source: ensembl_havana'
b'  species: homo_sapiens'
b'  start: 32315086'
b'  strand: 1'
b'  version: 15'
b'</pre></body></html>'


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]