In [310]:
import pandas as pd
import json
import plotly.express as px
import plotly
import ast
from collections import Counter
import numpy as np

## Analysis of Gene Terms in DGIdb from 2023
Reanalysis of the log data obtained from DGIdb user search patterns in Q1/Q2 2023. This is intended to extract the gene terms most often searched from each search type and give them over to Anastasia who will subsequently determine the frequency of searches that have gene collisions.

### Load Data

In [311]:
df = pd.read_excel('log_data.xlsx')
df[0:5]

Unnamed: 0.1,Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error
0,0,2023-05-03T11:45:03.737594,#95027,34b514ef-ea79-498b-83a8-ec5abff01760,GET,/drug_names.json,json,DrugsController,names,200,58.31,0.14,0.0,{},,
1,0,2023-05-03T11:45:06.699989,#95027,47c0fa4a-7068-42ce-91d8-525f987568ff,GET,/genes/ICAM3,html,GenesController,show,200,895.1,755.2,129.73,{'name': 'ICAM3'},,
2,0,2023-05-03T11:45:10.303598,#94988,1fc0e9d3-f6a2-4b00-baba-0fd9ce9b5405,GET,/genes/HTT,html,GenesController,show,200,32723.75,31973.05,739.82,{'name': 'HTT'},,
3,0,2023-05-03T11:45:12.311825,#94988,205878ef-9bea-47c4-9cff-222a93839525,GET,/gene_names.json,json,GenesController,names,200,151.56,0.12,0.0,{},,
4,0,2023-05-03T11:45:15.174727,#94988,15d9ec98-d825-49ef-ad58-0fd398ea6839,GET,/drug_names.json,json,DrugsController,names,200,54.78,0.1,0.0,{},,


### Clean Data

In [312]:
#We are interested in the api/v2/interactions queries
df['path'].value_counts()[0:10]

path
/api/v2/interactions.json         25280
/                                 12654
/gene_names.json                   7051
/interaction_search_results        5748
/search_interactions               5241
/api/v2/interaction_types.json     3202
/drug_names.json                   2070
/search_categories                 1523
/downloads                         1064
/categories_search_results         1048
Name: count, dtype: int64

In [313]:
data = df[df['path']=='/api/v2/interactions.json'].drop(labels='Unnamed: 0',axis=1).reset_index(drop=True)
data[0:5]

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error
0,2023-05-03T11:59:21.273789,#95027,5a1bd777-14c2-44b1-87b8-4bcec01433c4,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,46.52,7.66,21.43,{'genes': 'ATN1'},,
1,2023-05-03T12:15:13.185862,#95027,66a47e2f-c94d-451c-8cdb-c1a29617b3cb,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,993.19,917.98,61.5,{'drugs': 'Scriptaid'},,
2,2023-05-03T12:25:02.801847,#95027,9f3932bb-5611-4317-ab44-fab33992e283,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,37.76,3.25,23.25,{'genes': 'GPR26'},,
3,2023-05-03T12:29:09.147971,#95027,3a24f3df-c114-4739-8bdc-f89203ec31a0,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,35.21,0.52,19.2,{'drugs': 'Isoprenol'},,
4,2023-05-03T12:29:51.541701,#95027,e88c50f0-2b4f-4c7f-ab0d-075095bce6c2,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,34.51,3.2,20.37,{'genes': 'IL10RA'},,


In [314]:
#We want to pull out the genes from the params of the query data
def get_search_type(record):
    try:
        record = record.replace("'",'"')
        record = json.loads(record)
        keys = list(record.keys())
        if len(keys) > 0:
            return keys[0]
        else:
            return 'KeyLength'
    except json.JSONDecodeError:
        return 'JSONDecode'


In [315]:
data['params'][0]
test = data['params'][0].replace("'",'"')
test = json.loads(test)
list(test.keys())

['genes']

In [316]:
data['type'] = None
data['type'] = data['params'].apply(get_search_type)

In [317]:
# Distribution of Types of Interaction Query sent (drugs vs genes)
data['type'].value_counts()

type
genes         23070
drugs          1647
KeyLength       543
JSONDecode       20
Name: count, dtype: int64

In [318]:
data = data[data['type']=='genes'].reset_index(drop=True)
data[0:5]

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error,type
0,2023-05-03T11:59:21.273789,#95027,5a1bd777-14c2-44b1-87b8-4bcec01433c4,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,46.52,7.66,21.43,{'genes': 'ATN1'},,,genes
1,2023-05-03T12:25:02.801847,#95027,9f3932bb-5611-4317-ab44-fab33992e283,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,37.76,3.25,23.25,{'genes': 'GPR26'},,,genes
2,2023-05-03T12:29:51.541701,#95027,e88c50f0-2b4f-4c7f-ab0d-075095bce6c2,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,34.51,3.2,20.37,{'genes': 'IL10RA'},,,genes
3,2023-05-03T12:29:57.144888,#95027,26747924-e565-43ac-b36e-9d802d4ee5c8,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,33.73,3.04,20.15,{'genes': 'VSTM2A'},,,genes
4,2023-05-03T12:30:01.291949,#95027,8c9d7628-ce57-4c3d-b74e-a7acd042cb6b,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,189.23,115.09,61.95,{'genes': 'SH2B3'},,,genes


In [319]:
def get_genes(record):
    try:
        record = record.replace("'",'"')
        record = json.loads(record)
        if record['genes']:
            genes = record['genes'].split(',')
            return genes
        else:
            return 'KeyLength'
    except:
        pass 

In [320]:
data['genes'] = None
data['genes'] = data['params'].apply(get_genes)
data[0:5]

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error,type,genes
0,2023-05-03T11:59:21.273789,#95027,5a1bd777-14c2-44b1-87b8-4bcec01433c4,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,46.52,7.66,21.43,{'genes': 'ATN1'},,,genes,[ATN1]
1,2023-05-03T12:25:02.801847,#95027,9f3932bb-5611-4317-ab44-fab33992e283,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,37.76,3.25,23.25,{'genes': 'GPR26'},,,genes,[GPR26]
2,2023-05-03T12:29:51.541701,#95027,e88c50f0-2b4f-4c7f-ab0d-075095bce6c2,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,34.51,3.2,20.37,{'genes': 'IL10RA'},,,genes,[IL10RA]
3,2023-05-03T12:29:57.144888,#95027,26747924-e565-43ac-b36e-9d802d4ee5c8,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,33.73,3.04,20.15,{'genes': 'VSTM2A'},,,genes,[VSTM2A]
4,2023-05-03T12:30:01.291949,#95027,8c9d7628-ce57-4c3d-b74e-a7acd042cb6b,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,189.23,115.09,61.95,{'genes': 'SH2B3'},,,genes,[SH2B3]


### How many symbols were queried across all queries? (queries may have more than one gene)

In [321]:
def flatten_comprehension(matrix):
    return [item for row in matrix for item in row]

In [322]:
queried_genes_list = data['genes'].tolist()

In [323]:
queried_genes_list = flatten_comprehension(queried_genes_list)

In [324]:
print(len(queried_genes_list))

188832


### How many interaction queries are requested using genes?

In [326]:
len(data['genes'])

23070

### How many unique gene symbols were queried between the 23,070 queries and 188,832 total symbols?

In [325]:
data['genes'] = data['genes'].apply(set)
data['genes']

0                                         {ATN1}
1                                        {GPR26}
2                                       {IL10RA}
3                                       {VSTM2A}
4                                        {SH2B3}
                          ...                   
23065       {RBMY1C, QSOX1, RN7SL5P, CLIP1, PVR}
23066    {ZNF341, GINM1, MFSD2A, SNORD15B, OSR2}
23067       {TFAP2B, MKRN4P, VASP, SLPI, POU4F1}
23068       {PTHLH, HSD3B2, ITGAM, IDDM9, MTCP1}
23069           {SRRM4, HPDL, JAM3, SLIRP, MOB2}
Name: genes, Length: 23070, dtype: object

In [327]:
queried_gene_symbols_set = data['genes'].agg(lambda x: set.union(*x))
print(len(queried_gene_symbols_set))

43400


In [328]:
#Count each queried gene
all_genes = [gene for sublist in data['genes'] for gene in sublist]
gene_counts = Counter(all_genes)
gene_counts

Counter({'XYZA': 1549,
         'TNF': 1001,
         'AP1': 715,
         'AP2': 576,
         'IL6': 429,
         'IL8': 417,
         'BRAF': 305,
         'XYZB': 278,
         'n': 158,
         'KRAS': 148,
         'L': 144,
         't': 140,
         'K': 140,
         'e': 140,
         'g': 140,
         'h': 140,
         'y': 140,
         'NME1-NME2': 73,
         'COL1A1': 71,
         'UCHL1': 62,
         'NPIPA5': 58,
         'DSG1': 58,
         'MMP2': 57,
         'BASP1': 57,
         'FBN1': 55,
         'KRT14': 55,
         'CAV1': 54,
         'MARCKS': 54,
         'MAP2': 53,
         'DSP': 51,
         'COL6A1': 50,
         'TNS3': 50,
         'CXCL8': 49,
         'LRP1': 49,
         'COL3A1': 49,
         'KIF1A': 49,
         'RAP1GAP2': 49,
         'C1QA': 48,
         'FN1': 48,
         'RUNX1': 48,
         'PTPRC': 48,
         'COL4A1': 48,
         'NEDD4': 46,
         'FLT3': 46,
         'PROSER2': 46,
         'MAP1A': 46,
         'PPB

In [330]:
gene_counts['KRAS']

148

### Load Ambiguous Gene Symbol Set

In [331]:
with open('ambiguous_symbol_set.txt', 'r') as file:
    # Read each line, strip newline characters, and convert to a set
    ambiguous_symbol_set = set(line.strip() for line in file)
ambiguous_symbol_set

{'FAM66E',
 'S3',
 'NOD5',
 'DHX40P1, TBC1D3P1',
 'CS-1',
 'FAM25A',
 'IGHEP1',
 'DIO1',
 'FRITZ',
 'CDG2T',
 'CXDELq22.3',
 'ALDR1',
 'RUFY3',
 'LAP3',
 'UTR',
 'FOP',
 'MUHH',
 'HP-1',
 'GPRK7',
 'EMC19',
 'ARC',
 'CDD',
 'FAM236C',
 'K7',
 'PLIP',
 'CT90',
 'OLF1',
 'AGS8',
 'NUDT10',
 'NKIR',
 'LAS1',
 'NL2',
 'C18ORF2',
 'CAR',
 'DBM',
 'PAGB',
 'PNR',
 'PRPH',
 'DYT14',
 'NAP1',
 'CYPD',
 'BST1',
 'PI3K',
 'FIP1',
 'PP',
 'PTPA',
 'TFIIA',
 'IL8RA',
 'MAK3',
 'NET1',
 'STRA13',
 'MCT',
 'MMP21',
 'CLPSMCR',
 'SSA',
 'KRTAP5-3',
 'TRS-AGA2-4',
 'TP53TG3F',
 'XGPY',
 'D17S1718',
 'A2',
 'RCN3',
 'BEDP',
 'ME2',
 'JDP1',
 'TRS-GCT4-3',
 'COD2',
 'MRG1',
 'LGS',
 'LRF',
 'ESP1',
 'ASE1',
 'CIP1',
 'DGS',
 'UGT1C',
 'PAK3',
 'SHSF3',
 'TPS2',
 'GPCR',
 'eIF-2gA',
 'HBP',
 'PNMT',
 'AR',
 'H2BFG',
 'PRR20A, PRR20B, PRR20D, PRR20E',
 'L18',
 'PTH2',
 'CACNA1C-IT2',
 'HK2',
 'OR6-1',
 'CXorf52',
 'C11orf48',
 'CHC',
 'MRP8',
 'tamo',
 'CGB5, CGB7, CGB8, LHB',
 'PARC',
 'MLCK2',
 'TST1',


In [332]:
len(ambiguous_symbol_set)

5050

### How many of the 23,070 queries included an ambiguous symbol?

In [333]:
data['ambiguous_gene_in_query'] = data.genes.apply(lambda x: x & ambiguous_symbol_set)
data_df = data[data.ambiguous_gene_in_query != set()]
data_df

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error,type,genes,ambiguous_gene_in_query
51,2023-04-29T02:35:58.280063,#333251,75e47a32-329c-4a03-a039-8d72135a1cc6,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,10792.21,10374.24,340.52,"{'genes': 'KRAS', 'interaction_sources': '', '...",,,genes,{KRAS},{KRAS}
63,2023-04-29T03:45:27.432018,#333544,b8e02f25-bc77-488e-89c0-7dbce1078e22,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,10276.04,9858.90,339.68,"{'genes': 'KRAS', 'interaction_sources': '', '...",,,genes,{KRAS},{KRAS}
77,2023-04-29T04:26:52.079678,#333735,3f1776e9-0e6c-49f5-abeb-09fae0bbfcc3,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,10252.55,9812.72,341.46,"{'genes': 'KRAS', 'interaction_sources': '', '...",,,genes,{KRAS},{KRAS}
92,2023-04-29T08:49:07.336871,#338090,c3c73214-28ad-44f1-8b17-0befc5036d84,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,11173.90,10715.35,379.71,"{'genes': 'KRAS', 'interaction_sources': '', '...",,,genes,{KRAS},{KRAS}
103,2023-04-29T09:43:21.049497,#341313,4e7238fe-9d9a-422c-a447-67abee45657e,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,10561.47,10117.51,348.08,"{'genes': 'KRAS', 'interaction_sources': '', '...",,,genes,{KRAS},{KRAS}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23036,2023-04-16T01:17:14.363678,#37959,c83f3204-ff8a-46b9-976d-dfb0db6b9e62,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,516.33,433.78,70.57,"{'genes': 'RPL41P5,RPL13,PTPRR,PEBP1,MYL5'}",,,genes,"{MYL5, RPL41P5, PEBP1, RPL13, PTPRR}",{RPL13}
23056,2023-04-16T01:17:19.774606,#37959,924bbe55-47a8-4b48-b0e5-8d06be1d4c87,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,50.15,11.97,26.96,"{'genes': 'TUBGCP2,RBM7,KMT2B,MRPL33,LDB1'}",,,genes,"{LDB1, RBM7, TUBGCP2, KMT2B, MRPL33}",{LDB1}
23064,2023-04-16T01:17:35.989923,#37959,96b8aa02-0862-4b29-bba9-40934aa1caaa,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,15158.28,14676.73,439.61,"{'genes': 'POLH,NPM1P2,MT-TL2,MSK9,MPI'}",,,genes,"{MSK9, NPM1P2, MPI, MT-TL2, POLH}","{MPI, POLH}"
23065,2023-04-16T01:17:36.045976,#37959,385d79d5-7991-4b4b-ad6e-aeb0886c77a0,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,53.55,13.20,28.44,"{'genes': 'CLIP1,RN7SL5P,RBMY1C,PVR,QSOX1'}",,,genes,"{RBMY1C, QSOX1, RN7SL5P, CLIP1, PVR}",{RBMY1C}


### Of the total 188,832 gene symbols searched, how many times were ambiguous gene symbols searched?

In [334]:
def sum_values_for_terms(data_dict, term_set):
    total_sum = 0
    
    # Iterate over the dictionary
    for key, value in data_dict.items():
        # Check if the key is in the set of terms
        if key in term_set:
            # Add the value to the total sum
            total_sum += value
    
    return total_sum

In [335]:
# Call the function and print the result
result = sum_values_for_terms(gene_counts, ambiguous_symbol_set)
print(result) 

7531


### Of the 43,400 unique gene symbols queried, how many are ambiguous?

In [336]:
def create_df_for_terms(data_dict, term_set, csv_filename):
    # Filter the dictionary to include only the terms in the set
    filtered_dict = {key: value for key, value in data_dict.items() if key in term_set}
    
    # Create a DataFrame from the filtered dictionary
    df = pd.DataFrame(list(filtered_dict.items()), columns=['Ambiguous Symbol', '# of Queries'])

    df = df.sort_values(by='# of Queries', ascending=False)

    df.to_csv(csv_filename, index=False) 
    
    return df

In [337]:
# Call the function and get the DataFrame
df_result = create_df_for_terms(gene_counts, ambiguous_symbol_set, "ambiguous_symbol_queries.csv")
print(df_result)

     Ambiguous Symbol  # of Queries
0                KRAS           148
496              MAP2            53
192               DSP            51
29               LRP1            49
1189            MAP1A            46
...               ...           ...
744        TRQ-CTG1-1             2
1307             STK1             1
225               GIF             1
1314              TAZ             1
1315           SPACDR             1

[1316 rows x 2 columns]


### Summary

In [345]:
title = ['# Interaction Queries using Genes', '# Terms as Genes for Queries', '# Unique Gene Symbols Queried']
summary_data = {'': ['23,070', '188,832', '43,400'],
        '# Ambiguous Symbols':['3,201 (14%)', '7,531 (4%)', '1,316 (3%)']}
summary_df = pd.DataFrame(summary_data, index=title)
summary_df

Unnamed: 0,Unnamed: 1,# Ambiguous Symbols
# Interaction Queries using Genes,23070,"3,201 (14%)"
# Terms as Genes for Queries,188832,"7,531 (4%)"
# Unique Gene Symbols Queried,43400,"1,316 (3%)"
