In [1]:
import pandas as pd
import json
import plotly.express as px
import plotly
import ast
from collections import Counter
import numpy as np

## Analysis of Gene Terms in DGIdb from 2023
Reanalysis of the log data obtained from DGIdb user search patterns in Q1/Q2 2023. This is intended to extract the gene terms most often searched from each search type and give them over to Anastasia who will subsequently determine the frequency of searches that have gene collisions.

### Load Data

In [2]:
df = pd.read_excel("../input/log_data.xlsx")
df[0:5]

Unnamed: 0.1,Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error
0,0,2023-05-03T11:45:03.737594,#95027,34b514ef-ea79-498b-83a8-ec5abff01760,GET,/drug_names.json,json,DrugsController,names,200,58.31,0.14,0.0,{},,
1,0,2023-05-03T11:45:06.699989,#95027,47c0fa4a-7068-42ce-91d8-525f987568ff,GET,/genes/ICAM3,html,GenesController,show,200,895.1,755.2,129.73,{'name': 'ICAM3'},,
2,0,2023-05-03T11:45:10.303598,#94988,1fc0e9d3-f6a2-4b00-baba-0fd9ce9b5405,GET,/genes/HTT,html,GenesController,show,200,32723.75,31973.05,739.82,{'name': 'HTT'},,
3,0,2023-05-03T11:45:12.311825,#94988,205878ef-9bea-47c4-9cff-222a93839525,GET,/gene_names.json,json,GenesController,names,200,151.56,0.12,0.0,{},,
4,0,2023-05-03T11:45:15.174727,#94988,15d9ec98-d825-49ef-ad58-0fd398ea6839,GET,/drug_names.json,json,DrugsController,names,200,54.78,0.1,0.0,{},,


### Clean Data

We are interested in the api/v2/interactions queries

In [3]:
df["path"].value_counts()[0:10]

path
/api/v2/interactions.json         25280
/                                 12654
/gene_names.json                   7051
/interaction_search_results        5748
/search_interactions               5241
/api/v2/interaction_types.json     3202
/drug_names.json                   2070
/search_categories                 1523
/downloads                         1064
/categories_search_results         1048
Name: count, dtype: int64

In [4]:
data = (
    df[df["path"] == "/api/v2/interactions.json"]
    .drop(labels="Unnamed: 0", axis=1)
    .reset_index(drop=True)
)
data[0:5]

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error
0,2023-05-03T11:59:21.273789,#95027,5a1bd777-14c2-44b1-87b8-4bcec01433c4,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,46.52,7.66,21.43,{'genes': 'ATN1'},,
1,2023-05-03T12:15:13.185862,#95027,66a47e2f-c94d-451c-8cdb-c1a29617b3cb,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,993.19,917.98,61.5,{'drugs': 'Scriptaid'},,
2,2023-05-03T12:25:02.801847,#95027,9f3932bb-5611-4317-ab44-fab33992e283,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,37.76,3.25,23.25,{'genes': 'GPR26'},,
3,2023-05-03T12:29:09.147971,#95027,3a24f3df-c114-4739-8bdc-f89203ec31a0,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,35.21,0.52,19.2,{'drugs': 'Isoprenol'},,
4,2023-05-03T12:29:51.541701,#95027,e88c50f0-2b4f-4c7f-ab0d-075095bce6c2,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,34.51,3.2,20.37,{'genes': 'IL10RA'},,


We want to pull out the genes from the params of the query data

In [5]:
def get_search_type(record):
    try:
        record = record.replace("'", '"')
        record = json.loads(record)
        keys = list(record.keys())
        if len(keys) > 0:
            return keys[0]
        else:
            return "KeyLength"
    except json.JSONDecodeError:
        return "JSONDecode"

In [6]:
data["params"][0]
test = data["params"][0].replace("'", '"')
test = json.loads(test)
list(test.keys())

['genes']

In [7]:
data["type"] = None
data["type"] = data["params"].apply(get_search_type)

Distribution of Types of Interaction Query sent (drugs vs genes)

In [8]:
data["type"].value_counts()

type
genes         23070
drugs          1647
KeyLength       543
JSONDecode       20
Name: count, dtype: int64

In [9]:
data = data[data["type"] == "genes"].reset_index(drop=True)
data[0:5]

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error,type
0,2023-05-03T11:59:21.273789,#95027,5a1bd777-14c2-44b1-87b8-4bcec01433c4,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,46.52,7.66,21.43,{'genes': 'ATN1'},,,genes
1,2023-05-03T12:25:02.801847,#95027,9f3932bb-5611-4317-ab44-fab33992e283,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,37.76,3.25,23.25,{'genes': 'GPR26'},,,genes
2,2023-05-03T12:29:51.541701,#95027,e88c50f0-2b4f-4c7f-ab0d-075095bce6c2,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,34.51,3.2,20.37,{'genes': 'IL10RA'},,,genes
3,2023-05-03T12:29:57.144888,#95027,26747924-e565-43ac-b36e-9d802d4ee5c8,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,33.73,3.04,20.15,{'genes': 'VSTM2A'},,,genes
4,2023-05-03T12:30:01.291949,#95027,8c9d7628-ce57-4c3d-b74e-a7acd042cb6b,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,189.23,115.09,61.95,{'genes': 'SH2B3'},,,genes


In [10]:
def get_genes(record):
    try:
        record = record.replace("'", '"')
        record = json.loads(record)
        if record["genes"]:
            genes = record["genes"].split(",")
            return genes
        else:
            return "KeyLength"
    except:  
        pass

In [11]:
data["genes"] = None
data["genes"] = data["params"].apply(get_genes)
data[0:5]

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error,type,genes
0,2023-05-03T11:59:21.273789,#95027,5a1bd777-14c2-44b1-87b8-4bcec01433c4,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,46.52,7.66,21.43,{'genes': 'ATN1'},,,genes,[ATN1]
1,2023-05-03T12:25:02.801847,#95027,9f3932bb-5611-4317-ab44-fab33992e283,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,37.76,3.25,23.25,{'genes': 'GPR26'},,,genes,[GPR26]
2,2023-05-03T12:29:51.541701,#95027,e88c50f0-2b4f-4c7f-ab0d-075095bce6c2,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,34.51,3.2,20.37,{'genes': 'IL10RA'},,,genes,[IL10RA]
3,2023-05-03T12:29:57.144888,#95027,26747924-e565-43ac-b36e-9d802d4ee5c8,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,33.73,3.04,20.15,{'genes': 'VSTM2A'},,,genes,[VSTM2A]
4,2023-05-03T12:30:01.291949,#95027,8c9d7628-ce57-4c3d-b74e-a7acd042cb6b,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,189.23,115.09,61.95,{'genes': 'SH2B3'},,,genes,[SH2B3]


### How many terms idenitfied as gene symbols were queried across all queries? (queries may have more than one gene term)

In [12]:
def flatten_comprehension(matrix):
    return [item for row in matrix for item in row]

In [13]:
queried_genes_list = data["genes"].tolist()

In [14]:
queried_genes_list = flatten_comprehension(queried_genes_list)

In [15]:
num_queried_gene_terms = len(queried_genes_list)
num_queried_gene_terms

188832

### How many interaction queries are requested using genes?

In [16]:
num_interaction_queries_using_genes = len(data["genes"])
num_interaction_queries_using_genes

23070

### How many unique gene symbols were queried between the 23,070 queries and 188,832 total symbols?

In [17]:
data["genes"] = data["genes"].apply(set)
data["genes"]

0                                         {ATN1}
1                                        {GPR26}
2                                       {IL10RA}
3                                       {VSTM2A}
4                                        {SH2B3}
                          ...                   
23065       {QSOX1, RBMY1C, PVR, CLIP1, RN7SL5P}
23066    {MFSD2A, GINM1, OSR2, SNORD15B, ZNF341}
23067       {MKRN4P, POU4F1, SLPI, VASP, TFAP2B}
23068       {HSD3B2, PTHLH, MTCP1, IDDM9, ITGAM}
23069           {HPDL, SRRM4, JAM3, MOB2, SLIRP}
Name: genes, Length: 23070, dtype: object

In [18]:
queried_gene_symbols_set = data["genes"].agg(lambda x: set.union(*x))
num_unique_gene_symbols_queried = len(queried_gene_symbols_set)
num_unique_gene_symbols_queried

43400

Count each queried gene

In [19]:
all_genes = [gene for sublist in data["genes"] for gene in sublist]
gene_counts = Counter(all_genes)
gene_counts

Counter({'XYZA': 1549,
         'TNF': 1001,
         'AP1': 715,
         'AP2': 576,
         'IL6': 429,
         'IL8': 417,
         'BRAF': 305,
         'XYZB': 278,
         'n': 158,
         'KRAS': 148,
         'L': 144,
         'e': 140,
         't': 140,
         'y': 140,
         'h': 140,
         'g': 140,
         'K': 140,
         'NME1-NME2': 73,
         'COL1A1': 71,
         'UCHL1': 62,
         'NPIPA5': 58,
         'DSG1': 58,
         'MMP2': 57,
         'BASP1': 57,
         'FBN1': 55,
         'KRT14': 55,
         'MARCKS': 54,
         'CAV1': 54,
         'MAP2': 53,
         'DSP': 51,
         'COL6A1': 50,
         'TNS3': 50,
         'CXCL8': 49,
         'LRP1': 49,
         'COL3A1': 49,
         'KIF1A': 49,
         'RAP1GAP2': 49,
         'FN1': 48,
         'C1QA': 48,
         'RUNX1': 48,
         'PTPRC': 48,
         'COL4A1': 48,
         'NEDD4': 46,
         'FLT3': 46,
         'PROSER2': 46,
         'MAP1A': 46,
         'PPB

In [20]:
gene_counts["KRAS"]

148

### Load Ambiguous Gene Symbol Set

In [21]:
with open("../output/ambiguous_symbol_set.txt", "r") as file:
    # read each line, strip newline characters, and convert to a set
    ambiguous_symbol_set = set(line.strip() for line in file)
ambiguous_symbol_set

{'SRC2',
 'RP5',
 'P120',
 'PSD',
 'ANCO1',
 'VPS24',
 'MEF',
 'BNSP',
 'SPATA31D3',
 'MPD6',
 'BEX2',
 'STA',
 'H2A.BBD',
 'THG1',
 'PHR1',
 'MRPL32',
 'CHC',
 'ZNF468',
 'PGD',
 'H2AC18',
 'POMP',
 'HOKPP',
 'RHE',
 'RDC1',
 'NR1',
 'VAT1',
 'HSP70-2',
 'PRR20C',
 'PARS',
 'GPD',
 'LINC00443',
 'LIN-10',
 'S4',
 'RNAH',
 'FCGR2C',
 'R51H3',
 'NAT3',
 'DAND1',
 'H4C6',
 'COM1',
 'DLEU2',
 'SPANXC',
 'EDA3',
 'TBC1D3A',
 'T6BP',
 'OR11H12',
 'DA4',
 'AAVS1',
 'ANF',
 'HCR',
 'LINC00086',
 'SAN',
 'HYDINP1',
 'FAME',
 'FHOD3',
 'PRM1',
 'USP12P3',
 'MYD88-4',
 'PABPL1',
 'EWSR2',
 'TRA',
 'OSCP',
 'CHIF',
 'L2',
 'RAR2',
 'GAT2',
 'GTF2A1L',
 'PRG5',
 'G18',
 'TAX1BP2',
 'CLCP1',
 'FBS',
 'HMS',
 'DESI1',
 'CT134',
 'FLJ22167',
 'ACP1',
 'TRX-CAT1-6',
 'HCA2',
 'SPEN',
 'ROC2',
 'OP-2',
 'HOX3',
 'ILT5',
 'BRWD1-IT2',
 'H4C8',
 'ODZ3',
 'PEO',
 'TXN',
 'EEF1AKMT4',
 'SLX9',
 'GGF2',
 'BRWD1-AS2',
 'H2AC8',
 'MLM',
 'LOR',
 'SCG2',
 'USP12-DT',
 'TNX',
 'B61',
 '9G8',
 'GASP-1',
 'EIF2AK

In [22]:
len(ambiguous_symbol_set)

5191

### How many of the interaction queries using genes included an ambiguous symbol?

In [23]:
data["ambiguous_gene_in_query"] = data.genes.apply(lambda x: x & ambiguous_symbol_set)
data_df = data[data.ambiguous_gene_in_query != set()]
data_df

Unnamed: 0,time,request_num,uuid,method,path,format,controller,action,status,duration,view,db,params,location,error,type,genes,ambiguous_gene_in_query
17,2023-04-03T21:41:46.386524,#112476,58e2f514-3c72-470c-9249-323d8c9bb081,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,9202.31,8841.23,308.21,"{'genes': 'TNF,AP1,AP2,XYZA', 'interaction_sou...",,,genes,"{AP1, XYZA, TNF, AP2}",{AP2}
18,2023-04-03T21:41:56.047208,#112476,fc4d3f38-9a45-4a1e-8572-ca7b789b34a4,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,9223.01,8844.45,312.74,"{'genes': 'TNF,AP1,AP2,XYZA', 'interaction_sou...",,,genes,"{AP1, XYZA, TNF, AP2}",{AP2}
19,2023-04-03T21:41:57.140781,#112476,88cb7915-5e7b-4807-9dfc-bdc77e8574f4,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,213.81,1.15,135.14,"{'genes': 'TNF,AP1,AP2,XYZA', 'interaction_sou...",,,genes,"{AP1, XYZA, TNF, AP2}",{AP2}
20,2023-04-03T21:42:17.482316,#112476,44602b18-ff6b-4c05-934e-71914d0080e6,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,9401.45,9043.04,305.14,"{'genes': 'TNF,AP1,AP2,XYZA', 'interaction_sou...",,,genes,"{AP1, XYZA, TNF, AP2}",{AP2}
21,2023-04-03T21:42:27.478590,#112476,6269711b-ec6b-4067-9d03-904711e64451,POST,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,9494.68,9129.97,312.85,"{'genes': 'TNF,AP1,AP2,XYZA', 'interaction_sou...",,,genes,"{AP1, XYZA, TNF, AP2}",{AP2}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23036,2023-04-16T01:17:14.363678,#37959,c83f3204-ff8a-46b9-976d-dfb0db6b9e62,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,516.33,433.78,70.57,"{'genes': 'RPL41P5,RPL13,PTPRR,PEBP1,MYL5'}",,,genes,"{RPL41P5, MYL5, PTPRR, RPL13, PEBP1}",{RPL13}
23056,2023-04-16T01:17:19.774606,#37959,924bbe55-47a8-4b48-b0e5-8d06be1d4c87,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,50.15,11.97,26.96,"{'genes': 'TUBGCP2,RBM7,KMT2B,MRPL33,LDB1'}",,,genes,"{TUBGCP2, KMT2B, MRPL33, RBM7, LDB1}",{LDB1}
23064,2023-04-16T01:17:35.989923,#37959,96b8aa02-0862-4b29-bba9-40934aa1caaa,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,15158.28,14676.73,439.61,"{'genes': 'POLH,NPM1P2,MT-TL2,MSK9,MPI'}",,,genes,"{MSK9, MPI, MT-TL2, NPM1P2, POLH}","{MPI, POLH}"
23065,2023-04-16T01:17:36.045976,#37959,385d79d5-7991-4b4b-ad6e-aeb0886c77a0,GET,/api/v2/interactions.json,json,ServicesV2Controller,interactions,200,53.55,13.20,28.44,"{'genes': 'CLIP1,RN7SL5P,RBMY1C,PVR,QSOX1'}",,,genes,"{QSOX1, RBMY1C, PVR, CLIP1, RN7SL5P}",{RBMY1C}


In [24]:
num_interaction_queries_with_ambig_symbols = len(data_df)
num_interaction_queries_with_ambig_symbols

4089

### Of the total gene terms searched, how many were ambiguous gene symbols?

In [25]:
total_sum = 0

for key, value in gene_counts.items():
    if key in ambiguous_symbol_set:
        total_sum += value

In [26]:
num_queried_gene_terms_ambig = total_sum
num_queried_gene_terms_ambig

8694

### Of the unique gene symbols queried, how many are ambiguous?

In [27]:
filtered_dict = {key: value for key, value in gene_counts.items() if key in ambiguous_symbol_set}

ambiguous_symbol_queries_df = pd.DataFrame(
    list(filtered_dict.items()), columns=["Ambiguous Symbol", "# of Queries"]
)

ambiguous_symbol_queries_df = ambiguous_symbol_queries_df.sort_values(by="# of Queries", ascending=False)

ambiguous_symbol_queries_df.to_csv("../output/ambiguous_symbol_queries.csv", index=False)

In [28]:
num_unique_gene_symbols_queried_ambig = len(ambiguous_symbol_queries_df)
num_unique_gene_symbols_queried_ambig

1469

### Summary

In [29]:
title = [
    "# Interaction Queries using Genes",
    "# Terms as Genes for Queries",
    "# Unique Gene Symbols Queried",
]
summary_data = {
    "": [num_interaction_queries_using_genes, num_queried_gene_terms, num_unique_gene_symbols_queried],
    "# Ambiguous Symbols": [f"{num_interaction_queries_with_ambig_symbols} ({((num_interaction_queries_with_ambig_symbols/num_interaction_queries_using_genes)*100):.2f}%)", f"{num_queried_gene_terms_ambig} ({((num_queried_gene_terms_ambig/num_queried_gene_terms)*100):.2f}%)",f"{num_unique_gene_symbols_queried_ambig} ({((num_unique_gene_symbols_queried_ambig/num_unique_gene_symbols_queried)*100):.2f}%)"],
}
summary_df = pd.DataFrame(summary_data, index=title)
summary_df

Unnamed: 0,Unnamed: 1,# Ambiguous Symbols
# Interaction Queries using Genes,23070,4089 (17.72%)
# Terms as Genes for Queries,188832,8694 (4.60%)
# Unique Gene Symbols Queried,43400,1469 (3.38%)
