In [None]:
import pandas as pd
import json
import plotly.express as px
import plotly
import ast
from collections import Counter
import numpy as np

## Analysis of Gene Terms in DGIdb from 2023
Reanalysis of the DGIdb user search logs from Q1/Q2 2023. The goal is to extract the gene terms most frequently searched for each search type and hand them off to Anastasia, who will then determine how often searches involve gene symbol collisions.

### Load Data

In [None]:
# Read the exported log spreadsheet and preview its first five rows
df = pd.read_excel("log_data.xlsx")
df.head()

### Clean Data

In [None]:
# The analysis targets the api/v2/interactions queries; list the ten most
# frequent request paths (value_counts sorts by descending frequency).
df["path"].value_counts().head(10)

In [None]:
# Keep only the interactions-endpoint requests, discard the "Unnamed: 0"
# column, and renumber the remaining rows from zero.
data = (
    df.loc[df["path"].eq("/api/v2/interactions.json")]
    .drop(columns="Unnamed: 0")
    .reset_index(drop=True)
)
data.head()

In [None]:
# Classify each query by the first key found in its params string
def get_search_type(record):
    """Return the first key of a query's ``params`` string.

    ``record`` is expected to be the text form of a dict. It is parsed in two
    steps: first as JSON after swapping single quotes for double quotes, and,
    if that fails, as a Python literal. The second step rescues records the
    quote swap corrupts (values containing an apostrophe) or that JSON cannot
    express (``None``, ``True``, ``False``).

    Returns:
        The first key of the parsed dict, ``"KeyLength"`` when the dict is
        empty, or ``"JSONDecode"`` when ``record`` is not a string, cannot be
        parsed, or does not parse to a dict.
    """
    # Missing params (e.g. NaN) have no .replace(); treat them as unparseable.
    if not isinstance(record, str):
        return "JSONDecode"

    try:
        parsed = json.loads(record.replace("'", '"'))
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(record)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return "JSONDecode"

    # A list or scalar parses fine but has no keys to classify by.
    if not isinstance(parsed, dict):
        return "JSONDecode"

    keys = list(parsed.keys())
    return keys[0] if keys else "KeyLength"

In [None]:
# Spot-check the quote-swap + JSON parse on the first record's params
test = json.loads(data["params"][0].replace("'", '"'))
[*test.keys()]

In [None]:
# Label every request with the search type derived from its params
data["type"] = data["params"].map(get_search_type)

In [None]:
# How the interaction queries split across search types (drugs vs genes)
data.loc[:, "type"].value_counts()

In [None]:
# Restrict to gene-based queries and renumber the rows from zero
data = data.loc[data["type"].eq("genes")].reset_index(drop=True)
data.head()

In [None]:
def get_genes(record):
    """Return the gene terms listed under ``"genes"`` in a params string.

    ``record`` is expected to be the text form of a dict. It is parsed as JSON
    after swapping single quotes for double quotes, falling back to a Python
    literal parse when that fails. A string value is split on commas; a list
    or tuple value is used element by element. Surrounding whitespace is
    stripped and empty terms are dropped.

    Returns:
        A list of gene terms. The list is empty when ``record`` is not a
        string, cannot be parsed, is not a dict, or has no usable ``"genes"``
        value. A list is always returned so callers can flatten the results
        or convert them to sets without special-casing ``None`` or a
        sentinel string.
    """
    if not isinstance(record, str):
        return []

    try:
        parsed = json.loads(record.replace("'", '"'))
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(record)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []

    if not isinstance(parsed, dict):
        return []

    raw_genes = parsed.get("genes")
    if isinstance(raw_genes, str):
        terms = raw_genes.split(",")
    elif isinstance(raw_genes, (list, tuple)):
        terms = [str(term) for term in raw_genes]
    else:
        return []

    # "KRAS, BRAF," must yield ["KRAS", "BRAF"], not [" BRAF", ""] entries.
    return [term.strip() for term in terms if term.strip()]

In [None]:
# Attach the gene terms parsed from each query's params as a new column
data["genes"] = data["params"].map(get_genes)
data.head()

### How many symbols were queried across all queries? (queries may have more than one gene)

In [None]:
def flatten_comprehension(matrix):
    """Concatenate the items of every row of ``matrix`` into one flat list.

    Rows are visited in order and each row is iterated once, so item order
    is preserved. Only one level of nesting is removed.
    """
    flattened = []
    for row in matrix:
        flattened.extend(row)
    return flattened

In [None]:
# Pull the per-query gene collections out of the frame as a plain list
queried_genes_list = list(data["genes"])

In [None]:
# Flatten straight from the column rather than from queried_genes_list itself,
# so re-running this cell cannot flatten gene strings into single characters.
queried_genes_list = flatten_comprehension(data["genes"].tolist())

In [None]:
# Total number of gene terms collected across all queries
total_terms_queried = len(queried_genes_list)
print(total_terms_queried)

### How many interaction queries are requested using genes?

In [None]:
# The frame holds one row per gene-based query, so the column length is the count
data["genes"].shape[0]

### How many unique gene symbols were queried across the 23,070 queries and 188,832 total symbols?

In [None]:
# Turn each query's gene collection into a set (repeats within a query collapse)
data["genes"] = data["genes"].map(set)
data["genes"]

In [None]:
# Union every query's genes into the set of distinct symbols. Starting from
# an empty set means a frame with no rows yields an empty set, where
# set.union(*rows) would raise for lack of arguments, and it avoids relying
# on how Series.agg dispatches a callable.
queried_gene_symbols_set = set().union(*data["genes"])
print(len(queried_gene_symbols_set))

In [None]:
# Count each queried gene
all_genes = [gene for sublist in data["genes"] for gene in sublist]
gene_counts = Counter(all_genes)
gene_counts

In [None]:
# Spot check: the count recorded for KRAS (0 when the symbol never appears)
gene_counts.get("KRAS", 0)

### Load Ambiguous Gene Symbol Set

In [None]:
with open("ambiguous_symbol_set.txt", "r") as file:
    # One symbol per line, with surrounding whitespace removed. Blank lines
    # are skipped so an empty string never becomes a "symbol" that could
    # match an empty query term.
    ambiguous_symbol_set = {
        symbol for symbol in (line.strip() for line in file) if symbol
    }
ambiguous_symbol_set

In [None]:
# Number of symbols in the ambiguous reference set
n_ambiguous_symbols = len(ambiguous_symbol_set)
n_ambiguous_symbols

### How many of the 23,070 queries included an ambiguous symbol?

In [None]:
# For every query, keep the subset of its genes that is in the ambiguous set
data["ambiguous_gene_in_query"] = data["genes"].map(
    lambda query_genes: query_genes & ambiguous_symbol_set
)
# Retain only the queries whose ambiguous subset is non-empty
data_df = data[data["ambiguous_gene_in_query"].map(len) > 0]
data_df

### Of the total 188,832 gene symbols searched, how many times were ambiguous gene symbols searched?

In [None]:
def sum_values_for_terms(data_dict, term_set):
    """Add up the values of ``data_dict`` whose keys belong to ``term_set``.

    Keys that are not in ``term_set`` are ignored; the result is 0 when no
    key matches.
    """
    return sum(count for term, count in data_dict.items() if term in term_set)

In [None]:
# Call the function and print the result
result = sum_values_for_terms(gene_counts, ambiguous_symbol_set)
print(result)

### Of the 43,400 unique gene symbols queried, how many are ambiguous?

In [None]:
def create_df_for_terms(data_dict, term_set, csv_filename):
    """Tabulate the entries of ``data_dict`` whose keys are in ``term_set``.

    Builds a two-column DataFrame ("Ambiguous Symbol", "# of Queries") from
    the matching key/value pairs, orders it by "# of Queries" from highest to
    lowest, writes it to ``csv_filename`` without the index, and returns it.
    """
    matching_rows = [
        (term, count) for term, count in data_dict.items() if term in term_set
    ]
    table = pd.DataFrame(
        matching_rows, columns=["Ambiguous Symbol", "# of Queries"]
    ).sort_values(by="# of Queries", ascending=False)
    table.to_csv(csv_filename, index=False)
    return table

In [None]:
# Call the function and get the DataFrame
df_result = create_df_for_terms(
    gene_counts, ambiguous_symbol_set, "ambiguous_symbol_queries.csv"
)
print(df_result)

### Summary

In [None]:
def _count_with_share(count, total):
    """Format ``count`` with thousands separators plus its share of ``total``."""
    if not total:
        # No denominator: report the count alone rather than divide by zero.
        return "{:,}".format(count)
    return "{:,} ({:.0%})".format(count, count / total)


# Derive every figure from the objects computed earlier in the notebook rather
# than pasting numbers from a previous run, so the table cannot drift from
# the data.
# NOTE(review): assumes data, queried_genes_list, queried_gene_symbols_set,
# data_df, result and df_result still hold the values from their own
# sections -- confirm nothing rebinds them in between.
title = [
    "# Interaction Queries using Genes",
    "# Terms as Genes for Queries",
    "# Unique Gene Symbols Queried",
]
summary_totals = [len(data), len(queried_genes_list), len(queried_gene_symbols_set)]
summary_ambiguous = [len(data_df), result, len(df_result)]
summary_data = {
    "": ["{:,}".format(total) for total in summary_totals],
    "# Ambiguous Symbols": [
        _count_with_share(count, total)
        for count, total in zip(summary_ambiguous, summary_totals)
    ],
}
summary_df = pd.DataFrame(summary_data, index=title)
summary_df