In [8]:
import requests
import os  # directory creation and path handling
from bs4 import BeautifulSoup

def download_files_from_hpa(url, max_size_gb=1, subfolder="downloaded_hpa_files", timeout=60):
    """Download ``proteinatlas.tsv.zip`` from a Human Protein Atlas download page.

    The page at ``url`` is fetched and scanned for ``<a href=...>`` links that
    end in ``.zip``. Only the link whose file name is ``proteinatlas.tsv.zip``
    is downloaded, and only when it is not already present in ``subfolder``
    and does not exceed ``max_size_gb``.

    Args:
        url: Address of the HTML page that lists the downloadable files.
        max_size_gb: Largest accepted file size in gigabytes (1 GB = 1e9 bytes).
        subfolder: Directory the file is saved into; created when missing.
        timeout: Seconds to wait for the server on each HTTP request.

    Raises:
        requests.HTTPError: If the listing page or the file download returns
            an HTTP error status.
    """
    # Function-scope import so the cell does not depend on other cells.
    from urllib.parse import urljoin

    os.makedirs(subfolder, exist_ok=True)

    # Convert the max size from GB to bytes
    max_size_bytes = max_size_gb * 1e9

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we got a successful response

    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a', href=True):
        file_url = link['href']
        if not file_url.endswith('.zip'):
            continue

        # urljoin copes with root-relative, page-relative and absolute hrefs,
        # whereas plain string concatenation only works for root-relative ones.
        full_url = urljoin(url, file_url)

        filename = file_url.split('/')[-1]
        save_path = os.path.join(subfolder, filename)

        if os.path.exists(save_path):
            print(f"{filename} already exists. Skipping download.")
            continue

        # Only the main atlas table is of interest; other archives are ignored.
        if filename != "proteinatlas.tsv.zip":
            continue

        # Ask for the size first. requests.head() does not follow redirects
        # unless told to, and some servers reject HEAD altogether; in that case
        # the size is unknown (0) and the limit is enforced while streaming.
        head_response = requests.head(full_url, allow_redirects=True, timeout=timeout)
        declared_size = int(head_response.headers.get('Content-Length', 0)) if head_response.ok else 0

        if declared_size > max_size_bytes:
            print(f"Skipping {filename} as it exceeds the size limit.")
            continue

        print(f"Downloading {filename}...")

        # Stream into a temporary ".part" file so that an interrupted download
        # never leaves a truncated archive that later runs would treat as
        # "already exists".
        partial_path = save_path + ".part"
        bytes_written = 0
        exceeded_limit = False
        finished = False
        try:
            with requests.get(full_url, stream=True, timeout=timeout) as file_response:
                # Without this check an HTML error page would be saved as the zip.
                file_response.raise_for_status()
                with open(partial_path, 'wb') as file:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        bytes_written += len(chunk)
                        if bytes_written > max_size_bytes:
                            exceeded_limit = True
                            break
                        file.write(chunk)
            finished = not exceeded_limit
        finally:
            if not finished and os.path.exists(partial_path):
                os.remove(partial_path)

        if exceeded_limit:
            print(f"Skipping {filename} as it exceeds the size limit.")
            continue

        os.replace(partial_path, save_path)
        print(f"{filename} downloaded!")

# Example usage: scan the HPA download page and fetch the atlas archive
download_files_from_hpa(url="https://www.proteinatlas.org/about/download")


proteinatlas.tsv.zip already exists. Skipping download.


In [9]:
import zipfile

# Function to unzip a file
def unzip_file(zip_file_path, output_folder_path):
    """Extract every member of a zip archive into a folder and report the target.

    Args:
        zip_file_path: Path of the archive to read.
        output_folder_path: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_file_path, mode='r')
    try:
        archive.extractall(path=output_folder_path)
        print(f"Unzipped files to {output_folder_path}")
    finally:
        # Mirrors the context-manager form: the archive is always closed.
        archive.close()

# Extract the downloaded atlas archive
unzip_file(
    zip_file_path='downloaded_hpa_files/proteinatlas.tsv.zip',
    output_folder_path='unzipped_folder',
)

# Preview the first 5 lines of the extracted .tsv file, shown comma-separated
try:
    with open('unzipped_folder/proteinatlas.tsv', 'r') as tsv_handle:
        # zip() against range(5) stops after five lines (or earlier at EOF)
        for line_number, raw_line in zip(range(5), tsv_handle):
            print(raw_line.strip().replace('\t', ','))
except FileNotFoundError:
    print("The file 'proteinatlas.tsv' was not found in the 'unzipped_folder'.")


Unzipped files to unzipped_folder
Gene,"Gene synonym",Ensembl,"Gene description",Uniprot,Chromosome,Position,"Protein class","Biological process","Molecular function","Disease involvement",Evidence,"HPA evidence","UniProt evidence","NeXtProt evidence","RNA tissue specificity","RNA tissue distribution","RNA tissue specificity score","RNA tissue specific nTPM","RNA single cell type specificity","RNA single cell type distribution","RNA single cell type specificity score","RNA single cell type specific nTPM","RNA cancer specificity","RNA cancer distribution","RNA cancer specificity score","RNA cancer specific FPKM","RNA brain regional specificity","RNA brain regional distribution","RNA brain regional specificity score","RNA brain regional specific nTPM","RNA blood cell specificity","RNA blood cell distribution","RNA blood cell specificity score","RNA blood cell specific nTPM","RNA blood lineage specificity","RNA blood lineage distribution","RNA blood lineage specificity score","RNA blood l

In [10]:
import requests
import os

def download_file_from_github(url, save_path, folder_name="hECA", timeout=60):
    """Download a single file over HTTP and store it under ``folder_name``.

    Args:
        url: Direct ("raw") URL of the file to fetch.
        save_path: File name (optionally with sub-directories) relative to
            ``folder_name``.
        folder_name: Directory that receives the file; created when missing.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
    """
    # Define the full path to save the file
    full_save_path = os.path.join(folder_name, save_path)

    # Create the target directory, including any sub-directory carried by
    # save_path; "." covers the case of an empty directory component.
    os.makedirs(os.path.dirname(full_save_path) or ".", exist_ok=True)

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx; otherwise the error page would be written to
    # disk under the requested file name.
    response.raise_for_status()

    with open(full_save_path, 'wb') as file:
        file.write(response.content)

    print(f"File downloaded and saved as {full_save_path}")

# Where the hECA marker gene annotation workbook lives on GitHub
heca_url = "https://github.com/XuegongLab/hECA/raw/main/UHAF/uHAF%20marker%20reference.xlsx"

# Local file name for the downloaded workbook
heca_save_path = "uHAF_marker_reference.xlsx"

# Fetch the workbook into the default folder
download_file_from_github(url=heca_url, save_path=heca_save_path)


File downloaded and saved as hECA/uHAF_marker_reference.xlsx


In [1]:
import requests
import zipfile
import os
from io import BytesIO

# Define the URL where the HPA marker reference file is hosted
# (a supplementary-file archive on science.org for DOI 10.1126/sciadv.abh2169)
hpa_zip_url = "https://www.science.org/doi/suppl/10.1126/sciadv.abh2169/suppl_file/sciadv.abh2169_data_s1_to_s4.zip"

# Define the path for the downloaded zip file and the extraction directory
# (both are relative paths, so they resolve against the current working directory)
downloaded_zip_path = 'sciadv_abh2169_data_s1_to_s4.zip'
extraction_directory = 'HPA Annotation'

# Function to download and unzip the file
def download_and_unzip(url, zip_path, extract_to):
    """Fetch a zip archive over HTTP, store it on disk, then extract it.

    Args:
        url: Address of the archive.
        zip_path: Where the downloaded archive is written.
        extract_to: Directory that receives the archive members.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Retrieve the archive and stop on any HTTP error
    reply = requests.get(url)
    reply.raise_for_status()

    # Persist the raw bytes
    with open(zip_path, 'wb') as zip_out:
        zip_out.write(reply.content)

    # Unpack everything into the target directory
    archive = zipfile.ZipFile(zip_path, 'r')
    try:
        archive.extractall(extract_to)
    finally:
        archive.close()

# Download the supplementary archive and unpack it
download_and_unzip(
    url=hpa_zip_url,
    zip_path=downloaded_zip_path,
    extract_to=extraction_directory,
)

# Only sciadv.abh2169_Data_S2.xlsx is processed from here on
hpa_file_path = os.path.join(extraction_directory, 'sciadv.abh2169_Data_S2.xlsx')

HTTPError: 403 Client Error: Forbidden for url: https://www.science.org/doi/suppl/10.1126/sciadv.abh2169/suppl_file/sciadv.abh2169_data_s1_to_s4.zip

In [2]:
%pip install pandas
%pip install openpyxl
import pandas as pd
import json
import os

def normalize_column_name(df, column_name):
    """Find the column of ``df`` whose label equals ``column_name`` ignoring case.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).
        column_name: Label to look for.

    Returns:
        The first matching label exactly as it appears in ``df.columns``, or
        ``None`` when no label matches.
    """
    wanted = str(column_name).lower()
    for col in df.columns:
        # Labels read from spreadsheets may be ints or dates; str() keeps the
        # comparison from raising on those.
        if str(col).lower() == wanted:
            return col
    return None

def convert_excel_sheets_to_json(file_path, output_directory):
    """Write one ``<sheet>_marker_reference.json`` file per sheet of a workbook.

    Each sheet must have a cell-type column and a marker column (``cell_type``
    and ``marker``, matched case-insensitively); sheets lacking either are
    skipped. Every JSON file maps a cell type to the list of its markers, where
    the marker cell is read as a comma-separated list. Markers of repeated
    cell types are concatenated in row order.

    Args:
        file_path: Path of the Excel workbook.
        output_directory: Directory receiving the JSON files; created when
            missing.
    """
    os.makedirs(output_directory, exist_ok=True)

    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name)

            # Debug: print the column names of the current Excel sheet
            print(f"Processing sheet: '{sheet_name}' with columns: {df.columns.tolist()}")

            # Resolve the actual column labels regardless of case
            cell_type_column = normalize_column_name(df, 'cell_type')
            marker_column = normalize_column_name(df, 'marker')

            # Compare against None so that a falsy-but-valid label is not rejected
            if cell_type_column is None or marker_column is None:
                print(f"Skipping sheet '{sheet_name}' as it does not contain required columns.")
                continue

            cell_type_to_marker = {}

            for cell_type, raw_markers in zip(df[cell_type_column], df[marker_column]):
                # Empty cells come back as NaN, which has no .split(); such rows
                # carry no usable annotation and are skipped.
                if pd.isna(cell_type) or pd.isna(raw_markers):
                    continue

                # Tolerate "A, B" style lists: trim each marker and drop empties
                markers = [
                    marker.strip()
                    for marker in str(raw_markers).split(',')
                    if marker.strip()
                ]

                # str() guarantees a JSON-serialisable key
                cell_type_to_marker.setdefault(str(cell_type).strip(), []).extend(markers)

            json_file_path = os.path.join(output_directory, f"{sheet_name}_marker_reference.json")

            with open(json_file_path, 'w') as json_file:
                json.dump(cell_type_to_marker, json_file, indent=4)

            print(f"Sheet '{sheet_name}' converted to JSON and saved at: {json_file_path}")

# Workbook downloaded from the hECA repository
excel_file_path = 'hECA/uHAF_marker_reference.xlsx'

# Destination folder for the per-sheet JSON files
output_directory = 'hECA/json_files'

# Produce one JSON file per worksheet
convert_excel_sheets_to_json(
    file_path=excel_file_path,
    output_directory=output_directory,
)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Processing sheet: 'Brain' with columns: ['cell_type', 'marker']
Sheet 'Brain' converted to JSON and saved at: hECA/json_files/Brain_marker_reference.json
Processing sheet: 'Eye' with columns: ['cell_type', 'marker']
Sheet 'Eye' converted to JSON and saved at: hECA/json_files/Eye_marker_reference.json
Processing sheet: 'Spinal cord' with columns: ['cell_type', 

In [None]:
# NOTE: this whole cell is disabled. Everything between the quote markers below
# is a single string literal, so none of the code inside it is executed.
# NOTE(review): the text would not parse if re-enabled as-is, because
# `map_cell_types_to_genes` is indented one space deeper than its sibling methods.
# NOTE(review): the text embeds literal `access_id` / `access_key` values for the
# hECA endpoint; consider loading them from the environment and confirm whether
# they should be rotated.
# NOTE(review): the class-based `GeneExpressionAtlas` cell appears to supersede
# the extraction logic here; confirm before deleting this cell.
''''
# Import required libraries
%pip install pandas
import pandas as pd
import json
import os
from collections import defaultdict

class CellTypeAnnotation:
    def __init__(self, marker_data_path):
        self.marker_data = self.load_marker_data(marker_data_path)
    
    def load_marker_data(self, path):
        # This will be implemented in the subclass
        raise NotImplementedError
    
     def map_cell_types_to_genes(self, cell_type):
        # Return marker genes for a given cell type
        return self.marker_data.get(cell_type, [])

# Define a function to extract cell types from specific columns
def extract_cell_types(df, columns_to_check, use_hECA=False):
    cell_types_to_ensembl = defaultdict(set)

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        ensembl_id = row['Ensembl']
        for col in columns_to_check:
            cell_type_data = row[col]
            if pd.notna(cell_type_data):
                # Split the cell_type_data by ';' to get each cell type and its nTPM
                for item in cell_type_data.split(';'):
                    # Extract the cell type name (ignoring nTPM value)
                    cell_type = item.split(':')[0].strip()
                    # Add the Ensembl ID to the set associated with this cell type
                    cell_types_to_ensembl[cell_type].add(ensembl_id)
    if use_hECA:
        # Import required libraries for cell-type annotation with hECA
        %pip install ECAUGT
        import sys
        import ECAUGT
        import time
        import multiprocessing
        import numpy as np

        # Set up the hECA database connection parameters
        endpoint = "https://HCAd-Datasets.cn-beijing.ots.aliyuncs.com"
        access_id = "LTAI5t7t216W9amUD1crMVosD"
        access_key = "ZJPlUbpLCij5qUPjbsU8GnQHm97IxJ"
        instance_name = "HCAd-Datasets"
        table_name = "HCA_d"

        # Initialize the ECAUGT client with the specified parameters
        ECAUGT.Setup_Client(endpoint, access_id, access_key, instance_name, table_name)

        # Check and build index if necessary
        ECAUGT.build_index()

        # Return a message indicating a successful connection
        return "Connected to hECA database successfully"


    # Convert sets to lists for JSON serialization
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        cell_types_to_ensembl[cell_type] = list(ensembl_ids)
        
    return cell_types_to_ensembl

# Specify columns to check for cell types
columns_to_check = [
    "RNA tissue specific nTPM",
    "RNA single cell type specific nTPM",
    "RNA blood cell specific nTPM",
    "RNA blood lineage specific nTPM"
]

# Read the first 5 rows of proteinatlas.tsv into a DataFrame for demonstration
# Uncomment the line below when you want to read the entire file
# df = pd.read_csv("/mnt/data/proteinatlas.tsv", sep='\t', nrows=5)

# Read the uploaded file
df = pd.read_csv("unzipped_folder/proteinatlas.tsv", sep='\t') #For demonstration purposes, read the 5 first lines, add the parameter nrows=5)

# Extract cell types and associated Ensembl Gene IDs
cell_types_to_ensembl = extract_cell_types(df, columns_to_check)

# Directory where the aggregated results will be saved
output_directory = "output_files"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Write the results to a JSON file
json_file_path = 'output_files/cell_types_to_ensembl.json'
with open(json_file_path, 'w') as f:
    json.dump(cell_types_to_ensembl, f)

json_file_path
'''

Collecting pandas
  Downloading pandas-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting numpy<2,>=1.26.0 (from pandas)
  Downloading numpy-1.26.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[?25hDownloading numpy-1.26.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


'output_files/cell_types_to_ensembl.json'

In [2]:
# Import required libraries
%pip install pandas
import pandas as pd
import json
import os
from collections import defaultdict

class GeneExpressionAtlas:
    """Collect, per cell type, the Ensembl IDs annotated to it in a TSV atlas.

    The atlas file must contain an ``Ensembl`` column plus the columns named
    in ``columns_to_check``. Each of those cells is parsed as a ``;``-separated
    list of items, and the text before the first ``:`` of an item is taken as
    the cell-type name.
    """

    def __init__(self, data_path, columns_to_check):
        """Store the atlas location and the annotation columns to scan.

        Args:
            data_path: Path of the tab-separated atlas file.
            columns_to_check: Names of the columns holding the annotations.
        """
        self.data_path = data_path
        self.columns_to_check = columns_to_check
        # cell type -> set of Ensembl IDs; filled by extract_cell_types()
        self.cell_types_to_ensembl = defaultdict(set)

    def extract_cell_types(self):
        """Read the atlas and add its annotations to ``cell_types_to_ensembl``.

        Returns:
            The ``defaultdict`` mapping each cell-type name to a set of
            Ensembl IDs (the same object stored on the instance).
        """
        df = pd.read_csv(self.data_path, sep='\t')
        selected = df[['Ensembl', *self.columns_to_check]]

        # itertuples walks the rows in file order like iterrows, but without
        # building a Series per row.
        for ensembl_id, *annotations in selected.itertuples(index=False, name=None):
            if pd.isna(ensembl_id):
                # A row without a gene identifier cannot be attributed.
                continue
            for cell_type_data in annotations:
                if pd.isna(cell_type_data):
                    continue
                for item in cell_type_data.split(';'):
                    cell_type = item.split(':')[0].strip()
                    # A trailing ';' yields an empty item; do not record a
                    # nameless cell type for it.
                    if cell_type:
                        self.cell_types_to_ensembl[cell_type].add(ensembl_id)
        return self.cell_types_to_ensembl

    def to_json(self, output_path):
        """Write the mapping to ``output_path`` as JSON and return that path.

        The in-memory sets are left untouched (converting them in place would
        make a later ``extract_cell_types`` call fail on ``.add``). The ID
        lists are written sorted so the file is reproducible between runs.
        """
        serializable = {
            cell_type: sorted(ensembl_ids, key=str)
            for cell_type, ensembl_ids in self.cell_types_to_ensembl.items()
        }
        with open(output_path, 'w') as f:
            json.dump(serializable, f)
        return output_path

class HPA(GeneExpressionAtlas):
    """Atlas reader preconfigured with four "specific nTPM" annotation columns."""

    def __init__(self, data_path):
        """Set up the reader for the atlas file at ``data_path``."""
        # Columns scanned for cell-type annotations
        hpa_annotation_columns = [
            "RNA tissue specific nTPM",
            "RNA single cell type specific nTPM",
            "RNA blood cell specific nTPM",
            "RNA blood lineage specific nTPM",
        ]
        super().__init__(data_path=data_path, columns_to_check=hpa_annotation_columns)

class hECA(GeneExpressionAtlas):
    """Placeholder for an hECA-specific atlas reader; adds nothing to the base class yet."""

# Example usage for HPA
hpa_data_path = "unzipped_folder/proteinatlas.tsv"  # Replace with your actual path
hpa = HPA(data_path=hpa_data_path)

# Build the cell type -> Ensembl Gene ID mapping
hpa_cell_types_to_ensembl = hpa.extract_cell_types()

# Folder that collects the aggregated results
output_directory = "output_files"

# Make sure the folder is there before writing into it
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Serialise the mapping and report where it went
json_output_target = f'{output_directory}/cell_types_to_ensembl.json'
json_file_path = hpa.to_json(json_output_target)
print(f"Data written to {json_file_path}")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Data written to output_files/cell_types_to_ensembl.json


In [None]:
# Import necessary libraries
from collections import defaultdict
import json

# Function to load a genelist from a file
def load_genelist(file_path):
    """Read a gene list file containing one identifier per line.

    Surrounding whitespace and double quotes are removed from every line.
    Blank lines are skipped so they do not appear as empty-string "genes".

    Args:
        file_path: Path of the text file.

    Returns:
        List of identifiers in file order.
    """
    with open(file_path, 'r') as f:
        cleaned = (line.strip().strip('"').strip() for line in f)
        return [gene for gene in cleaned if gene]

# Load the cell_types_to_ensembl.json file
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# JSON stores each cell type's genes as a list; build the sets once so every
# membership test below is O(1) instead of a scan of the list.
cell_type_gene_sets = {
    cell_type: set(ensembl_ids)
    for cell_type, ensembl_ids in cell_types_to_ensembl.items()
}

# Load all genelists using file paths for the gene lists
genelists_files = [f'genelists/Genelist{i}.txt' for i in range(1, 7)]

# Maps 'Genelist<i>' -> {cell type: number of genelist entries annotated to it}
cell_types_for_all_genelists = {}

# Loop through each genelist file
for i, genelist_file in enumerate(genelists_files, 1):
    # Load the current genelist
    genelist = load_genelist(genelist_file)

    # Counts for the current genelist; cell types without a hit get no entry
    cell_types_for_genelist = defaultdict(int)

    for cell_type, gene_set in cell_type_gene_sets.items():
        # Every genelist entry counts, so repeated IDs are counted repeatedly
        hits = sum(1 for ensembl_id in genelist if ensembl_id in gene_set)
        if hits:
            cell_types_for_genelist[cell_type] = hits

    # Store the results for the current genelist
    cell_types_for_all_genelists[f'Genelist{i}'] = cell_types_for_genelist

# Save the aggregated results to a JSON file
cell_types_for_all_genelists_file = 'output_files/cell_types_for_all_genelists.json'
with open(cell_types_for_all_genelists_file, 'w') as f:
    json.dump(cell_types_for_all_genelists, f)

cell_types_for_all_genelists_file

'output_files/cell_types_for_all_genelists.json'

In [None]:
import pandas as pd

# This cell serialises with json; import it here so the cell does not depend on
# an earlier cell having imported it.
import json

# Load the protein atlas data
protein_atlas_filepath = 'unzipped_folder/proteinatlas.tsv'
protein_atlas_df = pd.read_csv(protein_atlas_filepath, sep='\t')

# Extract the distinct Ensembl IDs from the 'Ensembl' column. Missing values are
# dropped first: json.dump would otherwise emit a bare NaN token, which is not
# valid JSON.
ensembl_ids = protein_atlas_df['Ensembl'].dropna().unique().tolist()

# Save the Ensembl IDs to a JSON file
ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'
with open(ensembl_ids_filepath, 'w') as f:
    json.dump(ensembl_ids, f)

# Check the first 5 Ensembl IDs
ensembl_ids[:5], ensembl_ids_filepath

(['ENSG00000000003',
  'ENSG00000000005',
  'ENSG00000000419',
  'ENSG00000000457',
  'ENSG00000000460'],
 'output_files/protein_atlas_ensembl_ids.json')

In [None]:
%pip install scipy statsmodels
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from collections import defaultdict
import json

# Load all genelists using file paths for the gene lists
genelist_filepaths = {f'Genelist{i}': f'genelists/Genelist{i}.txt' for i in range(1, 7)}

# Paths to the input files
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
protein_atlas_ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'

# Load the Protein Atlas Ensembl IDs; they are the background of every test
with open(protein_atlas_ensembl_ids_filepath, 'r') as f:
    protein_atlas_ensembl_ids = set(json.load(f))
total_genes_atlas = len(protein_atlas_ensembl_ids)

# Load the mapping of cell types to Ensembl IDs
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Build each cell type's gene set once, restricted to the background. A gene
# outside the background would be counted in a cell of the contingency table
# without being part of the total, making count_in_neither too small (or
# negative).
cell_type_gene_sets = {
    cell_type: set(ensembl_ids) & protein_atlas_ensembl_ids
    for cell_type, ensembl_ids in cell_types_to_ensembl.items()
}

# Initialize a dictionary to store the results
fisher_test_results = defaultdict(dict)

# Perform Fisher's Exact Test for each genelist
for genelist_name, genelist_filepath in genelist_filepaths.items():
    # Touch the entry so every genelist appears in the output, even when empty
    results_for_genelist = fisher_test_results[genelist_name]

    # Load the genelist
    with open(genelist_filepath, 'r') as f:
        genelist = {line.strip().strip('"') for line in f}
    # Blank lines would otherwise count as one extra (empty-string) gene
    genelist.discard('')
    # Same background restriction as for the cell-type sets
    genelist &= protein_atlas_ensembl_ids

    total_genes_genelist = len(genelist)

    p_values = []

    # Perform the test for each cell type
    for cell_type, ensembl_ids_set in cell_type_gene_sets.items():
        count_in_both = len(genelist & ensembl_ids_set)
        count_in_genelist_not_cell_type = total_genes_genelist - count_in_both
        count_in_cell_type_not_genelist = len(ensembl_ids_set) - count_in_both
        count_in_neither = total_genes_atlas - (
            count_in_both + count_in_genelist_not_cell_type + count_in_cell_type_not_genelist
        )

        # Rows: in cell type / not in cell type; columns: in genelist / not in genelist
        table = [
            [count_in_both, count_in_cell_type_not_genelist],
            [count_in_genelist_not_cell_type, count_in_neither]
        ]

        # One-sided test: is the cell type over-represented in the genelist?
        odds_ratio, p_value = fisher_exact(table, alternative='greater')
        p_values.append(p_value)

        # Store the results without adjusted p-values first; float() keeps the
        # values plain Python numbers for JSON serialisation
        results_for_genelist[cell_type] = {
            'p_value': float(p_value),
            'odds_ratio': float(odds_ratio),
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        }

    # Nothing to correct or sort when there are no cell types
    if not p_values:
        continue

    # Adjust p-values using the Benjamini-Hochberg procedure
    pvals_corrected = multipletests(p_values, alpha=0.05, method='fdr_bh')[1]

    # p_values was filled in the insertion order of results_for_genelist, so
    # the two sequences line up
    for cell_type, adj_p_value in zip(results_for_genelist, pvals_corrected):
        results_for_genelist[cell_type]['adjusted_p_value'] = float(adj_p_value)

    # Sort results by adjusted p_value in ascending order
    fisher_test_results[genelist_name] = dict(
        sorted(results_for_genelist.items(), key=lambda x: x[1].get('adjusted_p_value', 1))
    )

# Save the results to a JSON file
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'w') as f:
    json.dump(fisher_test_results, f)

results_filepath


Collecting scipy
  Downloading scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting statsmodels
  Downloading statsmodels-0.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.4 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.8/37.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading statsmodels-0.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0mm
[?25hDownloading patsy-0.5.6-py2.py3-

'output_files/fisher_test_results.json'

In [None]:
%pip install statsmodels
import json
from statsmodels.stats.multitest import multipletests

# Read back the stored Fisher test results
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'r') as file:
    fisher_test_results = json.load(file)

# Significance level applied to the adjusted p-values
p_value_threshold = 0.05

# genelist name -> significant associations, most significant first
insights = {}

for genelist, cell_types in fisher_test_results.items():
    associations = []
    for cell_type, stats in cell_types.items():
        p_value = stats.get('p_value', 1)
        count_in_both = stats.get('count_in_both', 0)
        count_in_genelist_not_cell_type = stats.get('count_in_genelist_not_cell_type', 0)
        count_in_cell_type_not_genelist = stats.get('count_in_cell_type_not_genelist', 0)
        count_in_neither = stats.get('count_in_neither', 0)

        # Sample odds ratio; reported as infinity when either off-diagonal
        # count is zero
        if count_in_genelist_not_cell_type and count_in_cell_type_not_genelist:
            odds_ratio = (count_in_both * count_in_neither) / (
                count_in_genelist_not_cell_type * count_in_cell_type_not_genelist
            )
        else:
            odds_ratio = float('inf')

        associations.append({
            'cell_type': cell_type,
            'p_value': p_value,
            'odds_ratio': odds_ratio,
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        })

    # Raw p-values in the same order as the associations
    p_values = [association['p_value'] for association in associations]

    # Benjamini-Hochberg correction across this genelist's cell types
    reject, pvals_corrected = multipletests(p_values, alpha=p_value_threshold, method='fdr_bh')[:2]

    # Keep only rejected hypotheses whose adjusted p-value is below 1.0
    significant_associations = []
    for association, adj_p_value, rej in zip(associations, pvals_corrected, reject):
        if not rej or not (adj_p_value < 1.0):
            continue
        association['adjusted_p_value'] = adj_p_value
        significant_associations.append(association)

    # Ascending by adjusted p-value (stable, so ties keep their input order)
    insights[genelist] = sorted(
        significant_associations,
        key=lambda association: association['adjusted_p_value'],
    )

# Persist the insights
output_filepath = 'output_files/fisher_test_insights.json'
with open(output_filepath, 'w') as f:
    json.dump(insights, f)

# Show the top five associations of each genelist for review
{key: top_associations[:5] for key, top_associations in insights.items()}



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


{'Genelist1': [{'cell_type': 'esophagus',
   'p_value': 7.601491563693231e-83,
   'odds_ratio': 38.03038138332256,
   'count_in_both': 81,
   'count_in_genelist_not_cell_type': 119,
   'count_in_cell_type_not_genelist': 351,
   'count_in_neither': 19611,
   'adjusted_p_value': 1.0262013610985863e-80},
  {'cell_type': 'Suprabasal keratinocytes',
   'p_value': 4.524016635921081e-70,
   'odds_ratio': 26.866237987563593,
   'count_in_both': 78,
   'count_in_genelist_not_cell_type': 122,
   'count_in_cell_type_not_genelist': 464,
   'count_in_neither': 19498,
   'adjusted_p_value': 3.0537112292467296e-68},
  {'cell_type': 'vagina',
   'p_value': 1.6908530223420254e-62,
   'odds_ratio': 65.7748344370861,
   'count_in_both': 49,
   'count_in_genelist_not_cell_type': 151,
   'count_in_cell_type_not_genelist': 98,
   'count_in_neither': 19864,
   'adjusted_p_value': 7.608838600539114e-61},
  {'cell_type': 'Basal keratinocytes',
   'p_value': 7.369658263665469e-38,
   'odds_ratio': 21.5081838252