In [1]:
import requests
import os # yes or no
from bs4 import BeautifulSoup

def download_files_from_hpa(url, max_size_gb=1, subfolder="downloads"):
    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)

    # Convert the max size from GB to bytes
    max_size_bytes = max_size_gb * 1e9

    # Make an HTTP GET request to the provided URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure we got a successful response

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Search for all <a> tags with the specified href structure
    links = soup.find_all('a', href=True)

    # Base URL to prepend to relative file paths
    base_url = "https://www.proteinatlas.org"

    for link in links:
        file_url = link['href']
        if file_url.endswith('.zip'):  # Check if the link is to a .zip file
            full_url = base_url + file_url

            # Extract filename from the URL
            filename = file_url.split('/')[-1]

            # Create the full path to save the file
            save_path = os.path.join(subfolder, filename)

            # Check if the file already exists
            if os.path.exists(save_path):
                print(f"{filename} already exists. Skipping download.")
                continue

            # Check file size without downloading the entire file
            file_response = requests.head(full_url)
            file_size = int(file_response.headers.get('Content-Length', 0))

            if file_size <= max_size_bytes:
                # Download the file if it's within the size limit
                print(f"Downloading {filename}...")
                file_response = requests.get(full_url, stream=True)
                with open(save_path, 'wb') as file:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        file.write(chunk)
                print(f"{filename} downloaded!")
            else:
                print(f"Skipping {filename} as it exceeds the size limit.")

# Example usage
download_files_from_hpa("https://www.proteinatlas.org/about/download")


Downloading normal_tissue.tsv.zip...
normal_tissue.tsv.zip downloaded!
Downloading pathology.tsv.zip...
pathology.tsv.zip downloaded!
Downloading subcellular_location.tsv.zip...
subcellular_location.tsv.zip downloaded!
Downloading rna_tissue_consensus.tsv.zip...
rna_tissue_consensus.tsv.zip downloaded!
Downloading rna_tissue_hpa.tsv.zip...
rna_tissue_hpa.tsv.zip downloaded!
Downloading rna_tissue_hpa_description.tsv.zip...
rna_tissue_hpa_description.tsv.zip downloaded!
Downloading rna_brain_hpa.tsv.zip...
rna_brain_hpa.tsv.zip downloaded!
Downloading rna_pfc_brain_hpa.tsv.zip...
rna_pfc_brain_hpa.tsv.zip downloaded!
Downloading rna_tissue_gtex.tsv.zip...
rna_tissue_gtex.tsv.zip downloaded!
Downloading rna_tissue_fantom.tsv.zip...
rna_tissue_fantom.tsv.zip downloaded!
Downloading rna_single_cell_type.tsv.zip...
rna_single_cell_type.tsv.zip downloaded!
Downloading rna_single_cell_type_tissue.tsv.zip...
rna_single_cell_type_tissue.tsv.zip downloaded!
Downloading rna_single_cell_cluster_de

In [2]:
%pip install pandas
import pandas as pd
import zipfile
import os

def unzip_file_if_not_exists(zip_file_path, output_folder_path, target_file_path):
    if not os.path.exists(target_file_path):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(output_folder_path)
            print(f"Unzipped files to {output_folder_path}")
    else:
        print(f"{target_file_path} already exists, skipping unzip.")

def process_and_sort_tsv(input_file_path, output_file_path, cell_type_column):
    # Check if the sorted file already exists
    if os.path.exists(output_file_path):
        print(f"{output_file_path} already exists, skipping sorting.")
        return

    # Create the output directory if it doesn't exist
    output_directory = os.path.dirname(output_file_path)
    os.makedirs(output_directory, exist_ok=True)

    # Read the tsv file into a DataFrame
    df = pd.read_csv(input_file_path, sep='\t')

    # Make column names case-insensitive and strip extra spaces
    normalized_columns = {col.strip().lower(): col for col in df.columns}

    try:
        # Extract the 'Gene' and cell_type_column columns
        actual_cell_type_column = normalized_columns.get(cell_type_column.lower())
        extracted_df = df[['Gene', actual_cell_type_column]]

        # Sort by cell_type_column
        sorted_df = extracted_df.sort_values(by=actual_cell_type_column)
        
        # Save the sorted data to a new .tsv file
        sorted_df.to_csv(output_file_path, sep='\t', index=False)
        print(f"Sorted data saved to {output_file_path}")
    except KeyError as e:
        print(f"Could not find column: {e}")
        print(f"Available columns in the DataFrame: {df.columns.tolist()}")
        print(f"Error: Failed to write sorted data to {output_file_path} ")

# List of TSV files to process
file_names = [
    {'name': 'normal_tissue', 'cell_type_column': 'Cell type'},
    {'name': 'rna_single_cell_type', 'cell_type_column': 'Cell type'},
    {'name': 'rna_single_cell_type_tissue', 'cell_type_column': 'Cell type'},
    {'name': 'rna_immune_cell', 'cell_type_column': 'Immune cell'},
    {'name': 'rna_immune_cell_schmiedel', 'cell_type_column': 'Immune cell'},
    {'name': 'rna_immune_cell_monaco', 'cell_type_column': 'Immune cell'}
]

# Loop to handle each file
for file_info in file_names:
    file_name = file_info['name']
    cell_type_column = file_info['cell_type_column']
    zip_file_path = f"downloads/{file_name}.tsv.zip"
    target_file_path = f"unzipped_folder/{file_name}.tsv"
    output_file_path = f"sorted_data/sorted_{file_name}.tsv"

    # Check if the file exists, if not then unzip
    unzip_file_if_not_exists(zip_file_path, 'unzipped_folder', target_file_path)
    
    # Process and sort the TSV file
    process_and_sort_tsv(target_file_path, output_file_path, cell_type_column)


Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/d9/26/895a49ebddb4211f2d777150f38ef9e538deff6df7e179a3624c663efc98/pandas-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pandas-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy>=1.23.2 (from pandas)
  Obtaining dependency information for numpy>=1.23.2 from https://files.pythonhosted.org/packages/c4/36/161e2f8110f8c49e59f6107bd6da4257d30aff9f06373d0471811f73dcc5/numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packag

In [3]:
import os
import pandas as pd
import json

# Directory containing the sorted .tsv files
input_directory = "sorted_data"

# Directory where the aggregated results will be saved
output_directory = "aggregated_genes"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Initialize a dictionary to hold unique Ensembl Gene IDs for each cell type
ensembl_gene_ids_by_cell_type = {}

# Iterate over all files in the input directory
for filename in os.listdir(input_directory):
    filepath = os.path.join(input_directory, filename)
    
    # Check if the entry is a file and has a .tsv extension
    if os.path.isfile(filepath) and filename.endswith('.tsv'):
        print(f"Reading file: {filepath}")
        
        # Read the .tsv file into a DataFrame
        df = pd.read_csv(filepath, sep='\t')
        
        # Drop rows where 'Gene' is NaN
        df = df[df['Gene'].notna()]
        
        # Assuming that the second column is the "Cell type" column
        cell_type_column = df.columns[1]
        
        # Group by 'Cell type' and aggregate unique 'Gene' values
        grouped_df = df.groupby(cell_type_column)['Gene'].unique()
        
        # Merge the current file's grouping into the overall dictionary
        for cell_type, gene_ids in grouped_df.items():
            if cell_type in ensembl_gene_ids_by_cell_type:
                ensembl_gene_ids_by_cell_type[cell_type] = list(set(ensembl_gene_ids_by_cell_type[cell_type] + list(gene_ids)))
            else:
                ensembl_gene_ids_by_cell_type[cell_type] = list(gene_ids)

# Save the aggregated results to a JSON file
output_path = os.path.join(output_directory, "aggregated_ensembl_gene_ids_by_cell_type.json")
with open(output_path, 'w') as json_file:
    json.dump(ensembl_gene_ids_by_cell_type, json_file)

print(f"Ensembl Gene IDs by cell type from all files written to: {output_path}")


Reading file: sorted_data/sorted_normal_tissue.tsv
Reading file: sorted_data/sorted_rna_single_cell_type.tsv
Reading file: sorted_data/sorted_rna_single_cell_type_tissue.tsv
Reading file: sorted_data/sorted_rna_immune_cell.tsv
Reading file: sorted_data/sorted_rna_immune_cell_schmiedel.tsv
Reading file: sorted_data/sorted_rna_immune_cell_monaco.tsv
Ensembl Gene IDs by cell type from all files written to: aggregated_genes/aggregated_ensembl_gene_ids_by_cell_type.json


In [4]:
import zipfile

# Function to unzip a file
def unzip_file(zip_file_path, output_folder_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_folder_path)
        print(f"Unzipped files to {output_folder_path}")

# Unzip the file
unzip_file('downloads/proteinatlas.tsv.zip', 'unzipped_folder')

# Read the first 5 lines of the unzipped .tsv file
try:
    with open('unzipped_folder/proteinatlas.tsv', 'r') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            print(line.strip().replace('\t', ','))
except FileNotFoundError:
    print("The file 'proteinatlas.tsv' was not found in the 'unzipped_folder'.")


Unzipped files to unzipped_folder
Gene,"Gene synonym",Ensembl,"Gene description",Uniprot,Chromosome,Position,"Protein class","Biological process","Molecular function","Disease involvement",Evidence,"HPA evidence","UniProt evidence","NeXtProt evidence","RNA tissue specificity","RNA tissue distribution","RNA tissue specificity score","RNA tissue specific nTPM","RNA single cell type specificity","RNA single cell type distribution","RNA single cell type specificity score","RNA single cell type specific nTPM","RNA cancer specificity","RNA cancer distribution","RNA cancer specificity score","RNA cancer specific FPKM","RNA brain regional specificity","RNA brain regional distribution","RNA brain regional specificity score","RNA brain regional specific nTPM","RNA blood cell specificity","RNA blood cell distribution","RNA blood cell specificity score","RNA blood cell specific nTPM","RNA blood lineage specificity","RNA blood lineage distribution","RNA blood lineage specificity score","RNA blood l

In [None]:
# Import required libraries
import pandas as pd
import json
from collections import defaultdict

# Define a function to extract cell types from specific columns
def extract_cell_types(df, columns_to_check):
    cell_types_to_ensembl = defaultdict(set)

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        ensembl_id = row['Ensembl']
        for col in columns_to_check:
            cell_type_data = row[col]
            if pd.notna(cell_type_data):
                # Split the cell_type_data by ';' to get each cell type and its nTPM
                for item in cell_type_data.split(';'):
                    # Extract the cell type name (ignoring nTPM value)
                    cell_type = item.split(':')[0].strip()
                    # Add the Ensembl ID to the set associated with this cell type
                    cell_types_to_ensembl[cell_type].add(ensembl_id)

    # Convert sets to lists for JSON serialization
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        cell_types_to_ensembl[cell_type] = list(ensembl_ids)
        
    return cell_types_to_ensembl

# Specify columns to check for cell types
columns_to_check = [
    "RNA tissue specific nTPM",
    "RNA single cell type specific nTPM",
    "RNA blood cell specific nTPM",
    "RNA blood lineage specific nTPM"
]

# Read the first 5 rows of proteinatlas.tsv into a DataFrame for demonstration
# Uncomment the line below when you want to read the entire file
# df = pd.read_csv("/mnt/data/proteinatlas.tsv", sep='\t', nrows=5)

# For demonstration, read the first 5 lines from the uploaded file
df = pd.read_csv("unzipped_folder/proteinatlas.tsv", sep='\t') #, nrows=5)

# Extract cell types and associated Ensembl Gene IDs
cell_types_to_ensembl = extract_cell_types(df, columns_to_check)

# Write the results to a JSON file
json_file_path = 'output_files/cell_types_to_ensembl.json'
with open(json_file_path, 'w') as f:
    json.dump(cell_types_to_ensembl, f)

json_file_path


In [5]:
import json
from collections import defaultdict

# Function to load a list of genes from a file
def load_genelist(file_path):
    with open(file_path, 'r') as f:
        # Remove quotes from each line
        return [line.strip('"') for line in f.read().splitlines()]

# Load the Genelist1
genelist1 = load_genelist('Genelists/Genelist1.txt')

# Function to load the cell types to Ensembl mapping from a JSON file
def load_cell_types_to_ensembl(file_path):
    with open(file_path, 'r') as f:
        mapping = json.load(f)
    print(f"Loaded cell types to Ensembl mapping from {file_path} with {len(mapping)} cell types.")
    return mapping

# Load the cell types to Ensembl mapping
cell_types_to_ensembl_full = load_cell_types_to_ensembl('aggregated_genes/aggregated_ensembl_gene_ids_by_cell_type.json')

# Function to find cell types for a given genelist
def find_cell_types_for_genelist(genelist, cell_types_to_ensembl):
    cell_types_for_genelist = defaultdict(int)
    print(f"Checking {len(genelist)} genes against {len(cell_types_to_ensembl)} cell types.")
    for gene in genelist:
        print(f"Checking gene: {gene}")
        for cell_type, ensembl_ids in cell_types_to_ensembl.items():
            if gene in ensembl_ids:
                print(f"Match found: {gene} in {cell_type}")
                cell_types_for_genelist[cell_type] += 1
    return cell_types_for_genelist

# Example usage
print("Loaded 200 genes from Genelists/Genelist1.txt")
print(f"Loaded cell types to Ensembl mapping from aggregated_genes/aggregated_ensembl_gene_ids_by_cell_type.json with {len(cell_types_to_ensembl_full)} cell types.")
cell_types_for_genelist1 = find_cell_types_for_genelist(genelist1, cell_types_to_ensembl_full)
print(f"Final cell type counts for Genelist1: {cell_types_for_genelist1}")




Loaded cell types to Ensembl mapping from aggregated_genes/aggregated_ensembl_gene_ids_by_cell_type.json with 325 cell types.
Loaded 200 genes from Genelists/Genelist1.txt
Loaded cell types to Ensembl mapping from aggregated_genes/aggregated_ensembl_gene_ids_by_cell_type.json with 325 cell types.
Checking 200 genes against 325 cell types.
Checking gene: ENSG00000143556
Match found: ENSG00000143556 in Leydig cells
Match found: ENSG00000143556 in Purkinje cells
Match found: ENSG00000143556 in adipocytes
Match found: ENSG00000143556 in alveolar cells
Match found: ENSG00000143556 in cardiomyocytes
Match found: ENSG00000143556 in cells in basal layer
Match found: ENSG00000143556 in cells in corneal layer
Match found: ENSG00000143556 in cells in endometrial stroma
Match found: ENSG00000143556 in cells in glomeruli
Match found: ENSG00000143556 in cells in granular layer
Match found: ENSG00000143556 in cells in molecular layer
Match found: ENSG00000143556 in cells in red pulp
Match found: ENSG

In [6]:
# Import necessary libraries
from collections import defaultdict
import json

# Function to load a genelist from a file
def load_genelist(file_path):
    with open(file_path, 'r') as f:
        # Remove quotes and strip whitespace from each line
        return [line.strip().strip('"') for line in f.read().splitlines()]

# Load the cell_types_to_ensembl.json file
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Load all genelists
genelists_files = [
    'Genelists/Genelist1.txt',
    'Genelists/Genelist2.txt',
    'Genelists/Genelist3.txt',
    'Genelists/Genelist4.txt',
    'Genelists/Genelist5.txt',
    'Genelists/Genelist6.txt'
]

# Initialize a dictionary to store cell type frequencies for all genelists
cell_types_for_all_genelists = {}

# Loop through each genelist file
for i, genelist_file in enumerate(genelists_files, 1):
    # Load the current genelist
    genelist = load_genelist(genelist_file)
    
    # Initialize a defaultdict to store the results for the current genelist
    cell_types_for_genelist = defaultdict(int)

    # Identify cell types associated with the genes in the current genelist
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        for ensembl_id in genelist:
            if ensembl_id in ensembl_ids:
                cell_types_for_genelist[cell_type] += 1

    # Store the results for the current genelist
    cell_types_for_all_genelists[f'Genelist{i}'] = cell_types_for_genelist

# Save the aggregated results to a JSON file
cell_types_for_all_genelists_file = 'output_files/cell_types_for_all_genelists.json'
with open(cell_types_for_all_genelists_file, 'w') as f:
    json.dump(cell_types_for_all_genelists, f)

cell_types_for_all_genelists_file

'output_files/cell_types_for_all_genelists.json'

In [8]:
###
### This file is not needed
###

import json
from collections import defaultdict

# Function to load a gene list from a text file
def load_genelist(file_path):
    with open(file_path, 'r') as f:
        return [line.strip().strip('"') for line in f.readlines()]

# Function to load a JSON file into a dictionary
def load_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

# Function to generate insights for a gene list
def generate_insights(genelist, cell_types_to_genes):
    insights = defaultdict(int)
    for gene in genelist:
        for cell_type, associated_genes in cell_types_to_genes.items():
            if gene in associated_genes:
                insights[cell_type] += 1
    return insights

# File paths for the gene lists and cell type to gene associations
genelist_files = [f'Genelists/Genelist{i}.txt' for i in range(1, 7)]
cell_types_to_genes_file = 'aggregated_genes/aggregated_ensembl_gene_ids_by_cell_type.json'

# Load the cell type to gene associations
cell_types_to_genes = load_json(cell_types_to_genes_file)

# Initialize dictionary to hold insights for all gene lists
insights_for_all_genelists = {}

# Generate and print insights for each gene list
for i, genelist_file in enumerate(genelist_files, 1):
    genelist = load_genelist(genelist_file)
    insights = generate_insights(genelist, cell_types_to_genes)
    insights_for_all_genelists[f'Genelist{i}'] = insights
    print(f'Insights for Genelist{i}:')
    for cell_type, count in insights.items():
        print(f'{cell_type}: {count} genes from the list are associated.')
    print()

# Save the insights to a JSON file
with open('output_files/insights_for_all_genelists.json', 'w') as f:
    json.dump(insights_for_all_genelists, f)


Insights for Genelist1:
Leydig cells: 190 genes from the list are associated.
Purkinje cells: 140 genes from the list are associated.
adipocytes: 190 genes from the list are associated.
alveolar cells: 125 genes from the list are associated.
cardiomyocytes: 190 genes from the list are associated.
cells in basal layer: 52 genes from the list are associated.
cells in corneal layer: 52 genes from the list are associated.
cells in endometrial stroma: 146 genes from the list are associated.
cells in glomeruli: 146 genes from the list are associated.
cells in granular layer: 145 genes from the list are associated.
cells in molecular layer: 140 genes from the list are associated.
cells in red pulp: 146 genes from the list are associated.
cells in seminiferous ducts: 104 genes from the list are associated.
cells in spinous layer: 52 genes from the list are associated.
cells in tubules: 127 genes from the list are associated.
cells in white pulp: 146 genes from the list are associated.
cholangi

In [None]:
import pandas as pd

# Load the protein atlas data
protein_atlas_filepath = 'unzipped_folder/proteinatlas.tsv'
protein_atlas_df = pd.read_csv(protein_atlas_filepath, sep='\t')

# Extract the Ensembl IDs from the 'Gene' column
ensembl_ids = protein_atlas_df['Ensembl'].unique().tolist()

# Save the Ensembl IDs to a JSON file
ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'
with open(ensembl_ids_filepath, 'w') as f:
    json.dump(ensembl_ids, f)

# Check the first 5 Ensembl IDs
ensembl_ids[:5], ensembl_ids_filepath

In [3]:
### not working

%pip install scipy
import json
from scipy.stats import fisher_exact

# Load the necessary data
with open('output_files/protein_atlas_ensembl_ids.json', 'r') as f:
    protein_atlas_ensembl_ids = json.load(f)

with open('Genelists/Genelist1.txt', 'r') as f:
    genelist1 = [line.strip().strip('"') for line in f.readlines()]

with open('output_files/cell_types_for_all_genelists.json', 'r') as f:
    cell_types_for_all_genelists = json.load(f)

# Extract the count of genes associated with each cell type for Genelist1
cell_types_for_genelist1 = cell_types_for_all_genelists.get('Genelist1', {})

# Initialize variables
total_genes_atlas = len(protein_atlas_ensembl_ids)
total_genes_genelist1 = len(genelist1)

# Perform Fisher's Exact Test for each cell type in Genelist1
for cell_type, genes in cell_types_for_genelist1.items():
    print(cell_type)
    print(genes)
    count = len(genes)
    total_genes_cell_type_atlas = len([gene for gene in protein_atlas_ensembl_ids if gene in cell_types_for_genelist1[cell_type]])
    table = [
        [count, total_genes_cell_type_atlas - count],
        [total_genes_genelist1 - count, total_genes_atlas - total_genes_cell_type_atlas - (total_genes_genelist1 - count)]
    ]
    odds_ratio, p_value = fisher_exact(table, alternative='greater')
    print(f"{cell_type}: p_value = {p_value}, odds_ratio = {odds_ratio}")


Late spermatids
10


TypeError: object of type 'int' has no len()

In [7]:
from scipy.stats import fisher_exact
from collections import defaultdict
import json

# Paths to the input files
genelist_filepaths = {
    'Genelist1': 'Genelists/Genelist1.txt',
    'Genelist2': 'Genelists/Genelist2.txt',
    'Genelist3': 'Genelists/Genelist3.txt',
    'Genelist4': 'Genelists/Genelist4.txt',
    'Genelist5': 'Genelists/Genelist5.txt',
    'Genelist6': 'Genelists/Genelist6.txt',
}
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
protein_atlas_ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'

# Load the Protein Atlas Ensembl IDs
with open(protein_atlas_ensembl_ids_filepath, 'r') as f:
    protein_atlas_ensembl_ids = set(json.load(f))

# Load the mapping of cell types to Ensembl IDs
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Initialize a dictionary to store the results
fisher_test_results = defaultdict(dict)

# Perform Fisher's Exact Test for each genelist
for genelist_name, genelist_filepath in genelist_filepaths.items():
    # Load the genelist
    with open(genelist_filepath, 'r') as f:
        genelist = set(line.strip().strip('"') for line in f.readlines())

    # Total number of genes in the genelist and in the Protein Atlas
    total_genes_genelist = len(genelist)
    total_genes_atlas = len(protein_atlas_ensembl_ids)

    # Perform the test for each cell type
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        ensembl_ids_set = set(ensembl_ids)

        # Count of genes in both the genelist and the cell type
        count_in_both = len(genelist.intersection(ensembl_ids_set))

        # Count of genes in the genelist but not in the cell type
        count_in_genelist_not_cell_type = len(genelist.difference(ensembl_ids_set))

        # Count of genes in the cell type but not in the genelist
        count_in_cell_type_not_genelist = len(ensembl_ids_set.difference(genelist))

        # Count of genes neither in the genelist nor in the cell type
        count_in_neither = total_genes_atlas - (count_in_both + count_in_genelist_not_cell_type + count_in_cell_type_not_genelist)

        # Construct the contingency table
        table = [
            [count_in_both, count_in_cell_type_not_genelist],
            [count_in_genelist_not_cell_type, count_in_neither]
        ]

        # Perform Fisher's Exact Test
        odds_ratio, p_value = fisher_exact(table, alternative='greater')

        # Store the results
        fisher_test_results[genelist_name][cell_type] = {
            'p_value': p_value,
            'odds_ratio': odds_ratio,
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        }

# Save the results to a JSON file
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'w') as f:
    json.dump(fisher_test_results, f)

results_filepath


'output_files/fisher_test_results.json'