In [2]:
import requests
import os # yes or no
from bs4 import BeautifulSoup

def download_files_from_hpa(url, max_size_gb=1, subfolder="downloaded_hpa_files"):
    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)

    # Convert the max size from GB to bytes
    max_size_bytes = max_size_gb * 1e9

    # Make an HTTP GET request to the provided URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure we got a successful response

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Search for all <a> tags with the specified href structure
    links = soup.find_all('a', href=True)

    # Base URL to prepend to relative file paths
    base_url = "https://www.proteinatlas.org"

    for link in links:
        file_url = link['href']
        if file_url.endswith('.zip'):  # Check if the link is to a .zip file
            full_url = base_url + file_url

            # Extract filename from the URL
            filename = file_url.split('/')[-1]

            # Create the full path to save the file
            save_path = os.path.join(subfolder, filename)
            
            # Check if the file already exists
            if os.path.exists(save_path):
                print(f"{filename} already exists. Skipping download.")
                continue

            if filename == "proteinatlas.tsv.zip":
                # Check file size without downloading the entire file
                file_response = requests.head(full_url)
                file_size = int(file_response.headers.get('Content-Length', 0))

                if file_size <= max_size_bytes:
                    # Download the file if it's within the size limit
                    print(f"Downloading {filename}...")
                    file_response = requests.get(full_url, stream=True)
                    with open(save_path, 'wb') as file:
                        for chunk in file_response.iter_content(chunk_size=8192):
                            file.write(chunk)
                    print(f"{filename} downloaded!")
                else:
                    print(f"Skipping {filename} as it exceeds the size limit.")

# Example usage
download_files_from_hpa("https://www.proteinatlas.org/about/download")


proteinatlas.tsv.zip already exists. Skipping download.


In [3]:
import zipfile

# Function to unzip a file
def unzip_file(zip_file_path, output_folder_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_folder_path)
        print(f"Unzipped files to {output_folder_path}")

# Unzip the file
unzip_file('downloaded_hpa_files/proteinatlas.tsv.zip', 'unzipped_folder')

# Read the first 5 lines of the unzipped .tsv file
try:
    with open('unzipped_folder/proteinatlas.tsv', 'r') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            print(line.strip().replace('\t', ','))
except FileNotFoundError:
    print("The file 'proteinatlas.tsv' was not found in the 'unzipped_folder'.")


Unzipped files to unzipped_folder
Gene,"Gene synonym",Ensembl,"Gene description",Uniprot,Chromosome,Position,"Protein class","Biological process","Molecular function","Disease involvement",Evidence,"HPA evidence","UniProt evidence","NeXtProt evidence","RNA tissue specificity","RNA tissue distribution","RNA tissue specificity score","RNA tissue specific nTPM","RNA single cell type specificity","RNA single cell type distribution","RNA single cell type specificity score","RNA single cell type specific nTPM","RNA cancer specificity","RNA cancer distribution","RNA cancer specificity score","RNA cancer specific FPKM","RNA brain regional specificity","RNA brain regional distribution","RNA brain regional specificity score","RNA brain regional specific nTPM","RNA blood cell specificity","RNA blood cell distribution","RNA blood cell specificity score","RNA blood cell specific nTPM","RNA blood lineage specificity","RNA blood lineage distribution","RNA blood lineage specificity score","RNA blood l

In [4]:
import requests
import os

def download_file_from_github(url, save_path, folder_name="ontology"):
    # Create folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Define the full path to save the file
    full_save_path = os.path.join(folder_name, save_path)

    # Download the file
    response = requests.get(url)
    with open(full_save_path, 'wb') as file:
        file.write(response.content)

    print(f"File downloaded and saved as {full_save_path}")

# URL to the hECA marker gene annotation file
heca_url = "https://github.com/XuegongLab/hECA/raw/main/UHAF/uHAF%20marker%20reference.xlsx"

# Name of the file to save
heca_save_path = "uhaf_marker_reference.xlsx"

# Download the file
download_file_from_github(heca_url, heca_save_path)


File downloaded and saved as ontology/uhaf_marker_reference.xlsx


In [5]:
'''
%pip install pandas openpyxl
from collections import defaultdict
import pandas as pd
import json

def convert_to_json(file_path, output_directory, is_hpa):
    # Create dictionaries for tissues/organs and cell types to markers
    tissue_to_marker = defaultdict(set)
    cell_type_to_marker = defaultdict(set)

    if is_hpa:
        # Read the HPA Excel file
        df = pd.read_excel(file_path)

        # Iterate through the DataFrame
        for index, row in df.iterrows():
            tissue = row['Tissue']  # 'Tissue' column used for HPA
            cell_type = row['Cell type']
            marker = row['Marker']
            
            tissue_to_marker[tissue.strip()].add(marker.strip())
            cell_type_to_marker[cell_type.strip()].add(marker.strip())
    else:
        # Read the hECA Excel file
        xls = pd.ExcelFile(file_path)

        # Iterate over each sheet
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name)

            # Check for 'Marker' or 'marker' column
            marker_column = 'Marker' if 'Marker' in df.columns else 'marker'
            if marker_column not in df.columns:
                raise KeyError(f"Column 'Marker' or 'marker' not found in sheet {sheet_name}")

            # Iterate through the DataFrame
            for index, row in df.iterrows():
                tissue = sheet_name  # Use the sheet name as the tissue/organ for hECA
                cell_type = row['cell_type']
                if pd.isnull(cell_type):
                    continue  # Skip rows where cell type is NaN or None
            cell_type = str(cell_type).strip()  # Convert to string and strip whitespace
        
            markers = set(map(str.strip, str(row[marker_column]).split(',')))  # Convert to string and split

            tissue_to_marker[tissue.strip()].update(markers)
            cell_type_to_marker[cell_type].update(markers)

    # Save to JSON files
    for category, cat_to_marker in [('tissue', tissue_to_marker), ('cell_type', cell_type_to_marker)]:
        json_file_path = os.path.join(output_directory, f"{'hpa' if is_hpa else 'heca'}_{category}_to_marker.json")
        with open(json_file_path, 'w') as json_file:
            json.dump({k: list(v) for k, v in cat_to_marker.items()}, json_file, indent=4)
        print(f"{'hpa' if is_hpa else 'heca'} {category} to marker JSON saved at: {json_file_path}")

# Correct file paths before running the function
hpa_excel_file_path = 'ontology/hpa_marker_reference.xlsx'
heca_excel_file_path = 'ontology/uhaf_marker_reference.xlsx'
output_directory = 'ontology/json_output'

# Make sure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Convert and save the HPA Excel file to JSON
convert_to_json(hpa_excel_file_path, output_directory, is_hpa=True)

# Convert and save the hECA Excel file to JSON
convert_to_json(heca_excel_file_path, output_directory, is_hpa=False)
'''

'\n%pip install pandas openpyxl\nfrom collections import defaultdict\nimport pandas as pd\nimport json\n\ndef convert_to_json(file_path, output_directory, is_hpa):\n    # Create dictionaries for tissues/organs and cell types to markers\n    tissue_to_marker = defaultdict(set)\n    cell_type_to_marker = defaultdict(set)\n\n    if is_hpa:\n        # Read the HPA Excel file\n        df = pd.read_excel(file_path)\n\n        # Iterate through the DataFrame\n        for index, row in df.iterrows():\n            tissue = row[\'Tissue\']  # \'Tissue\' column used for HPA\n            cell_type = row[\'Cell type\']\n            marker = row[\'Marker\']\n            \n            tissue_to_marker[tissue.strip()].add(marker.strip())\n            cell_type_to_marker[cell_type.strip()].add(marker.strip())\n    else:\n        # Read the hECA Excel file\n        xls = pd.ExcelFile(file_path)\n\n        # Iterate over each sheet\n        for sheet_name in xls.sheet_names:\n            df = pd.read_exc

In [6]:
%pip install pandas openpyxl
from collections import defaultdict
import pandas as pd
import json
import os

def convert_to_json(file_path, output_directory, is_hpa):
    tissue_to_marker = defaultdict(set)
    cell_type_to_marker = defaultdict(set)

    if is_hpa:
        df = pd.read_excel(file_path)
        for _, row in df.iterrows():
            tissue = row['Tissue']
            cell_type = row['Cell type']
            marker = row['Marker']
            
            tissue_to_marker[tissue.strip()].add(marker.strip())
            cell_type_to_marker[cell_type.strip()].add(marker.strip())
    else:
        xls = pd.ExcelFile(file_path)
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name)
            marker_column = 'Marker' if 'Marker' in df.columns else 'marker'
            for _, row in df.iterrows():
                cell_type = row['cell_type'] if 'cell_type' in row else None
                if pd.isnull(cell_type):
                    continue
                cell_type = str(cell_type).strip()
        
                markers = set(map(str.strip, str(row[marker_column]).split(',')))
                tissue_to_marker[sheet_name.strip()].update(markers)
                cell_type_to_marker[cell_type].update(markers)

    for category, cat_to_marker in [('tissue', tissue_to_marker), ('cell_type', cell_type_to_marker)]:
        json_file_path = os.path.join(output_directory, f"{'hpa' if is_hpa else 'heca'}_{category}_to_marker.json")
        with open(json_file_path, 'w') as json_file:
            json.dump({k: list(v) for k, v in cat_to_marker.items()}, json_file, indent=4)
        print(f"{'hpa' if is_hpa else 'heca'} {category} to marker JSON saved at: {json_file_path}")

output_directory = 'ontology/json_output'
os.makedirs(output_directory, exist_ok=True)

# Convert and save the HPA Excel file to JSON
hpa_excel_file_path = 'ontology/hpa_marker_reference.xlsx'
convert_to_json(hpa_excel_file_path, output_directory, is_hpa=True)

# Convert and save the hECA Excel file to JSON
heca_excel_file_path = 'ontology/uhaf_marker_reference.xlsx'
convert_to_json(heca_excel_file_path, output_directory, is_hpa=False)


Collecting pandas
  Downloading pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting openpyxl
  Using cached openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting numpy<2,>=1.26.0 (from pandas)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Downloading pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
Using cached numpy-1.26.4-cp312-cp312-many

In [7]:
import re
import json

def create_custom_mapping():
    """
    Define a mapping for cell type synonyms and specific exceptions.
    This function now focuses on special cases not handled by automatic pluralization and normalization.
    """
    return {
        # Explicit mappings for cases with missing words or different word ending
        'bronchial epithelium, club cells': 'club cell/bronchiolar exocrine cell/clara cell',
        'alveolar cells type 1': 'type I alveolar cell/type I pneumocyte',
        'alveolar cells type 2': 'type II alveolar cell/type II pneumocyte',
        'cone photoreceptor cells': 'cone cell',
        'muller glia cells': 'muller cell',
        'rod photoreceptor cells': 'rod cell',
        'cardiomyocytes': 'cardiomyocyte cell',
        'collecting duct cells': 'collecting duct',
        'distal tubular cells': 'distal convoluted tubule',
        'proximal tubular cells': 'proximal convoluted tubule',
        'exocrine glandular cells': 'exocrine cell',
        'basal glandular cells': 'basal cell',
        'suprabasal keratinocytes': 'keratinocyte',
        'spermatogonia ': 'differentiating spermatogonia',
        'spermatogonia ': 'differentiated spermatogonia',
        # Add or adjust mappings as needed for specific cases
    }

def normalize_name(name, custom_mapping):
    """
    Normalize cell type names by applying custom mappings for known discrepancies, 
    including automatic adjustments for case, hyphens, underscores, 
    and handling singular/plural forms relevant to cell types.
    """
    # Convert to lowercase and replace hyphens/underscores with spaces
    normalized = name.strip().lower().replace("-", " ").replace("_", " ")

    # Apply custom mappings first for specific synonyms and exceptions
    normalized = custom_mapping.get(normalized, normalized)

    # Automatically handle plural forms with basic English rules, tailored for cell types
    if normalized.endswith('ies'):
        normalized = re.sub('ies$', 'y', normalized)  # Correct rule for converting plurals ending in 'ies' to 'y'
    elif normalized.endswith('es'):
        # Correct handling for plurals ending in 'es', which might be common for certain biological terms
        normalized = normalized[:-2]  # Removes the 'es', e.g., "paneth cells" to "paneth cell"
    elif normalized.endswith('s') and not normalized.endswith('ss'):
        normalized = normalized[:-1]  # General case for plurals not ending in 'ss', e.g., "cells" to "cell"

    return normalized

def load_and_normalize_data(file_path, custom_mapping):
    """
    Load JSON data from a file, normalize cell type names using the custom mapping,
    and return a dictionary with normalized cell type names as keys.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)
    # Normalize cell type names and return a new dictionary
    return {normalize_name(key, custom_mapping): set(value) for key, value in data.items()}

# Define your custom mapping
custom_mapping = create_custom_mapping()

# Load and normalize the data from JSON files
heca_cell_type_to_marker = load_and_normalize_data('ontology/json_output/heca_cell_type_to_marker.json', custom_mapping)
hpa_cell_type_to_marker = load_and_normalize_data('ontology/json_output/hpa_cell_type_to_marker.json', custom_mapping)

common_cell_types = set(heca_cell_type_to_marker.keys()) & set(hpa_cell_type_to_marker.keys())
print(common_cell_types)
print('\n')

similarity_stats = {}

for cell_type in common_cell_types:
    heca_markers = heca_cell_type_to_marker[cell_type]
    hpa_markers = hpa_cell_type_to_marker[cell_type]
    common_markers = heca_markers & hpa_markers
    
    total_markers_heca = len(heca_markers)
    total_markers_hpa = len(hpa_markers)
    total_common_markers = len(common_markers)
    
    # Only add to similarity_stats if total_common_markers is greater than 0
    if total_common_markers > 0:
        similarity_stats[cell_type] = {
            "common_markers": total_common_markers,
            "percentage_common_markers_heca": round((total_common_markers / total_markers_heca) * 100, 1) if total_markers_heca else 0,
            "percentage_common_markers_hpa": round((total_common_markers / total_markers_hpa) * 100, 1) if total_markers_hpa else 0,
        }

for cell_type, stats in similarity_stats.items():
    print(f"{cell_type}: {stats}")



{'paneth cell', 't cell', 'endothelial cell', 'erythroid cell', 'cardiomyocyte cell', 'ductal cell', 'basal cell', 'dendritic cell', 'exocrine cell', 'fibroblast', 'proximal convoluted tubule', 'rod cell', 'smooth muscle cell', 'b cell', 'collecting duct', 'horizontal cell', 'urothelial cell', 'muller cell', 'distal convoluted tubule', 'goblet cell', 'sertoli cell', 'keratinocyte', 'cone cell', 'club cell/bronchiolar exocrine cell/clara cell'}


t cell: {'common_markers': 1, 'percentage_common_markers_heca': 2.8, 'percentage_common_markers_hpa': 33.3}
endothelial cell: {'common_markers': 4, 'percentage_common_markers_heca': 10.8, 'percentage_common_markers_hpa': 100.0}
cardiomyocyte cell: {'common_markers': 3, 'percentage_common_markers_heca': 42.9, 'percentage_common_markers_hpa': 75.0}
ductal cell: {'common_markers': 1, 'percentage_common_markers_heca': 25.0, 'percentage_common_markers_hpa': 33.3}
fibroblast: {'common_markers': 2, 'percentage_common_markers_heca': 7.4, 'percentage_co

In [8]:
%pip install mygene

import mygene
import json

def convert_symbols_to_ensembl(input_json_path, output_json_path):
    # Load gene symbols from the input JSON file
    with open(input_json_path, 'r') as file:
        data = json.load(file)

    # Initialize MyGeneInfo
    mg = mygene.MyGeneInfo()

    # Prepare a list of all unique gene symbols across all cell types
    gene_symbols = set()
    for cell_type, markers in data.items():
        gene_symbols.update(markers)

    # Query MyGene.info to convert gene symbols to Ensembl IDs
    query_results = mg.querymany(list(gene_symbols), scopes='symbol', fields='ensembl.gene', species='human')

    # Create a mapping of gene symbols to Ensembl Gene IDs
    symbol_to_ensembl = {}
    for result in query_results:
        if 'ensembl' in result:
            if isinstance(result['ensembl'], list):  # Handling multiple Ensembl IDs for a gene symbol
                symbol_to_ensembl[result['query']] = [ensembl['gene'] for ensembl in result['ensembl'] if 'gene' in ensembl]
            else:
                symbol_to_ensembl[result['query']] = result['ensembl']['gene']

    # Convert the original data to use Ensembl Gene IDs
    converted_data = {}
    for cell_type, markers in data.items():
        converted_markers = []
        for marker in markers:
            if marker in symbol_to_ensembl:
                converted_markers.append(symbol_to_ensembl[marker])
        converted_data[cell_type] = converted_markers

    # Save the converted data to a new JSON file
    with open(output_json_path, 'w') as outfile:
        json.dump(converted_data, outfile, indent=4)

    print(f"Conversion completed. Output saved to {output_json_path}")

# Example usage:
convert_symbols_to_ensembl('ontology/json_output/hpa_cell_type_to_marker.json', 'ontology/json_output/hpa_cell_type_to_ensembl.json')
convert_symbols_to_ensembl('ontology/json_output/heca_cell_type_to_marker.json', 'ontology/json_output/heca_cell_type_to_ensembl.json')

Collecting mygene
  Using cached mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biothings-client>=0.2.6 (from mygene)
  Using cached biothings_client-0.3.1-py2.py3-none-any.whl.metadata (9.8 kB)
Using cached biothings_client-0.3.1-py2.py3-none-any.whl (29 kB)
Installing collected packages: biothings-client, mygene
Successfully installed biothings-client-0.3.1 mygene-3.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


1 input query terms found no hit:	['ACPP']


Conversion completed. Output saved to ontology/json_output/hpa_cell_type_to_ensembl.json


8 input query terms found dup hits:	[('IGHM', 2), ('TRBC1', 2), ('IGHG3', 2), ('TRBC2', 2), ('IGHG1', 2), ('IGHA1', 2), ('IGHA2', 2), ('
25 input query terms found no hit:	['CD11a', 'CD85g', 'NXRN1', 'CD41', 'CYP2F2', 'MTG', 'VIP1', 'SERPINBX', 'MT-CO1', 'CTGF', 'CD11c', 


Conversion completed. Output saved to ontology/json_output/heca_cell_type_to_ensembl.json


In [9]:
# Import required libraries
%pip install pandas
import pandas as pd
import json
import os
from collections import defaultdict

class GeneExpressionAtlas:
    def __init__(self, data_path, columns_to_check):
        self.data_path = data_path
        self.columns_to_check = columns_to_check
        self.cell_types_to_ensembl = defaultdict(set)

    def extract_cell_types(self):
        df = pd.read_csv(self.data_path, sep='\t')
        for index, row in df.iterrows():
            ensembl_id = row['Ensembl']
            for col in self.columns_to_check:
                cell_type_data = row[col]
                if pd.notna(cell_type_data):
                    for item in cell_type_data.split(';'):
                        cell_type = item.split(':')[0].strip()
                        self.cell_types_to_ensembl[cell_type].add(ensembl_id)
        return self.cell_types_to_ensembl

    def to_json(self, output_path):
        # Convert sets to lists for JSON serialization
        for cell_type, ensembl_ids in self.cell_types_to_ensembl.items():
            self.cell_types_to_ensembl[cell_type] = list(ensembl_ids)
        with open(output_path, 'w') as f:
            json.dump(self.cell_types_to_ensembl, f)
        return output_path

class HPA(GeneExpressionAtlas):
    def __init__(self, data_path):
        columns_to_check = [
            "RNA tissue specific nTPM",
            "RNA single cell type specific nTPM",
            "RNA blood cell specific nTPM",
            "RNA blood lineage specific nTPM"
        ]
        super().__init__(data_path, columns_to_check)

class hECA(GeneExpressionAtlas):
    pass

# Example usage for HPA
hpa_data_path = "unzipped_folder/proteinatlas.tsv"
hpa = HPA(hpa_data_path)

# Extract cell types and associated Ensembl Gene IDs
hpa_cell_types_to_ensembl = hpa.extract_cell_types()

# Directory where the aggregated results will be saved
output_directory = "output_files"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Write the results to a JSON file
json_file_path = hpa.to_json(f'{output_directory}/cell_types_to_ensembl.json')
print(f"Data written to {json_file_path}")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Data written to output_files/cell_types_to_ensembl.json


In [10]:
%pip install pandas mygene
import json
import mygene
import pandas as pd

def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Load marker genes from HPA and hECA
hpa_markers = load_json('ontology/json_output/heca_cell_type_to_marker.json')
heca_markers = load_json('ontology/json_output/hpa_cell_type_to_marker.json')

# Combine all unique gene symbols from both sets
all_genes = set()
for genes in hpa_markers.values():
    all_genes.update(genes)
for genes in heca_markers.values():
    all_genes.update(genes)

print(all_genes)

# Initialize mygene.MyGeneInfo
mg = mygene.MyGeneInfo()

# Query for Ensembl Gene IDs using gene symbols
gene_info = mg.querymany(list(all_genes), scopes='symbol', fields='ensembl.gene', species='human')


# Process query results to extract Ensembl Gene IDs, accommodating multiple IDs per gene symbols
symbol_to_ensembl = {}
for item in gene_info:
    # Check for missing hits
    if 'notfound' not in item:
        # Initialize the list for this gene symbol if it's the first time we see it
        if item['query'] not in symbol_to_ensembl:
            symbol_to_ensembl[item['query']] = []
        
        # Handle cases where multiple Ensembl IDs are returned
        if 'ensembl' in item:
            if isinstance(item['ensembl'], list):
                # Add all Ensembl IDs to the list
                symbol_to_ensembl[item['query']].extend([gene['gene'] for gene in item['ensembl']])
            else:
                # Only one Ensembl ID, add it to the list
                symbol_to_ensembl[item['query']].append(item['ensembl']['gene'])
    else:
        # Handle genes that were not found by adding an empty list
        symbol_to_ensembl[item['query']] = []


# Load protein atlas data
protein_atlas_df = pd.read_csv('unzipped_folder/proteinatlas.tsv', sep='\t')

# Since some genes might not be found, filter out empty lists to avoid errors
all_ensembl_ids = set(ensembl_id for ids_list in symbol_to_ensembl.values() for ensembl_id in ids_list if ids_list)


# Filter rows where 'Ensembl' column matches any of the Ensembl IDs we found
filtered_protein_atlas_df = protein_atlas_df[protein_atlas_df['Ensembl'].isin(all_ensembl_ids)]

with open('ontology/json_output/gene_symbol_to_ensembl.json', 'w') as f:
    json.dump(symbol_to_ensembl, f, indent=4)

print("Completed mapping gene symbols to Ensembl IDs.")




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
{'THBD', 'KRT5', 'AQP2', 'REG1A', 'CX3CR1', 'MYL7', 'APLNR', 'SCRG1', 'TMEM119', 'ELF3', 'OIT3', 'HLA-DPA1', 'CD99', 'LIPF', 'TM4SF4', 'CD19', 'KLRB1', 'IGLC2', 'FABP1', 'RAC2', 'KRT18', 'GNLY', 'TOP2A', 'THBS4', 'CAV2', 'SCGB1A1', 'FABP6', 'CHAD', 'ITLN1', 'UMOD', 'LDHB', 'TMEM213', 'SEMG1', 'LGALS2', 'CD11a', 'ADH1B', 'IGKC', 'VPREB1', 'CYSTM1', 'RPS27', 'IL32', 'CD85g', 'TTN', 'MUC5AC', 'HPX', 'LYVE1', 'HBA2', 'HLA-DRA', 'CLDN8', 'NCMAP', 'GATA2', 'COL3A1', 'TMEM174', 'TFF3', 'CD79A', 'CHST2', 'ARHGDIB', 'HBB', 'SELE', 'FCER1G', 'SLURP1', 'TUBA1B', 'NUPR1', 'CCL3L3', 'ACP3', 'KIR2DL4', 'IER2', 'ACTC1', 'CD3D', 'CYP17A1', 'IL1B', 'LGR5', 'PROX1', 'GC', 'SFTPA1', 'SLC6A5', 

8 input query terms found dup hits:	[('IGHM', 2), ('TRBC1', 2), ('IGHG3', 2), ('TRBC2', 2), ('IGHG1', 2), ('IGHA1', 2), ('IGHA2', 2), ('
26 input query terms found no hit:	['CD11a', 'CD85g', 'NXRN1', 'CD41', 'CYP2F2', 'MTG', 'VIP1', 'SERPINBX', 'ACPP', 'MT-CO1', 'CTGF', '


Completed mapping gene symbols to Ensembl IDs.


In [6]:
# Import necessary libraries
from collections import defaultdict
import json

# Function to load a genelist from a file
def load_genelist(file_path):
    with open(file_path, 'r') as f:
        # Remove quotes and strip whitespace from each line
        return [line.strip().strip('"') for line in f.read().splitlines()]

# Load the cell_types_to_ensembl.json file
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Load all genelists using file paths for the gene lists
genelists_files = [f'genelists/Genelist{i}.txt' for i in range(1, 7)]

# Initialize a dictionary to store cell type frequencies for all genelists
cell_types_for_all_genelists = {}

# Loop through each genelist file
for i, genelist_file in enumerate(genelists_files, 1):
    # Load the current genelist
    genelist = load_genelist(genelist_file)
    
    # Initialize a defaultdict to store the results for the current genelist
    cell_types_for_genelist = defaultdict(int)

    # Identify cell types associated with the genes in the current genelist
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        for ensembl_id in genelist:
            if ensembl_id in ensembl_ids:
                cell_types_for_genelist[cell_type] += 1

    # Store the results for the current genelist
    cell_types_for_all_genelists[f'Genelist{i}'] = cell_types_for_genelist

# Save the aggregated results to a JSON file
cell_types_for_all_genelists_file = 'output_files/cell_types_for_all_genelists.json'
with open(cell_types_for_all_genelists_file, 'w') as f:
    json.dump(cell_types_for_all_genelists, f)

cell_types_for_all_genelists_file

'output_files/cell_types_for_all_genelists.json'

In [None]:
import pandas as pd

# Load the protein atlas data
protein_atlas_filepath = 'unzipped_folder/proteinatlas.tsv'
protein_atlas_df = pd.read_csv(protein_atlas_filepath, sep='\t')

# Extract the Ensembl IDs from the 'Gene' column
ensembl_ids = protein_atlas_df['Ensembl'].unique().tolist()

# Save the Ensembl IDs to a JSON file
ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'
with open(ensembl_ids_filepath, 'w') as f:
    json.dump(ensembl_ids, f)

# Check the first 5 Ensembl IDs
ensembl_ids[:5], ensembl_ids_filepath

(['ENSG00000000003',
  'ENSG00000000005',
  'ENSG00000000419',
  'ENSG00000000457',
  'ENSG00000000460'],
 'output_files/protein_atlas_ensembl_ids.json')

In [10]:
%pip install scipy statsmodels
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from collections import defaultdict
import json

# Load all genelists using file paths for the gene lists
genelist_filepaths = {f'Genelist{i}': f'genelists/Genelist{i}.txt' for i in range(1, 7)}

# Paths to the input files
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
protein_atlas_ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'

# Load the Protein Atlas Ensembl IDs
with open(protein_atlas_ensembl_ids_filepath, 'r') as f:
    protein_atlas_ensembl_ids = set(json.load(f))

# Load the mapping of cell types to Ensembl IDs
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Initialize a dictionary to store the results
fisher_test_results = defaultdict(dict)

# Perform Fisher's Exact Test for each genelist
for genelist_name, genelist_filepath in genelist_filepaths.items():
    # Load the genelist
    with open(genelist_filepath, 'r') as f:
        genelist = set(line.strip().strip('"') for line in f.readlines())

    # Total number of genes in the genelist and in the Protein Atlas
    total_genes_genelist = len(genelist)
    total_genes_atlas = len(protein_atlas_ensembl_ids)

    p_values = []

    # Perform the test for each cell type
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        ensembl_ids_set = set(ensembl_ids)

        # Count of genes in both the genelist and the cell type
        count_in_both = len(genelist.intersection(ensembl_ids_set))
        count_in_genelist_not_cell_type = len(genelist.difference(ensembl_ids_set))
        count_in_cell_type_not_genelist = len(ensembl_ids_set.difference(genelist))
        count_in_neither = total_genes_atlas - (count_in_both + count_in_genelist_not_cell_type + count_in_cell_type_not_genelist)

        # Construct the contingency table
        table = [
            [count_in_both, count_in_cell_type_not_genelist],
            [count_in_genelist_not_cell_type, count_in_neither]
        ]

        # Perform Fisher's Exact Test
        odds_ratio, p_value = fisher_exact(table, alternative='greater')
        p_values.append(p_value)

        # Store the results without adjusted p-values first
        fisher_test_results[genelist_name][cell_type] = {
            'p_value': p_value,
            'odds_ratio': odds_ratio,
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        }

    # Adjust p-values using the Benjamini-Hochberg procedure
    _, pvals_corrected, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

    # Store the adjusted p-values in the results
    for (cell_type, _), adj_p_value in zip(fisher_test_results[genelist_name].items(), pvals_corrected):
        fisher_test_results[genelist_name][cell_type]['adjusted_p_value'] = adj_p_value
    
    # Sort results by adjusted p_value in ascending order
    fisher_test_results[genelist_name] = dict(sorted(fisher_test_results[genelist_name].items(), key=lambda x: x[1].get('adjusted_p_value', 1)))

# Save the results to a JSON file
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'w') as f:
    json.dump(fisher_test_results, f)

results_filepath


Collecting scipy
  Using cached scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting statsmodels
  Using cached statsmodels-0.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting numpy<1.29.0,>=1.22.4 (from scipy)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas!=2.1.0,>=1.0 (from statsmodels)
  Using cached pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting patsy>=0.5.4 (from statsmodels)
  Using cached patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting pytz>=2020.1 (from pandas!=2.1.0,>=1.0->statsmodels)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas!=2.1.0,>=1.0->statsmodels)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux

'output_files/fisher_test_results.json'

In [3]:
import json

# Load the cell types to Ensembl IDs mapping from the file
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
with open(cell_types_to_ensembl_filepath, 'r') as file:
    cell_types_to_ensembl = json.load(file)


def cell_types_to_genes(cell_type):
    """
    Returns the list of Ensembl Gene IDs associated with a given cell type.

    Parameters:
    - cell_type (str): The cell type for which to retrieve associated gene IDs.

    Returns:
    - list: A list of Ensembl Gene IDs associated with the cell type.
    """
    # Directly return the list of genes from the dictionary
    # If the cell type is not found, return an empty list
    return cell_types_to_ensembl.get(cell_type, [])

# Example cell type
example_cell_type = 'esophagus'
# Retrieve the genes associated with the example cell type
genes_for_example_cell_type = cell_types_to_genes(example_cell_type)

print(f"Genes associated with {example_cell_type}: {genes_for_example_cell_type}")



Genes associated with esophagus: ['ENSG00000168350', 'ENSG00000147144', 'ENSG00000167916', 'ENSG00000141527', 'ENSG00000188643', 'ENSG00000171916', 'ENSG00000104881', 'ENSG00000104371', 'ENSG00000265190', 'ENSG00000237330', 'ENSG00000261272', 'ENSG00000172382', 'ENSG00000100078', 'ENSG00000163623', 'ENSG00000170545', 'ENSG00000170786', 'ENSG00000182782', 'ENSG00000243566', 'ENSG00000163435', 'ENSG00000188522', 'ENSG00000130701', 'ENSG00000185479', 'ENSG00000186806', 'ENSG00000104055', 'ENSG00000154227', 'ENSG00000157379', 'ENSG00000174226', 'ENSG00000133020', 'ENSG00000137857', 'ENSG00000203499', 'ENSG00000160213', 'ENSG00000134531', 'ENSG00000198643', 'ENSG00000169474', 'ENSG00000175984', 'ENSG00000142627', 'ENSG00000153294', 'ENSG00000165272', 'ENSG00000117595', 'ENSG00000258691', 'ENSG00000168143', 'ENSG00000189143', 'ENSG00000162398', 'ENSG00000152766', 'ENSG00000172478', 'ENSG00000160349', 'ENSG00000255398', 'ENSG00000163220', 'ENSG00000177106', 'ENSG00000143369', 'ENSG00000170477

In [2]:
'''
%pip install mygene scipy statsmodels
import mygene
import json
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from collections import defaultdict

# Function to perform mapping of genes to GO terms
def map_genes_to_go_terms(genelist):
    mg = mygene.MyGeneInfo()
    go_terms = mg.querymany(genelist, scopes='ensembl.gene', fields='go', species='human')
    return {result['query']: result.get('go', {}) for result in go_terms}

# Load the genelists and their file paths
genelist_filepaths = {f'Genelist{i}': f'genelists/Genelist{i}.txt' for i in range(1, 7)}

# Load the fisher test results
fisher_test_results_path = 'output_files/fisher_test_results.json'
with open(fisher_test_results_path, 'r') as file:
    fisher_test_results = json.load(file)

# Dictionary to store the GO terms mapping for each genelist
go_terms_mappings = {}

# Perform Fisher's Exact Test and map to GO terms for each genelist
for genelist_name, genelist_filepath in genelist_filepaths.items():
    # Load the genelist
    with open(genelist_filepath, 'r') as f:
        genelist = set(line.strip().strip('"') for line in f.readlines())
    
    # Map genes to GO terms
    go_terms_mapping = map_genes_to_go_terms(genelist)
    go_terms_mappings[genelist_name] = go_terms_mapping
    
    # Add the GO terms to the fisher_test_results
    for cell_type in fisher_test_results[genelist_name].keys():
        # Here we assume that the genes are the keys in the `go_terms_mapping`
        # And that each key has a dict with 'BP', 'MF', and 'CC' as keys for biological processes, molecular functions, and cellular components
        associated_go_terms = {gene: go_terms_mapping.get(gene, {}) for gene in genelist}
        fisher_test_results[genelist_name][cell_type]['go_terms'] = associated_go_terms

# Save the enriched fisher_test_results to a JSON file
enriched_results_filepath = 'output_files/enriched_fisher_test_results.json'
with open(enriched_results_filepath, 'w') as f:
    json.dump(fisher_test_results, f, indent=4)
'''

'\n%pip install mygene scipy statsmodels\nimport mygene\nimport json\nfrom scipy.stats import fisher_exact\nfrom statsmodels.stats.multitest import multipletests\nfrom collections import defaultdict\n\n# Function to perform mapping of genes to GO terms\ndef map_genes_to_go_terms(genelist):\n    mg = mygene.MyGeneInfo()\n    go_terms = mg.querymany(genelist, scopes=\'ensembl.gene\', fields=\'go\', species=\'human\')\n    return {result[\'query\']: result.get(\'go\', {}) for result in go_terms}\n\n# Load the genelists and their file paths\ngenelist_filepaths = {f\'Genelist{i}\': f\'genelists/Genelist{i}.txt\' for i in range(1, 7)}\n\n# Load the fisher test results\nfisher_test_results_path = \'output_files/fisher_test_results.json\'\nwith open(fisher_test_results_path, \'r\') as file:\n    fisher_test_results = json.load(file)\n\n# Dictionary to store the GO terms mapping for each genelist\ngo_terms_mappings = {}\n\n# Perform Fisher\'s Exact Test and map to GO terms for each genelist\nfo

In [1]:
%pip install statsmodels
import json
from statsmodels.stats.multitest import multipletests

# Load Fisher test results
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'r') as file:
    fisher_test_results = json.load(file)

# Extract insights from the Fisher test results
insights = {}
p_value_threshold = 0.05  # Adjusted p-value threshold

for genelist, cell_types in fisher_test_results.items():
    associations = []
    p_values = []
    for cell_type, stats in cell_types.items():
        p_value = stats.get('p_value', 1)
        count_in_both = stats.get('count_in_both', 0)
        count_in_genelist_not_cell_type = stats.get('count_in_genelist_not_cell_type', 0)
        count_in_cell_type_not_genelist = stats.get('count_in_cell_type_not_genelist', 0)
        count_in_neither = stats.get('count_in_neither', 0)

        # Calculate the odds ratio
        odds_ratio = (count_in_both * count_in_neither) / (count_in_genelist_not_cell_type * count_in_cell_type_not_genelist) if count_in_genelist_not_cell_type and count_in_cell_type_not_genelist else float('inf')

        associations.append({
            'cell_type': cell_type,
            'p_value': p_value,
            'odds_ratio': odds_ratio,  # Include the odds ratio
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        })
        p_values.append(p_value)

    # Adjust p-values using the Benjamini-Hochberg procedure
    reject, pvals_corrected, _, _ = multipletests(p_values, alpha=p_value_threshold, method='fdr_bh')

    # Store only associations with an adjusted p-value below the threshold and less than 1.0
    significant_associations = []
    for association, adj_p_value, rej in zip(associations, pvals_corrected, reject):
        if rej and adj_p_value < 1.0:
            association['adjusted_p_value'] = adj_p_value
            significant_associations.append(association)

    # Sort the significant associations by adjusted p_value in ascending order
    significant_associations.sort(key=lambda x: x.get('adjusted_p_value', 1))
    insights[genelist] = significant_associations

# Write the insights to a JSON file
output_filepath = 'output_files/fisher_test_insights.json'
with open(output_filepath, 'w') as f:
    json.dump(insights, f)

# Displaying the first few insights for review
{key: insights[key][:5] for key in insights.keys()}


Collecting statsmodels
  Using cached statsmodels-0.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting numpy<2,>=1.18 (from statsmodels)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy!=1.9.2,>=1.4 (from statsmodels)
  Using cached scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting pandas!=2.1.0,>=1.0 (from statsmodels)
  Using cached pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting patsy>=0.5.4 (from statsmodels)
  Using cached patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting pytz>=2020.1 (from pandas!=2.1.0,>=1.0->statsmodels)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas!=2.1.0,>=1.0->statsmodels)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached statsmodels-0.14.1-cp312-

{'Genelist1': [{'cell_type': 'esophagus',
   'p_value': 7.601491563693231e-83,
   'odds_ratio': 38.03038138332256,
   'count_in_both': 81,
   'count_in_genelist_not_cell_type': 119,
   'count_in_cell_type_not_genelist': 351,
   'count_in_neither': 19611,
   'adjusted_p_value': 1.0262013610985863e-80},
  {'cell_type': 'Suprabasal keratinocytes',
   'p_value': 4.524016635921081e-70,
   'odds_ratio': 26.866237987563593,
   'count_in_both': 78,
   'count_in_genelist_not_cell_type': 122,
   'count_in_cell_type_not_genelist': 464,
   'count_in_neither': 19498,
   'adjusted_p_value': 3.0537112292467296e-68},
  {'cell_type': 'vagina',
   'p_value': 1.6908530223420254e-62,
   'odds_ratio': 65.7748344370861,
   'count_in_both': 49,
   'count_in_genelist_not_cell_type': 151,
   'count_in_cell_type_not_genelist': 98,
   'count_in_neither': 19864,
   'adjusted_p_value': 7.608838600539114e-61},
  {'cell_type': 'Basal keratinocytes',
   'p_value': 7.369658263665469e-38,
   'odds_ratio': 21.5081838252

In [23]:
%pip install mygene pandas
import pandas as pd
import mygene
import json
import gzip

mg = mygene.MyGeneInfo()

# Function to fetch GO terms
def fetch_go_terms_for_genes(genes):
    results = mg.querymany(genes, scopes='ensembl.gene', fields='go', species='human')
    return {result['query']: result.get('go', {}) for result in results if 'go' in result}


# Load the insights and mapping
with open('output_files/fisher_test_insights.json', 'r') as file:
    fisher_test_insights = json.load(file)

with open('output_files/cell_types_to_ensembl.json', 'r') as file:
    cell_types_to_ensembl = json.load(file)

p_value_threshold = 0.05

# Iterate through each gene list
for genelist_name, cell_types in fisher_test_insights.items():
    print(f"Processing {genelist_name}")
    significant_genes = set()
    
    # Identify significant cell types and corresponding genes
    for cell_type_info in cell_types:
        if cell_type_info['adjusted_p_value'] < p_value_threshold:
            cell_type = cell_type_info['cell_type']
            genes = cell_types_to_ensembl.get(cell_type, [])
            significant_genes.update(genes)
    
    # If more than 3 significant genes, limit to top 3 based on your criteria (e.g., adjusted p-value)
    significant_genes = significant_genes[:3] if len(significant_genes) > 3 else significant_genes
    
     # Fetch GO terms for significant genes
    go_terms = fetch_go_terms_for_genes(list(significant_genes))
    
    # Print detailed GO query results for the top 3 significant genes
    for i, (gene, go_info) in enumerate(go_terms.items()):
        if i < 3:  # Limit to top 3
            print(f"{genelist_name} - Gene: {gene}, GO query result: {go_info}")
        else:
            break

    # Optional: Save full results (consider using json.dump without as_dataframe)

    # Optional: Save the GO terms results to a compressed file
    output_path = f'output_files/{genelist_name}_go_terms_detailed.json.gz'
    with gzip.open(output_path, 'wt', encoding='UTF-8') as zipfile:
        json.dump(go_terms, zipfile)




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Processing Genelist1


TypeError: 'set' object is not subscriptable

In [15]:
import gzip
import json

def read_zipped_json(filepath):
    """
    Reads a zipped JSON file and returns its content.

    Parameters:
    - filepath (str): The path to the zipped JSON file.

    Returns:
    - dict or list: The content of the JSON file.
    """
    with gzip.open(filepath, 'rt', encoding='utf-8') as zipfile:
        data = json.load(zipfile)
    return data

def print_first_10_lines(data):
    """
    Prints the first 10 lines of the data.

    Parameters:
    - data (dict or list): The data to print.
    """
    if isinstance(data, dict):
        for i, (key, value) in enumerate(data.items()):
            print(f"{key}: {value}")
            if i >= 9:  # Stop after printing 10 items
                break
    elif isinstance(data, list):
        for i, item in enumerate(data):
            print(item)
            if i >= 9:  # Stop after printing 10 items
                break
    else:
        print("Unsupported data type.")

# Example usage
zipped_json_filepath = 'output_files/Genelist1_go_terms.json.gz'
data = read_zipped_json(zipped_json_filepath)

# Print the first 10 lines of the data
print_first_10_lines(data)


ENSG00000119915: {'BP': [{'evidence': 'IEA', 'gocategory': 'BP', 'id': 'GO:0006636', 'qualifier': 'involved_in', 'term': 'unsaturated fatty acid biosynthetic process'}, {'evidence': 'IBA', 'gocategory': 'BP', 'id': 'GO:0019367', 'qualifier': 'involved_in', 'term': 'fatty acid elongation, saturated fatty acid'}, {'evidence': 'IDA', 'gocategory': 'BP', 'id': 'GO:0019367', 'pubmed': 20937905, 'qualifier': 'involved_in', 'term': 'fatty acid elongation, saturated fatty acid'}, {'evidence': 'IBA', 'gocategory': 'BP', 'id': 'GO:0030148', 'qualifier': 'involved_in', 'term': 'sphingolipid biosynthetic process'}, {'evidence': 'IBA', 'gocategory': 'BP', 'id': 'GO:0034625', 'qualifier': 'involved_in', 'term': 'fatty acid elongation, monounsaturated fatty acid'}, {'evidence': 'IDA', 'gocategory': 'BP', 'id': 'GO:0034625', 'pubmed': 20937905, 'qualifier': 'involved_in', 'term': 'fatty acid elongation, monounsaturated fatty acid'}, {'evidence': 'IBA', 'gocategory': 'BP', 'id': 'GO:0034626', 'qualifie

In [1]:
''''
%pip install goatools
import requests
import gzip
import os
import urllib.request
from collections import defaultdict
import goatools
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go

class GeneExpressionAtlas:
    def __init__(self, data_path, columns_to_check, custom_mapping=None):
        self.data_path = data_path
        self.columns_to_check = columns_to_check
        self.custom_mapping = custom_mapping if custom_mapping else self.create_custom_mapping()
        self.cell_types_to_ensembl = defaultdict(set)
        
        # Specify paths for the ontology files
        ontology_folder = "ontology"
        self.go_obo_path = os.path.join(ontology_folder, "go-basic.obo")
        self.gene2go_path = os.path.join(ontology_folder, "gene2go")
        
        # Download required files before loading them
        self.download_and_prepare_ontology_files()

        # Load the GO DAG and gene2go data
        if os.path.exists(self.go_obo_path):
            self.go_dag = GODag(self.go_obo_path)
        if os.path.exists(self.gene2go_path):
            self.gene2go = read_ncbi_gene2go(self.gene2go_path, taxids=[9606])  # Assuming human genes
    
    def create_custom_mapping(self):
        """
        Define a mapping for cell type synonyms and specific exceptions.
        This function now focuses on special cases not handled by automatic pluralization and normalization.
        """
        return {
            # Explicit mappings for cases with missing words or different word ending
            'bronchial epithelium, club cells': 'club cell/bronchiolar exocrine cell/clara cell',
            'alveolar cells type 1': 'type I alveolar cell/type I pneumocyte',
            'alveolar cells type 2': 'type II alveolar cell/type II pneumocyte',
            'cone photoreceptor cells': 'cone cell',
            'muller glia cells': 'muller cell',
            'rod photoreceptor cells': 'rod cell',
            'cardiomyocytes': 'cardiomyocyte cell',
            'collecting duct cells': 'collecting duct',
            'distal tubular cells': 'distal convoluted tubule',
            'proximal tubular cells': 'proximal convoluted tubule',
            'exocrine glandular cells': 'exocrine cell',
            'basal glandular cells': 'basal cell',
            'suprabasal keratinocytes': 'keratinocyte',
            'spermatogonia ': 'differentiating spermatogonia',
            'spermatogonia ': 'differentiated spermatogonia',
            # Add or adjust mappings as needed for specific cases
        }
    
    def map_symbols_to_ensembl(self, symbols):
        """Map a list of gene symbols to Ensembl IDs."""
        result = self.mg.querymany(symbols, scopes='symbol', fields='ensembl.gene', species='human')
        return {item['query']: item['ensembl']['gene'] for item in result if 'ensembl' in item}

    def fetch_go_terms(self, ensembl_ids):
        """Fetch GO terms for given Ensembl gene IDs using QuickGO API."""
        url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
        headers = {"Accept": "application/json"}
        params = {
            "geneProductId": ",".join(ensembl_ids),
            "limit": 100  # Adjust based on needs
        }
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            go_terms = response.json()['results']
            return go_terms
        else:
            print("Failed to fetch GO terms")
            return []
'''


'\'\n%pip install goatools\nimport requests\nimport gzip\nimport os\nimport urllib.request\nfrom collections import defaultdict\nimport goatools\nfrom goatools.obo_parser import GODag\nfrom goatools.associations import read_ncbi_gene2go\n\nclass GeneExpressionAtlas:\n    def __init__(self, data_path, columns_to_check, custom_mapping=None):\n        self.data_path = data_path\n        self.columns_to_check = columns_to_check\n        self.custom_mapping = custom_mapping if custom_mapping else self.create_custom_mapping()\n        self.cell_types_to_ensembl = defaultdict(set)\n        \n        # Specify paths for the ontology files\n        ontology_folder = "ontology"\n        self.go_obo_path = os.path.join(ontology_folder, "go-basic.obo")\n        self.gene2go_path = os.path.join(ontology_folder, "gene2go")\n        \n        # Download required files before loading them\n        self.download_and_prepare_ontology_files()\n\n        # Load the GO DAG and gene2go data\n        if os.p

In [None]:
'''
    def download_and_prepare_ontology_files(self):
        go_obo_url = "http://purl.obolibrary.org/obo/go/go-basic.obo"
        gene2go_url = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz"
        # Download the files
        self.download_file(go_obo_url, "ontology")
        self.download_file(gene2go_url, "ontology", decompress=True)

    def download_file(self, url, dest_folder, decompress=False):
        if not os.path.exists(dest_folder):
            os.makedirs(dest_folder)  # Create directory if it does not exist
        filename = url.split('/')[-1]
        dest_file_path = os.path.join(dest_folder, filename)

        if url.startswith('ftp://'):
            # Handle FTP download
            urllib.request.urlretrieve(url, dest_file_path)
            print(f"Downloaded {filename} to {dest_file_path}")
        else:
            # Handle HTTP/HTTPS download
            with requests.get(url, stream=True) as response:
                if response.status_code == 200:
                    with open(dest_file_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    print(f"Downloaded {filename} to {dest_file_path}")
                else:
                    print(f"Error downloading {filename}: Status Code {response.status_code}")

        # Decompress if necessary
        if decompress and filename.endswith('.gz'):
            with gzip.open(dest_file_path, 'rb') as f_in:
                with open(dest_file_path[:-3], 'wb') as f_out:  # Remove .gz extension
                    f_out.write(f_in.read())
            os.remove(dest_file_path)  # Remove the compressed file
            print(f"File decompressed to {dest_file_path[:-3]}")

# Example usage
gene_expression_atlas = GeneExpressionAtlas(
    data_path="unzipped_folder/proteinatlas.tsv",
    columns_to_check=["column1", "column2"]
)
'''