In [2]:
import requests
import os # yes or no
from bs4 import BeautifulSoup

def download_files_from_hpa(url, max_size_gb=1, subfolder="downloaded_hpa_files"):
    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)

    # Convert the max size from GB to bytes
    max_size_bytes = max_size_gb * 1e9

    # Make an HTTP GET request to the provided URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure we got a successful response

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Search for all <a> tags with the specified href structure
    links = soup.find_all('a', href=True)

    # Base URL to prepend to relative file paths
    base_url = "https://www.proteinatlas.org"

    for link in links:
        file_url = link['href']
        if file_url.endswith('.zip'):  # Check if the link is to a .zip file
            full_url = base_url + file_url

            # Extract filename from the URL
            filename = file_url.split('/')[-1]

            # Create the full path to save the file
            save_path = os.path.join(subfolder, filename)
            
            # Check if the file already exists
            if os.path.exists(save_path):
                print(f"{filename} already exists. Skipping download.")
                continue

            if filename == "proteinatlas.tsv.zip":
                # Check file size without downloading the entire file
                file_response = requests.head(full_url)
                file_size = int(file_response.headers.get('Content-Length', 0))

                if file_size <= max_size_bytes:
                    # Download the file if it's within the size limit
                    print(f"Downloading {filename}...")
                    file_response = requests.get(full_url, stream=True)
                    with open(save_path, 'wb') as file:
                        for chunk in file_response.iter_content(chunk_size=8192):
                            file.write(chunk)
                    print(f"{filename} downloaded!")
                else:
                    print(f"Skipping {filename} as it exceeds the size limit.")

# Example usage
download_files_from_hpa("https://www.proteinatlas.org/about/download")


proteinatlas.tsv.zip already exists. Skipping download.


In [3]:
import zipfile

# Function to unzip a file
def unzip_file(zip_file_path, output_folder_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(output_folder_path)
        print(f"Unzipped files to {output_folder_path}")

# Unzip the file
unzip_file('downloaded_hpa_files/proteinatlas.tsv.zip', 'unzipped_folder')

# Read the first 5 lines of the unzipped .tsv file
try:
    with open('unzipped_folder/proteinatlas.tsv', 'r') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            print(line.strip().replace('\t', ','))
except FileNotFoundError:
    print("The file 'proteinatlas.tsv' was not found in the 'unzipped_folder'.")


Unzipped files to unzipped_folder
Gene,"Gene synonym",Ensembl,"Gene description",Uniprot,Chromosome,Position,"Protein class","Biological process","Molecular function","Disease involvement",Evidence,"HPA evidence","UniProt evidence","NeXtProt evidence","RNA tissue specificity","RNA tissue distribution","RNA tissue specificity score","RNA tissue specific nTPM","RNA single cell type specificity","RNA single cell type distribution","RNA single cell type specificity score","RNA single cell type specific nTPM","RNA cancer specificity","RNA cancer distribution","RNA cancer specificity score","RNA cancer specific FPKM","RNA brain regional specificity","RNA brain regional distribution","RNA brain regional specificity score","RNA brain regional specific nTPM","RNA blood cell specificity","RNA blood cell distribution","RNA blood cell specificity score","RNA blood cell specific nTPM","RNA blood lineage specificity","RNA blood lineage distribution","RNA blood lineage specificity score","RNA blood l

In [5]:
# Import required libraries
%pip install pandas
import pandas as pd
import json
import os
from collections import defaultdict

# Define a function to extract cell types from specific columns
def extract_cell_types(df, columns_to_check):
    cell_types_to_ensembl = defaultdict(set)

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        ensembl_id = row['Ensembl']
        for col in columns_to_check:
            cell_type_data = row[col]
            if pd.notna(cell_type_data):
                # Split the cell_type_data by ';' to get each cell type and its nTPM
                for item in cell_type_data.split(';'):
                    # Extract the cell type name (ignoring nTPM value)
                    cell_type = item.split(':')[0].strip()
                    # Add the Ensembl ID to the set associated with this cell type
                    cell_types_to_ensembl[cell_type].add(ensembl_id)

    # Convert sets to lists for JSON serialization
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        cell_types_to_ensembl[cell_type] = list(ensembl_ids)
        
    return cell_types_to_ensembl

# Specify columns to check for cell types
columns_to_check = [
    "RNA tissue specific nTPM",
    "RNA single cell type specific nTPM",
    "RNA blood cell specific nTPM",
    "RNA blood lineage specific nTPM"
]

# Read the first 5 rows of proteinatlas.tsv into a DataFrame for demonstration
# Uncomment the line below when you want to read the entire file
# df = pd.read_csv("/mnt/data/proteinatlas.tsv", sep='\t', nrows=5)

# For demonstration, read the first 5 lines from the uploaded file
df = pd.read_csv("unzipped_folder/proteinatlas.tsv", sep='\t') #, nrows=5)

# Extract cell types and associated Ensembl Gene IDs
cell_types_to_ensembl = extract_cell_types(df, columns_to_check)

# Directory where the aggregated results will be saved
output_directory = "output_files"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Write the results to a JSON file
json_file_path = 'output_files/cell_types_to_ensembl.json'
with open(json_file_path, 'w') as f:
    json.dump(cell_types_to_ensembl, f)

json_file_path


Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/de/ce/b5d9c7ce1aaf9023b823c81932a50cd5e8f407198a696b0d1c6025a40b03/pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy>=1.23.2 (from pandas)
  Obtaining dependency information for numpy>=1.23.2 from https://files.pythonhosted.org/packages/c4/36/161e2f8110f8c49e59f6107bd6da4257d30aff9f06373d0471811f73dcc5/numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/32/4d/aaf7eff5deb402fd9a24a1449a8119f00d74ae9c2efa79f8ef9994261fc2/pytz-2023.3.post1-py2.py3-none-any.whl.metadata
 

'output_files/cell_types_to_ensembl.json'

In [7]:
# Import necessary libraries
from collections import defaultdict
import json

# Function to load a genelist from a file
def load_genelist(file_path):
    with open(file_path, 'r') as f:
        # Remove quotes and strip whitespace from each line
        return [line.strip().strip('"') for line in f.read().splitlines()]

# Load the cell_types_to_ensembl.json file
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Load all genelists using file paths for the gene lists
genelists_files = [f'genelists/Genelist{i}.txt' for i in range(1, 7)]

# Initialize a dictionary to store cell type frequencies for all genelists
cell_types_for_all_genelists = {}

# Loop through each genelist file
for i, genelist_file in enumerate(genelists_files, 1):
    # Load the current genelist
    genelist = load_genelist(genelist_file)
    
    # Initialize a defaultdict to store the results for the current genelist
    cell_types_for_genelist = defaultdict(int)

    # Identify cell types associated with the genes in the current genelist
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        for ensembl_id in genelist:
            if ensembl_id in ensembl_ids:
                cell_types_for_genelist[cell_type] += 1

    # Store the results for the current genelist
    cell_types_for_all_genelists[f'Genelist{i}'] = cell_types_for_genelist

# Save the aggregated results to a JSON file
cell_types_for_all_genelists_file = 'output_files/cell_types_for_all_genelists.json'
with open(cell_types_for_all_genelists_file, 'w') as f:
    json.dump(cell_types_for_all_genelists, f)

cell_types_for_all_genelists_file

'output_files/cell_types_for_all_genelists.json'

In [8]:
import pandas as pd

# Load the protein atlas data
protein_atlas_filepath = 'unzipped_folder/proteinatlas.tsv'
protein_atlas_df = pd.read_csv(protein_atlas_filepath, sep='\t')

# Extract the Ensembl IDs from the 'Gene' column
ensembl_ids = protein_atlas_df['Ensembl'].unique().tolist()

# Save the Ensembl IDs to a JSON file
ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'
with open(ensembl_ids_filepath, 'w') as f:
    json.dump(ensembl_ids, f)

# Check the first 5 Ensembl IDs
ensembl_ids[:5], ensembl_ids_filepath

(['ENSG00000000003',
  'ENSG00000000005',
  'ENSG00000000419',
  'ENSG00000000457',
  'ENSG00000000460'],
 'output_files/protein_atlas_ensembl_ids.json')

In [13]:
%pip install scipy
from scipy.stats import fisher_exact
from collections import defaultdict
import json

# Load all genelists using file paths for the gene lists
genelist_filepaths = {f'Genelist{i}': f'genelists/Genelist{i}.txt' for i in range(1, 7)}

# Paths to the input files
cell_types_to_ensembl_filepath = 'output_files/cell_types_to_ensembl.json'
protein_atlas_ensembl_ids_filepath = 'output_files/protein_atlas_ensembl_ids.json'

# Load the Protein Atlas Ensembl IDs
with open(protein_atlas_ensembl_ids_filepath, 'r') as f:
    protein_atlas_ensembl_ids = set(json.load(f))

# Load the mapping of cell types to Ensembl IDs
with open(cell_types_to_ensembl_filepath, 'r') as f:
    cell_types_to_ensembl = json.load(f)

# Initialize a dictionary to store the results
fisher_test_results = defaultdict(dict)

# Perform Fisher's Exact Test for each genelist
for genelist_name, genelist_filepath in genelist_filepaths.items():
    # Load the genelist
    with open(genelist_filepath, 'r') as f:
        genelist = set(line.strip().strip('"') for line in f.readlines())

    # Total number of genes in the genelist and in the Protein Atlas
    total_genes_genelist = len(genelist)
    total_genes_atlas = len(protein_atlas_ensembl_ids)

    # Perform the test for each cell type
    for cell_type, ensembl_ids in cell_types_to_ensembl.items():
        ensembl_ids_set = set(ensembl_ids)

        # Count of genes in both the genelist and the cell type
        count_in_both = len(genelist.intersection(ensembl_ids_set))

        # Count of genes in the genelist but not in the cell type
        count_in_genelist_not_cell_type = len(genelist.difference(ensembl_ids_set))

        # Count of genes in the cell type but not in the genelist
        count_in_cell_type_not_genelist = len(ensembl_ids_set.difference(genelist))

        # Count of genes neither in the genelist nor in the cell type
        count_in_neither = total_genes_atlas - (count_in_both + count_in_genelist_not_cell_type + count_in_cell_type_not_genelist)

        # Construct the contingency table
        table = [
            [count_in_both, count_in_cell_type_not_genelist],
            [count_in_genelist_not_cell_type, count_in_neither]
        ]

        # Perform Fisher's Exact Test
        odds_ratio, p_value = fisher_exact(table, alternative='greater')

        # Store the results
        fisher_test_results[genelist_name][cell_type] = {
            'p_value': p_value,
            'odds_ratio': odds_ratio,
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        }

# Save the results to a JSON file
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'w') as f:
    json.dump(fisher_test_results, f)

results_filepath


Note: you may need to restart the kernel to use updated packages.


'output_files/fisher_test_results.json'

In [2]:
import json

# Load Fisher test results
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'r') as file:
    fisher_test_results = json.load(file)

# Extract insights from the Fisher test results
insights = {}

# Define a threshold for significance and high association
p_value_threshold = 1e-5  # Consider p-values smaller than this threshold as significant
odds_ratio_threshold = 5  # Consider odds ratios greater than this threshold as high association

for genelist, cell_types in fisher_test_results.items():
    significant_associations = []
    for cell_type, stats in cell_types.items():
        p_value = stats.get('p_value', 1)
        odds_ratio = stats.get('odds_ratio', 0)
        count_in_both = stats.get('count_in_both', 0)
        
        # Check if the cell type is significantly associated with the genelist
        if p_value < p_value_threshold and odds_ratio > odds_ratio_threshold:
            significant_associations.append({
                'cell_type': cell_type,
                'p_value': p_value,
                'odds_ratio': odds_ratio,
                'count_in_both': count_in_both
            })
    
    # Sort the significant associations by p_value in ascending order
    significant_associations.sort(key=lambda x: x['p_value'])
    insights[genelist] = significant_associations

# Displaying the first few insights for review
display_insights = {key: insights[key][:5] for key in insights.keys()}

# Write the insights to a JSON file
output_filepath = 'output_files/insights.json'
with open(output_filepath, 'w') as file:
    json.dump(insights, file)

display_insights


{'Genelist1': [{'cell_type': 'esophagus',
   'p_value': 7.601491563693231e-83,
   'odds_ratio': 38.03038138332256,
   'count_in_both': 81},
  {'cell_type': 'Suprabasal keratinocytes',
   'p_value': 4.524016635921081e-70,
   'odds_ratio': 26.866237987563593,
   'count_in_both': 78},
  {'cell_type': 'vagina',
   'p_value': 1.6908530223420254e-62,
   'odds_ratio': 65.7748344370861,
   'count_in_both': 49},
  {'cell_type': 'Basal keratinocytes',
   'p_value': 7.369658263665469e-38,
   'odds_ratio': 21.508183825208718,
   'count_in_both': 43},
  {'cell_type': 'skin 1',
   'p_value': 8.6332271896158e-28,
   'odds_ratio': 10.425534896757918,
   'count_in_both': 46}],
 'Genelist2': [{'cell_type': 'Leydig cells',
   'p_value': 2.1788500557207554e-12,
   'odds_ratio': 7.860914148576415,
   'count_in_both': 22},
  {'cell_type': 'Fibroblasts',
   'p_value': 1.0161292677628894e-10,
   'odds_ratio': 6.352030434397947,
   'count_in_both': 22},
  {'cell_type': 'Peritubular cells',
   'p_value': 1.2083

In [3]:
%pip install statsmodels
import json
from statsmodels.stats.multitest import multipletests

# Load Fisher test results
results_filepath = 'output_files/fisher_test_results.json'
with open(results_filepath, 'r') as file:
    fisher_test_results = json.load(file)

# Extract insights from the Fisher test results
insights = {}
p_value_threshold = 0.05  # Adjusted p-value threshold

for genelist, cell_types in fisher_test_results.items():
    associations = []
    p_values = []
    for cell_type, stats in cell_types.items():
        p_value = stats.get('p_value', 1)
        count_in_both = stats.get('count_in_both', 0)
        count_in_genelist_not_cell_type = stats.get('count_in_genelist_not_cell_type', 0)
        count_in_cell_type_not_genelist = stats.get('count_in_cell_type_not_genelist', 0)
        count_in_neither = stats.get('count_in_neither', 0)
        
        associations.append({
            'cell_type': cell_type,
            'p_value': p_value,
            'count_in_both': count_in_both,
            'count_in_genelist_not_cell_type': count_in_genelist_not_cell_type,
            'count_in_cell_type_not_genelist': count_in_cell_type_not_genelist,
            'count_in_neither': count_in_neither
        })
        p_values.append(p_value)

    # Adjust p-values using the Benjamini-Hochberg procedure
    reject, pvals_corrected, _, _ = multipletests(p_values, alpha=p_value_threshold, method='fdr_bh')
    
    # Store associations with adjusted p-value below the threshold
    significant_associations = [association for association, adj_p_value, rej in zip(associations, pvals_corrected, reject) if rej]
    for association, adj_p_value in zip(significant_associations, pvals_corrected):
        association['adjusted_p_value'] = adj_p_value

    # Sort the significant associations by adjusted p_value in ascending order
    significant_associations.sort(key=lambda x: x.get('adjusted_p_value', 1))
    insights[genelist] = significant_associations

# Write the insights to a JSON file
output_filepath = 'output_files/fisher_test_insights.json'
with open(output_filepath, 'w') as f:
    json.dump(insights, f)

# Displaying the first few insights for review
{key: insights[key][:5] for key in insights.keys()}


Note: you may need to restart the kernel to use updated packages.


{'Genelist1': [{'cell_type': 'Suprabasal keratinocytes',
   'p_value': 4.524016635921081e-70,
   'count_in_both': 78,
   'count_in_genelist_not_cell_type': 122,
   'count_in_cell_type_not_genelist': 464,
   'count_in_neither': 19498,
   'adjusted_p_value': 3.6284431765772604e-17},
  {'cell_type': 'Glandular and luminal cells',
   'p_value': 0.007302404241966121,
   'count_in_both': 9,
   'count_in_genelist_not_cell_type': 191,
   'count_in_cell_type_not_genelist': 333,
   'count_in_neither': 19629,
   'adjusted_p_value': 4.188196739858385e-05},
  {'cell_type': 'Basal keratinocytes',
   'p_value': 7.369658263665469e-38,
   'count_in_both': 43,
   'count_in_genelist_not_cell_type': 157,
   'count_in_cell_type_not_genelist': 251,
   'count_in_neither': 19711,
   'adjusted_p_value': 4.322499406691555e-05},
  {'cell_type': 'neutrophil',
   'p_value': 4.1371234193335395e-07,
   'count_in_both': 29,
   'count_in_genelist_not_cell_type': 171,
   'count_in_cell_type_not_genelist': 1005,
   'cou