In [1]:
import requests
import os # yes or no
from bs4 import BeautifulSoup

def download_files_from_hpa(url, max_size_gb=1, subfolder="downloads"):
    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)

    # Convert the max size from GB to bytes
    max_size_bytes = max_size_gb * 1e9

    # Make an HTTP GET request to the provided URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure we got a successful response

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Search for all <a> tags with the specified href structure
    links = soup.find_all('a', href=True)

    # Base URL to prepend to relative file paths
    base_url = "https://www.proteinatlas.org"

    for link in links:
        file_url = link['href']
        if file_url.endswith('.zip'):  # Check if the link is to a .zip file
            full_url = base_url + file_url

            # Extract filename from the URL
            filename = file_url.split('/')[-1]

            # Create the full path to save the file
            save_path = os.path.join(subfolder, filename)

            # Check if the file already exists
            if os.path.exists(save_path):
                print(f"{filename} already exists. Skipping download.")
                continue

            # Check file size without downloading the entire file
            file_response = requests.head(full_url)
            file_size = int(file_response.headers.get('Content-Length', 0))

            if file_size <= max_size_bytes:
                # Download the file if it's within the size limit
                print(f"Downloading {filename}...")
                file_response = requests.get(full_url, stream=True)
                with open(save_path, 'wb') as file:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        file.write(chunk)
                print(f"{filename} downloaded!")
            else:
                print(f"Skipping {filename} as it exceeds the size limit.")

# Example usage
download_files_from_hpa("https://www.proteinatlas.org/about/download")


Downloading normal_tissue.tsv.zip...
normal_tissue.tsv.zip downloaded!
Downloading pathology.tsv.zip...
pathology.tsv.zip downloaded!
Downloading subcellular_location.tsv.zip...
subcellular_location.tsv.zip downloaded!
Downloading rna_tissue_consensus.tsv.zip...
rna_tissue_consensus.tsv.zip downloaded!
Downloading rna_tissue_hpa.tsv.zip...
rna_tissue_hpa.tsv.zip downloaded!
Downloading rna_tissue_hpa_description.tsv.zip...
rna_tissue_hpa_description.tsv.zip downloaded!
Downloading rna_brain_hpa.tsv.zip...
rna_brain_hpa.tsv.zip downloaded!
Downloading rna_pfc_brain_hpa.tsv.zip...
rna_pfc_brain_hpa.tsv.zip downloaded!
Downloading rna_tissue_gtex.tsv.zip...
rna_tissue_gtex.tsv.zip downloaded!
Downloading rna_tissue_fantom.tsv.zip...
rna_tissue_fantom.tsv.zip downloaded!
Downloading rna_single_cell_type.tsv.zip...
rna_single_cell_type.tsv.zip downloaded!
Downloading rna_single_cell_type_tissue.tsv.zip...
rna_single_cell_type_tissue.tsv.zip downloaded!
Downloading rna_single_cell_cluster_de

In [21]:
import pandas as pd
import zipfile
import os

def unzip_file_if_not_exists(zip_file_path, output_folder_path, target_file_path):
    # Check if the ZIP file exists in the 'downloads' folder
    if not os.path.exists(zip_file_path):
        print(f"{zip_file_path} does not exist. Skipping unzip.")
        return

    # Check if the target file already exists
    if not os.path.exists(target_file_path):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(output_folder_path)
            print(f"Unzipped files to {output_folder_path}")
    else:
        print(f"{target_file_path} already exists, skipping unzip.")

def process_and_sort_tsv(input_file_path, output_file_path):
    # Create the output directory if it doesn't exist
    output_directory = os.path.dirname(output_file_path)
    os.makedirs(output_directory, exist_ok=True)

    # Read the tsv file into a DataFrame
    df = pd.read_csv(input_file_path, sep='\t')

    # Extract the 'Gene' and 'Cell type' columns
    extracted_df = df[['Gene', 'Cell type']]

    # Sort by 'Cell type'
    sorted_df = extracted_df.sort_values(by='Cell type')
    
    # Save the sorted data to a new .tsv file
    sorted_df.to_csv(output_file_path, sep='\t', index=False)
    print(f"Sorted data saved to {output_file_path}")

# List of TSV files to process
file_names = ['normal_tissue', 'rna_immune_cell', 'rna_single_cell_type', 'rna_single_cell_type_tissue']

# Loop to handle each file
for file_name in file_names:
    zip_file_path = f"downloads/{file_name}.tsv.zip"
    target_file_path = f"unzipped_folder/{file_name}.tsv"
    output_file_path = f"sorted_data/sorted_{file_name}.tsv"

    # Check if the file exists, if not then unzip
    unzip_file_if_not_exists(zip_file_path, 'unzipped_folder', target_file_path)
    
    # If the target file exists, process and sort the TSV file
    if os.path.exists(target_file_path):
        process_and_sort_tsv(target_file_path, output_file_path)


unzipped_folder/normal_tissue.tsv already exists, skipping unzip.
Sorted data saved to sorted_data/sorted_normal_tissue.tsv
Unzipped files to unzipped_folder


KeyError: "['Cell type'] not in index"

In [30]:
import pandas as pd
import zipfile
import os

def unzip_file_if_not_exists(zip_file_path, output_folder_path, target_file_path):
    if not os.path.exists(target_file_path):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(output_folder_path)
            print(f"Unzipped files to {output_folder_path}")
    else:
        print(f"{target_file_path} already exists, skipping unzip.")

def process_and_sort_tsv(input_file_path, output_file_path, cell_type_column):
    # Check if the sorted file already exists
    if os.path.exists(output_file_path):
        print(f"{output_file_path} already exists, skipping sorting.")
        return

    # Create the output directory if it doesn't exist
    output_directory = os.path.dirname(output_file_path)
    os.makedirs(output_directory, exist_ok=True)

    # Read the tsv file into a DataFrame
    df = pd.read_csv(input_file_path, sep='\t')

    # Make column names case-insensitive and strip extra spaces
    normalized_columns = {col.strip().lower(): col for col in df.columns}

    try:
        # Extract the 'Gene' and cell_type_column columns
        actual_cell_type_column = normalized_columns.get(cell_type_column.lower())
        extracted_df = df[['Gene', actual_cell_type_column]]

        # Sort by cell_type_column
        sorted_df = extracted_df.sort_values(by=actual_cell_type_column)
        
        # Save the sorted data to a new .tsv file
        sorted_df.to_csv(output_file_path, sep='\t', index=False)
        print(f"Sorted data saved to {output_file_path}")
    except KeyError as e:
        print(f"Could not find column: {e}")
        print(f"Available columns in the DataFrame: {df.columns.tolist()}")

# List of TSV files to process
file_names = [
    {'name': 'normal_tissue', 'cell_type_column': 'Cell type'},
    {'name': 'rna_single_cell_type', 'cell_type_column': 'Cell type'},
    {'name': 'rna_single_cell_type_tissue', 'cell_type_column': 'Immune cell'},
    {'name': 'rna_immune_cell', 'cell_type_column': 'Immune cell'},
    {'name': 'rna_immune_cell_schmiedel', 'cell_type_column': 'Immune cell'},
    {'name': 'rna_immune_cell_monaco', 'cell_type_column': 'Immune cell'}
]

# Loop to handle each file
for file_info in file_names:
    file_name = file_info['name']
    cell_type_column = file_info['cell_type_column']
    zip_file_path = f"{file_name}.tsv.zip"
    target_file_path = f"unzipped_folder/{file_name}.tsv"
    output_file_path = f"sorted_data/sorted_{file_name}.tsv"

    # Check if the file exists, if not then unzip
    unzip_file_if_not_exists(zip_file_path, 'unzipped_folder', target_file_path)
    
    # Process and sort the TSV file
    process_and_sort_tsv(target_file_path, output_file_path, cell_type_column)


unzipped_folder/normal_tissue.tsv already exists, skipping unzip.
sorted_data/sorted_normal_tissue.tsv already exists, skipping sorting.
unzipped_folder/rna_single_cell_type.tsv already exists, skipping unzip.
sorted_data/sorted_rna_single_cell_type.tsv already exists, skipping sorting.
unzipped_folder/rna_single_cell_type_tissue.tsv already exists, skipping unzip.
sorted_data/sorted_rna_single_cell_type_tissue.tsv already exists, skipping sorting.
unzipped_folder/rna_immune_cell.tsv already exists, skipping unzip.


Sorted data saved to sorted_data/sorted_rna_immune_cell.tsv
unzipped_folder/rna_immune_cell_schmiedel.tsv already exists, skipping unzip.
Sorted data saved to sorted_data/sorted_rna_immune_cell_schmiedel.tsv
unzipped_folder/rna_immune_cell_monaco.tsv already exists, skipping unzip.
Sorted data saved to sorted_data/sorted_rna_immune_cell_monaco.tsv


In [32]:
import os
import pandas as pd
import json

# Directory containing the sorted .tsv files
directory = "sorted_data"

# Initialize a dictionary to hold unique Ensembl Gene IDs for each cell type
ensembl_gene_ids_by_cell_type = {}

# Iterate over all files in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    
    # Check if the entry is a file and has a .tsv extension
    if os.path.isfile(filepath) and filename.endswith('.tsv'):
        print(f"Reading file: {filepath}")
        
        # Read the .tsv file into a DataFrame
        df = pd.read_csv(filepath, sep='\t')
        
        # Drop rows where 'Gene' is NaN
        df = df[df['Gene'].notna()]
        
        # Assuming that the second column is the "Cell type" column
        cell_type_column = df.columns[1]
        
        # Group by 'Cell type' and aggregate unique 'Gene' values
        grouped_df = df.groupby(cell_type_column)['Gene'].unique()
        
        # Merge the current file's grouping into the overall dictionary
        for cell_type, gene_ids in grouped_df.items():
            if cell_type in ensembl_gene_ids_by_cell_type:
                ensembl_gene_ids_by_cell_type[cell_type] = list(set(ensembl_gene_ids_by_cell_type[cell_type] + list(gene_ids)))
            else:
                ensembl_gene_ids_by_cell_type[cell_type] = list(gene_ids)

# Save the aggregated results to a JSON file
output_path = os.path.join(directory, "aggregated_ensembl_gene_ids_by_cell_type.json")
with open(output_path, 'w') as json_file:
    json.dump(ensembl_gene_ids_by_cell_type, json_file)

print(f"Ensembl Gene IDs by cell type from all files written to: {output_path}")


Reading file: sorted_data/sorted_normal_tissue.tsv
Reading file: sorted_data/sorted_rna_single_cell_type.tsv
Reading file: sorted_data/sorted_rna_single_cell_type_tissue.tsv
Reading file: sorted_data/rna_immune_cell.tsv
Reading file: sorted_data/sorted_rna_immune_cell.tsv
Reading file: sorted_data/sorted_rna_immune_cell_schmiedel.tsv
Reading file: sorted_data/sorted_rna_immune_cell_monaco.tsv
Ensembl Gene IDs by cell type from all files written to: sorted_data/aggregated_ensembl_gene_ids_by_cell_type.json
