In [36]:
import pandas as pd 
import numpy as np 
import scanpy as sc
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import warnings
from datetime import date
import hisepy
import os
import shutil
import tarfile
from tqdm import tqdm
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Hard-coded value for scanpy's n_jobs setting.
# NOTE(review): 60 presumably matches the core count of the original host — confirm before reuse.
sc.settings.n_jobs = 60
# Relative paths used further down resolve against this directory.
print("Current working directory:", os.getcwd())

Current working directory: /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts


In [37]:
def extract_tar(tar_path, extract_path):
    """Extract every member of one tar archive into ``extract_path``.

    Parameters
    ----------
    tar_path : str or path-like
        Archive to read (opened with mode ``'r'``).
    extract_path : str or path-like
        Directory the members are written into.

    Notes
    -----
    When the running Python provides tar extraction filters, the ``'data'``
    filter is applied so that members with absolute paths or ``..``
    components cannot be written outside ``extract_path``. On interpreters
    without filter support the archive is extracted unfiltered, as before.
    A confirmation line is printed once extraction has finished.
    """
    with tarfile.open(tar_path, 'r') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Refuse path-traversal members instead of trusting the archive.
            tar.extractall(path=extract_path, filter='data')
        else:
            tar.extractall(path=extract_path)
    print(f'Extracted {tar_path} to {extract_path}')

def untar_all_files_in_folder_parallel(folder_path, extract_path):
    """Extract every ``*.tar`` entry found directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str or path-like
        Folder whose entries are scanned (non-recursively) for names ending
        in ``.tar``.
    extract_path : str or path-like
        Folder all archives are extracted into; created if missing.

    Notes
    -----
    Archives are handed to ``extract_tar`` on a pool of up to 10 threads.
    The function returns only after every extraction has finished, and the
    first exception raised by a worker is re-raised here.
    """
    # exist_ok avoids the check-then-create race and matches how the other
    # cells in this notebook create their output folders.
    os.makedirs(extract_path, exist_ok=True)

    tar_files = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith('.tar')
    ]

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(extract_tar, tar_path, extract_path) for tar_path in tar_files]
        for future in futures:
            # result() blocks until the job is done and surfaces worker errors.
            future.result()

# Download MetaData

In [3]:
# Listing of the 'cohorts' project store; later cells filter it on the
# 'name' column and read file IDs from the 'id' column.
df = hisepy.list_files_in_project_store('cohorts')

#### BRI

In [4]:
# Keep only the listing rows whose name matches both patterns below
# (the dated scRNA metadata export, in csv form).
pattern1 = "scRNA_meta_data-2024-05-09"
pattern2 = "csv"
matches_both = df['name'].str.contains(pattern1) & df['name'].str.contains(pattern2)
filtered_df = df[matches_both]

In [5]:
# Download the selected files; the move loop further down looks for each
# one under /home/jupyter/cache/<id>.
ids_to_cache = filtered_df['id'].tolist()
hise_res = hisepy.reader.cache_files(ids_to_cache)

downloading fileID: 9c417a2e-ee43-43b8-9f05-7940b7a42920
Files have been successfully downloaded!


In [6]:
# The metadata file is placed directly in the current working directory.
destination_dir = os.getcwd()
os.makedirs(destination_dir, exist_ok=True)

In [7]:
# Relocate the cached downloads: every regular file found in a file ID's
# cache folder is moved into destination_dir. IDs without a cache folder
# are skipped.
for file_id in filtered_df['id'].tolist():
    cache_dir = f'/home/jupyter/cache/{file_id}'
    if not os.path.exists(cache_dir):
        continue
    for entry_name in os.listdir(cache_dir):
        cached_path = os.path.join(cache_dir, entry_name)
        if os.path.isfile(cached_path):
            shutil.move(cached_path, destination_dir)

In [8]:
# Record which store entries were downloaded (written to the working directory).
uuid_csv_name = "scRNA_BRI_meta_data_uuid.csv"
filtered_df.to_csv(uuid_csv_name)

#### SF4

# Download scRNA data

In [9]:
# Fresh listing of the 'cohorts' project store; every scRNA filter cell
# below selects rows from this table.
df = hisepy.list_files_in_project_store('cohorts')

## h5-BRI

In [10]:
# Keep only the listing rows whose name matches both patterns below.
pattern1 = "tar"
pattern2 = "polonium-tin-curium"
matches_both = df['name'].str.contains(pattern1) & df['name'].str.contains(pattern2)
filtered_df = df[matches_both]

In [11]:
# Download the selected tar archives into the local cache.
ids_to_cache = filtered_df['id'].tolist()
hise_res = hisepy.reader.cache_files(ids_to_cache)

downloading fileID: 04666e28-8443-4a51-8670-f409a7b5afe5
downloading fileID: ae2996c3-eab5-4d61-a997-084351727413
downloading fileID: b8f48340-ec96-4ed9-bad1-23fcb1a64e70
downloading fileID: 11d754d9-0323-400b-8c47-8b9193d254d6
downloading fileID: 6c6c9bbb-ac49-42f1-9e4f-f6a00766f331
downloading fileID: 712082ed-2fe2-4121-9f89-7f732b4a58a7
downloading fileID: e1fe73c4-44d1-4092-ba72-72c5efe657d1
downloading fileID: dd3c4973-439f-4987-ac52-12cd86b31021
Files have been successfully downloaded!


In [53]:
# Output folder for the BRI h5ad archives, anchored at the working directory.
destination_dir = f"{os.getcwd()}/scRNA/BRI/h5ad/"
os.makedirs(destination_dir, exist_ok=True)

In [54]:
# Display the resolved output folder as a visual check before moving files.
destination_dir

'/home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/'

In [55]:
# Relocate the cached tar archives: every regular file found in a file ID's
# cache folder is moved into destination_dir. IDs without a cache folder
# are skipped.
for file_id in filtered_df['id'].tolist():
    cache_dir = f'/home/jupyter/cache/{file_id}'
    if not os.path.exists(cache_dir):
        continue
    for entry_name in os.listdir(cache_dir):
        cached_path = os.path.join(cache_dir, entry_name)
        if os.path.isfile(cached_path):
            shutil.move(cached_path, destination_dir)

In [56]:
# Unpack the archives in place: the folder holding the .tar files is also
# the extraction target.
folder_with_tars = directory_to_extract = destination_dir
untar_all_files_in_folder_parallel(folder_with_tars, directory_to_extract)

Extracted /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/diha_BR1_Male_Positive_h5ads_2024-05-05.tar to /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/
Extracted /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/diha_BR1_Female_Positive_h5ads_2024-05-05.tar to /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/
Extracted /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/diha_BR2_Female_Negative_h5ads_2024-05-05.tar to /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/
Extracted /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/h5ad/diha_BR1_Female_Negative_h5ads_2024-05-05.

In [57]:
# Rename each extracted file to "<prefix>.<extension>", where <prefix> is the
# text before the first underscore and <extension> the text after the last dot.
directory = 'scRNA/BRI/h5ad/sample_h5ad/'

skipped_collisions = []
for filename in tqdm(os.listdir(directory)):
    parts = filename.split('_')
    if len(parts) <= 1:
        # No underscore in the name: nothing to strip. This also makes a
        # second run of the cell a no-op for files that were already renamed.
        continue
    prefix = parts[0]
    extension = filename.split('.')[-1]
    newname = f'{prefix}.{extension}'
    old_filepath = os.path.join(directory, filename)
    new_filepath = os.path.join(directory, newname)
    if os.path.exists(new_filepath):
        # os.rename silently replaces an existing file on POSIX, so two
        # inputs sharing a prefix would lose data. Keep both and report.
        skipped_collisions.append(filename)
        continue
    os.rename(old_filepath, new_filepath)

print('Renaming complete.')
if skipped_collisions:
    print(f'Skipped {len(skipped_collisions)} file(s) whose target name already exists: {skipped_collisions}')

100% 868/868 [00:00<00:00, 12005.42it/s]

Renaming complete.





In [58]:
# Record which store entries were downloaded for the BRI h5ad archives.
uuid_csv_name = "scRNA_BRI_h5ad_uuid.csv"
filtered_df.to_csv(uuid_csv_name)

## Pseudobulk-BRI

#### Aggregated_Counts

In [59]:
# Keep only the listing rows whose name matches both patterns below.
pattern1 = "tar"
pattern2 = "Aggregated_Counts_Tar_BRI-QG-2024-05-09"
matches_both = df['name'].str.contains(pattern1) & df['name'].str.contains(pattern2)
filtered_df = df[matches_both]

In [60]:
# Download the selected aggregated-count archive into the local cache.
ids_to_cache = filtered_df['id'].tolist()
hise_res = hisepy.reader.cache_files(ids_to_cache)

downloading fileID: d6be049e-4ed5-4abc-a172-f48aefb7825d
Files have been successfully downloaded!


In [61]:
# Output folder for the BRI aggregated counts, anchored at the working directory.
destination_dir = f"{os.getcwd()}/scRNA/BRI/Aggregated_Count/"
os.makedirs(destination_dir, exist_ok=True)

In [62]:
# Relocate the cached archive: every regular file found in a file ID's
# cache folder is moved into destination_dir. IDs without a cache folder
# are skipped.
for file_id in filtered_df['id'].tolist():
    cache_dir = f'/home/jupyter/cache/{file_id}'
    if not os.path.exists(cache_dir):
        continue
    for entry_name in os.listdir(cache_dir):
        cached_path = os.path.join(cache_dir, entry_name)
        if os.path.isfile(cached_path):
            shutil.move(cached_path, destination_dir)

In [63]:
# Unpack the archive in place: the folder holding the .tar file is also
# the extraction target.
folder_with_tars = directory_to_extract = destination_dir
untar_all_files_in_folder_parallel(folder_with_tars, directory_to_extract)

Extracted /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/Aggregated_Count/Aggregated_Raw.tar to /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/Aggregated_Count/


In [64]:
# Record which store entry was downloaded for the aggregated counts.
uuid_csv_name = "scRNA_BRI_Aggregated_Count_uuid.csv"
filtered_df.to_csv(uuid_csv_name)

#### Mean expression (log-normalized)

In [65]:
# Keep only the listing rows whose name matches both patterns below.
pattern1 = "tar"
pattern2 = "Average_LogNormalized_Expression_Tar_BRI"
matches_both = df['name'].str.contains(pattern1) & df['name'].str.contains(pattern2)
filtered_df = df[matches_both]

In [66]:
# Download the selected average-expression archive into the local cache.
ids_to_cache = filtered_df['id'].tolist()
hise_res = hisepy.reader.cache_files(ids_to_cache)

downloading fileID: 00478a17-3721-4cf3-b6fe-a3473a67e575
Files have been successfully downloaded!


In [67]:
# Output folder for the BRI average log-normalized expression tables.
destination_dir = f"{os.getcwd()}/scRNA/BRI/Average_LogNormalized_Expression/"
os.makedirs(destination_dir, exist_ok=True)

In [68]:
# Relocate the cached archive: every regular file found in a file ID's
# cache folder is moved into destination_dir. IDs without a cache folder
# are skipped.
for file_id in filtered_df['id'].tolist():
    cache_dir = f'/home/jupyter/cache/{file_id}'
    if not os.path.exists(cache_dir):
        continue
    for entry_name in os.listdir(cache_dir):
        cached_path = os.path.join(cache_dir, entry_name)
        if os.path.isfile(cached_path):
            shutil.move(cached_path, destination_dir)

In [69]:
# Unpack the archive in place: the folder holding the .tar file is also
# the extraction target.
folder_with_tars = directory_to_extract = destination_dir
untar_all_files_in_folder_parallel(folder_with_tars, directory_to_extract)

Extracted /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/Average_LogNormalized_Expression/Average_LogNormalized_Expression.tar to /home/jupyter/IH-A-Aging-Analysis-Notebooks_old/Mansi_Notebooks/scRNA/CertPro_analysis/scripts/scRNA/BRI/Average_LogNormalized_Expression/


In [70]:
# Record which store entry was downloaded for the average expression tables.
uuid_csv_name = "scRNA_BRI_Average_LogNormalized_Expression_uuid.csv"
filtered_df.to_csv(uuid_csv_name)

## h5-SF4

In [73]:
# Keep only the listing rows whose name matches both patterns below.
pattern1 = "h5ad"
pattern2 = "SF4_cleaned-QG-2024-05-07"
matches_both = df['name'].str.contains(pattern1) & df['name'].str.contains(pattern2)
filtered_df = df[matches_both]

In [74]:
# Download the selected SF4 h5ad files into the local cache.
ids_to_cache = filtered_df['id'].tolist()
hise_res = hisepy.reader.cache_files(ids_to_cache)

downloading fileID: a2ab3850-d13d-4212-b552-993ff1b86983
downloading fileID: d9ee49ae-8b43-465b-ad97-776b348c682e
downloading fileID: b5b523f6-cb17-4498-9265-ac80d77da037
downloading fileID: ee138c07-09e5-438b-9461-e0401010c263
downloading fileID: 10c6be2b-ebaf-492e-8391-c7541b68364b
downloading fileID: 9ebb6e38-54ab-44ed-bfec-e3d2c5947b41
downloading fileID: f860e0f7-f9e3-4b98-bcf0-8882454d52d4
downloading fileID: 161802cf-30cb-4a7d-ac19-67732b646433
downloading fileID: 16f9d200-ff64-4aea-b306-f01ca9fa4856
downloading fileID: 8859ddc8-c366-4797-bf7a-5a30318d22c8
downloading fileID: b34498a4-bdcd-4c08-b1a0-78de74046a7a
downloading fileID: e0332824-6053-48d8-a8f0-04eb6ade6ffa
downloading fileID: 6e3bd73f-36f0-4ce6-af02-16449c9d59e9
downloading fileID: 522a8548-eba1-467b-9d5f-b3f849dc17e8
downloading fileID: 5274330d-886b-48de-a2d4-9164de074f2f
downloading fileID: 51e76a62-05e5-40dc-8486-d29d326ded2f
downloading fileID: 2d0dcd35-be00-4161-83ca-019573320fc9
downloading fileID: e6f52388-f2

In [75]:
# Output folder for the SF4 h5ad files, anchored at the working directory.
destination_dir = f"{os.getcwd()}/scRNA/SF4/h5ad/"
os.makedirs(destination_dir, exist_ok=True)

In [76]:
# Relocate the cached h5ad files: every regular file found in a file ID's
# cache folder is moved into destination_dir. IDs without a cache folder
# are skipped.
for file_id in filtered_df['id'].tolist():
    cache_dir = f'/home/jupyter/cache/{file_id}'
    if not os.path.exists(cache_dir):
        continue
    for entry_name in os.listdir(cache_dir):
        cached_path = os.path.join(cache_dir, entry_name)
        if os.path.isfile(cached_path):
            shutil.move(cached_path, destination_dir)

In [77]:
# Record which store entries were downloaded for the SF4 h5ad files.
uuid_csv_name = "scRNA_SF4_h5ad_uuid.csv"
filtered_df.to_csv(uuid_csv_name)

# Download Olink

In [78]:
# File IDs of the Olink files to download, listed explicitly.
file_uuid = [
    "c142f67d-403b-457f-ae17-accc5089399f",
    "b494cce8-1314-4f6e-9666-42f0c6e1c702",
    "00753770-4803-4947-a290-e7469c410067",
    "59f5f656-085f-4d0b-a240-68a9eb15e68e",
    "1a528317-bbf3-45b7-82c9-529577cf0b15",
    "55d5d20a-506a-459e-8315-7d66276fb8f9",
]

In [79]:
# Download the Olink files listed in file_uuid into the local cache.
hise_res = hisepy.reader.cache_files(file_uuid)

downloading fileID: c142f67d-403b-457f-ae17-accc5089399f
downloading fileID: b494cce8-1314-4f6e-9666-42f0c6e1c702
downloading fileID: 00753770-4803-4947-a290-e7469c410067
downloading fileID: 59f5f656-085f-4d0b-a240-68a9eb15e68e
downloading fileID: 1a528317-bbf3-45b7-82c9-529577cf0b15
downloading fileID: 55d5d20a-506a-459e-8315-7d66276fb8f9
Files have been successfully downloaded!


In [80]:
# Output folder for the Olink files, anchored at the working directory.
destination_dir = f"{os.getcwd()}/Olink/"
os.makedirs(destination_dir, exist_ok=True)

In [81]:
# Relocate the cached Olink downloads: every regular file found in a file
# ID's cache folder is moved into destination_dir. IDs without a cache
# folder are skipped.
for file_id in file_uuid:
    cache_dir = f'/home/jupyter/cache/{file_id}'
    if not os.path.exists(cache_dir):
        continue
    for entry_name in os.listdir(cache_dir):
        cached_path = os.path.join(cache_dir, entry_name)
        if os.path.isfile(cached_path):
            shutil.move(cached_path, destination_dir)

In [82]:
# Record the downloaded Olink file IDs as a one-column table.
olink_uuid_table = pd.DataFrame(file_uuid)
olink_uuid_table.to_csv("Olink_BRI_uuid.csv")

# Download FlowCyto

In [83]:
# Search criteria passed to hisepy.get_file_descriptors in the next cell.
query_dict = dict(
    fileType=["FlowCytometry-labeled-expr-csv"],
    panel=["PT1"],
    cohortGuid=["BR1", "BR2"],
    visitName=["Flu Year 1 Day 0", "Flu Year 1 Day 7"],
)

In [84]:
# Query the file descriptors matching query_dict; the next cell pulls the
# 'descriptors' entry out of the result.
df = hisepy.get_file_descriptors(query_dict=query_dict)

In [85]:
# Rebind df to the 'descriptors' entry of the query result. Because the name
# is reused, re-running this cell on its own acts on the already-unwrapped
# object rather than on the original result.
df = df['descriptors']

In [86]:
# Download the flow cytometry files named in the 'file.id' column.
ids_to_cache = df['file.id'].tolist()
hise_res = hisepy.reader.cache_files(ids_to_cache)

downloading fileID: 89431dd4-b67f-4bd0-ac7b-9a903a9d552d
downloading fileID: 8a21fe5c-1e11-4698-b12f-5f8f266e08f2
downloading fileID: 295d4a7c-3eae-40fe-8877-58880aa14db1
downloading fileID: b0a4cbdf-937a-4366-b18b-efe3c9dd3c69
downloading fileID: e3ce727a-60ed-468d-abe5-380925fe1cfe
downloading fileID: 4f7307f2-7474-4222-9287-78bb4cdb394d
downloading fileID: d2fec743-03e3-4a04-96f3-aac213ecef76
downloading fileID: 19adbe98-6db8-4560-bb20-63476d85ee75
downloading fileID: 374b776c-5a89-4e30-bcb6-8e173a1ea575
downloading fileID: 569489c9-b35f-4bf6-ab9d-a7d54e1dbce2
downloading fileID: f889ec20-82c9-448a-a6b6-5462d51dd1b1
downloading fileID: f5d95252-75c1-4d8c-b0b1-cbff829b41b0
downloading fileID: f2b86af0-ba6f-4430-b26b-37776f0fbada
downloading fileID: 35296f85-40b5-42a5-b276-8795bc024235
downloading fileID: a2688351-a787-4bdd-9d59-7f224bb818c2
downloading fileID: e84c6693-821f-4581-80e3-1d050ccad1df
downloading fileID: 8b9f737b-981c-46ba-a8fc-97a88dbad757
downloading fileID: fd764d02-d3

In [87]:
# Output folder for the flow cytometry files, anchored at the working directory.
destination_dir = f"{os.getcwd()}/FlowCyto/"
os.makedirs(destination_dir, exist_ok=True)

In [88]:
# Relocate the cached flow cytometry downloads into destination_dir.
# The IDs must come from df['file.id'] — the same list that was passed to
# cache_files for this section. Iterating file_uuid (the Olink IDs) would
# revisit the Olink cache folders and leave the flow cytometry files behind.
for i in df['file.id'].tolist():
    source_dir = f'/home/jupyter/cache/{i}'
    if os.path.exists(source_dir):
        files = os.listdir(source_dir)
        for file in files:
            file_path = os.path.join(source_dir, file)
            # Only regular files are moved; sub-folders stay in the cache.
            if os.path.isfile(file_path):
                shutil.move(file_path, destination_dir)

In [89]:
# Record the descriptors of the downloaded flow cytometry files.
uuid_csv_name = "FlowCyto_BRI_uuid.csv"
df.to_csv(uuid_csv_name)