In [1]:
import os
import pandas as pd
from pathlib import Path
import requests
import json
import re
import shutil

This workbook contains the flow I used for pulling and parsing through different files. 

The script below pulls the Lung Transplant Studies from the Red Cap database and saves it to the folder in this directory. Comment it out once done

In [None]:
api_url = "https://redcap.nubic.northwestern.edu/redcap/api/"
api_token = "BE24D55CBF39107AD9CA7EB215E9E2E5"
recs = requests.post(api_url, #get all the records names
  data={
    'token':   api_token,
    'content': 'record',
    'format':  'json'
  }
).json()

print(f"Got {len(recs)} records\n")


# print("First 3 records preview:")
# print(json.dumps(recs[:3], indent=2)) 

record_ids = sorted(set([r['record_id'] for r in recs]))
print(record_ids) #onle keep unique record names

meta = requests.post(api_url, data={
  'token': api_token,
  'content': 'metadata',
  'format': 'json'
}).json()


file_fields = [m['field_name']
               for m in meta
               if m['field_type']=='file']
print("File‑upload fields:", file_fields)


folder = 'all_LT_new_trial'
os.makedirs(folder, exist_ok=True)

file_fields_project_specific = ['bal1_sort_report', 'bal1_sort_fcs', 'bal2_sort_report', 'bal2_sort_fcs', 
                                'bal3_sort_report', 'bal3_sort_fcs', 'bal4_sort_report', 'bal4_sort_fcs']  

for rec in record_ids: #for each record id pull the file
    for fld in file_fields_project_specific:
        payload = {
            'token':   api_token,
            'content': 'file',
            'action':  'export',
            'record':  rec,
            'field':   fld,            # one field per request
        }
        r = requests.post(api_url, data=payload)
        if r.status_code == 200 and r.content:
            ctype = r.headers.get('Content-Type', '')
            if 'text/xml' in ctype:
                print(f"No file for {rec}-{fld}, got XML error")
                continue

            # extract the name="..." parameter
            m = re.search(r'name="([^"]+)"', ctype)
            filename = m.group(1)   # e.g. "20210923_LT001_LUL_23092021171938.pdf"
            # write out the file
            outpath = os.path.join(folder, filename)
            with open(outpath, 'wb') as f:
                f.write(r.content)
            print(f"Wrote {outpath}")
        else:
             print(f"No file for {rec}‑{fld} (HTTP {r.status_code})")


Got 14887 records

['LT001', 'LT002', 'LT003', 'LT004', 'LT005', 'LT006', 'LT007', 'LT008', 'LT009', 'LT010', 'LT011', 'LT012', 'LT013', 'LT014', 'LT015', 'LT016', 'LT017', 'LT018', 'LT019', 'LT020', 'LT021', 'LT022', 'LT023', 'LT024', 'LT025', 'LT026', 'LT027', 'LT028', 'LT029', 'LT030', 'LT031', 'LT032', 'LT033', 'LT034', 'LT035', 'LT036', 'LT037', 'LT038', 'LT039', 'LT040', 'LT041', 'LT042', 'LT043', 'LT044', 'LT045', 'LT046', 'LT047', 'LT048', 'LT049', 'LT050', 'LT051', 'LT052', 'LT053', 'LT054', 'LT055', 'LT056', 'LT057', 'LT058', 'LT059', 'LT060', 'LT061', 'LT062', 'LT063', 'LT064', 'LT065', 'LT066', 'LT067', 'LT068', 'LT069', 'LT070', 'LT071', 'LT072', 'LT073', 'LT074', 'LT075', 'LT076', 'LT077', 'LT078', 'LT079', 'LT080', 'LT081', 'LT082', 'LT083', 'LT084', 'LT085', 'LT086', 'LT087', 'LT088', 'LT089', 'LT090', 'LT091', 'LT092', 'LT093', 'LT094', 'LT095', 'LT096', 'LT097', 'LT098', 'LT099', 'LT100', 'LT101', 'LT102', 'LT103', 'LT104', 'LT105', 'LT106', 'LT107', 'LT108', 'LT109',

No file for LT012‑bal4_sort_fcs (HTTP 400)
Wrote all_LT_new_trial/20240104_LT013 LUL_04012024162148.pdf
Wrote all_LT_new_trial/20240104_LT013 LUL_001.fcs
Wrote all_LT_new_trial/20240104_LT013 RML_04012024162857.pdf
Wrote all_LT_new_trial/20240104_LT013 RML_001.fcs
No file for LT013‑bal3_sort_report (HTTP 400)
No file for LT013‑bal3_sort_fcs (HTTP 400)
No file for LT013‑bal4_sort_report (HTTP 400)
No file for LT013‑bal4_sort_fcs (HTTP 400)
Wrote all_LT_new_trial/20210805_LT014_05082021175022.pdf
Wrote all_LT_new_trial/20210805_LT014_001.fcs
No file for LT014‑bal2_sort_report (HTTP 400)
No file for LT014‑bal2_sort_fcs (HTTP 400)
No file for LT014‑bal3_sort_report (HTTP 400)
No file for LT014‑bal3_sort_fcs (HTTP 400)
No file for LT014‑bal4_sort_report (HTTP 400)
No file for LT014‑bal4_sort_fcs (HTTP 400)
Wrote all_LT_new_trial/20210805_LT015_05082021180626.pdf
Wrote all_LT_new_trial/20210805_LT015_001.fcs
No file for LT015‑bal2_sort_report (HTTP 400)
No file for LT015‑bal2_sort_fcs (HTTP 

No file for LT034‑bal3_sort_report (HTTP 400)
No file for LT034‑bal3_sort_fcs (HTTP 400)
No file for LT034‑bal4_sort_report (HTTP 400)
No file for LT034‑bal4_sort_fcs (HTTP 400)
Wrote all_LT_new_trial/20210812_LT035 LUL_12082021171335.pdf
Wrote all_LT_new_trial/20210812_LT035 LUL_001.fcs
No file for LT035‑bal2_sort_report (HTTP 400)
No file for LT035‑bal2_sort_fcs (HTTP 400)
No file for LT035‑bal3_sort_report (HTTP 400)
No file for LT035‑bal3_sort_fcs (HTTP 400)
No file for LT035‑bal4_sort_report (HTTP 400)
No file for LT035‑bal4_sort_fcs (HTTP 400)
Wrote all_LT_new_trial/20230727_LT036 LUL_27072023184634.pdf
Wrote all_LT_new_trial/20230727_LT036 LUL_001.fcs
Wrote all_LT_new_trial/20230727_LT036 RML_27072023192627.pdf
Wrote all_LT_new_trial/20230727_LT036 RML_001.fcs
No file for LT036‑bal3_sort_report (HTTP 400)
No file for LT036‑bal3_sort_fcs (HTTP 400)
No file for LT036‑bal4_sort_report (HTTP 400)
No file for LT036‑bal4_sort_fcs (HTTP 400)
No file for LT037‑bal1_sort_report (HTTP 400

Wrote all_LT_new_trial/20221014_LT056,3f,,3f,RML_14102022173443.pdf
Wrote all_LT_new_trial/20221014_LT056,3f,,3f,RML_001.fcs
No file for LT056‑bal3_sort_report (HTTP 400)
No file for LT056‑bal3_sort_fcs (HTTP 400)
No file for LT056‑bal4_sort_report (HTTP 400)
No file for LT056‑bal4_sort_fcs (HTTP 400)
Wrote all_LT_new_trial/20220407_LT057 LUL_07042022160949.pdf
Wrote all_LT_new_trial/20220407_LT057 LUL_001.fcs
Wrote all_LT_new_trial/20220407_LT057 RML_07042022161913.pdf
Wrote all_LT_new_trial/20220407_LT057 RML_001.fcs
No file for LT057‑bal3_sort_report (HTTP 400)
No file for LT057‑bal3_sort_fcs (HTTP 400)


Some of the files do not come from RedCap but from OneDrive, so they have to be moved to the directory where the following jupyter notebook is located. These folders have to be parsed through using parse_fcs_folder, and recorded in the inventory. The function below assume that the parent folder had subfolders and uses metadata from those subfolders to collect the data into the inventory. 

In [12]:
def parse_fcs_folder(main_folder):
    study_condition_map = {
        'PASC': 'long COVID (Post-acute sequelae of SARS-CoV-2 infection)',
        'Abbvie': 'scleroderma',
        'Duke_ozone': 'control samples, healthy volunteers',
        'WashU_BAL': 'pneumonia',
        'SCRIPT': 'pneumonia'
    } #the folders names should be here
    #add more if needed

    records = []

    for subfolder in os.listdir(main_folder):
        subfolder_path = os.path.join(main_folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        study_name = os.path.basename(main_folder)
        condition = study_condition_map.get(study_name, 'Unknown')

        fcs_files = [f for f in os.listdir(subfolder_path) if f.endswith('.fcs')]
        pdf_files = [f for f in os.listdir(subfolder_path) if f.endswith('.pdf')]

        for fcs in fcs_files:
            fcs_parts = fcs.split('_')
            fcs_key = '_'.join(fcs_parts[:2]) if len(fcs_parts) >= 2 else fcs_parts[0]
            base = fcs_parts[0]
            date_str = base[:8]
            year = date_str[:4]
            month = date_str[4:6]

            matched_pdfs = [
                pdf for pdf in pdf_files if pdf.startswith(fcs_key)
            ]

            records.append({
                'Study': study_name,
                'Disease/Condition': condition,
                'Subfolder': subfolder,
                'FCS File': fcs,
                'Year': year,
                'Month': month,
                'Is there a PDF report?': bool(matched_pdfs),
                '# of PDF reports associated with fcs file': len(matched_pdfs),
                'PDF reports names:': ', '.join(matched_pdfs)
                
            })

    df = pd.DataFrame(records)
    return df

Some of the folders dont have the subfolders, so we need to use a different function for them. 

In [38]:
def parse_fcs_folder_no_subfolders(folder):
    study_condition_map = {
        'PASC': 'long COVID (Post-acute sequelae of SARS-CoV-2 infection)',
        'Abbvie': 'scleroderma',
        'Duke_ozone': 'control samples, healthy volunteers',
        'WashU_BAL': 'pneumonia',
        'SCRIPT': 'pneumonia',
        "LungTransplant":"pulmonary fibrosis"
    }

    records = []


    study_name = os.path.basename(folder)
    condition = study_condition_map.get(study_name, 'Unknown')

    fcs_files = [f for f in os.listdir(folder) if f.endswith('.fcs')]
    pdf_files = [f for f in os.listdir(folder) if f.endswith('.pdf')]

    for fcs in fcs_files:
        fcs_parts = fcs.split('_')
        fcs_key = '_'.join(fcs_parts[:2]) if len(fcs_parts) >= 2 else fcs_parts[0]
        base = fcs_parts[0]
        date_str = base[:8]
        year = date_str[:4]
        month = date_str[4:6]

        matched_pdfs = [
            pdf for pdf in pdf_files if pdf.startswith(fcs_key)
        ]

        records.append({
            'Study': study_name,
            'Disease/Condition': condition,
            'Subfolder': f"{os.path.join(os.getcwd(),folder)}",
            'FCS File': fcs,
            'Year': year,
            'Month': month,
            'Is there a PDF report?': bool(matched_pdfs),
            '# of PDF reports associated with fcs file': len(matched_pdfs),
            'PDF reports names:': ', '.join(matched_pdfs)

        })

    df = pd.DataFrame(records)
    return df

In [3]:
os.getcwd()

'/gpfs/projects/b1042/MisharinLab/anna'

In [9]:
# pasc = parse_fcs_folder('/mnt/c/Users/Anechka/Documents/Northwestern/files/Sasha_one_drive/PASC')

# pasc

# abbvie = parse_fcs_folder('/mnt/c/Users/Anechka/Documents/Northwestern/files/Sasha_one_drive/Abbvie')

# abbvie

# duke = parse_fcs_folder('/mnt/c/Users/Anechka/Documents/Northwestern/files/Sasha_one_drive/Duke_ozone')

# duke

# wash = parse_fcs_folder('/mnt/c/Users/Anechka/Documents/Northwestern/files/Sasha_one_drive/WashU_BAL')

# wash

# concatenated = pd.concat([pasc,abbvie,wash,duke], ignore_index = True)

# filename = "one_drive_inventory.csv"
# path = '/mnt/c/Users/Anechka/Documents/Northwestern/files/Sasha_one_drive'
# csv_ed = concatenated.to_csv(os.path.join(path,filename))

In [40]:
#sample run 
script_path = "SCRIPT" #indicate the name of the folder
script = parse_fcs_folder(script_path) #parse throught the folder
script.head()


Unnamed: 0,Study,Disease/Condition,Subfolder,FCS File,Year,Month,Is there a PDF report?,# of PDF reports associated with fcs file,PDF reports names:
0,SCRIPT,pneumonia,2022_08,20220818_1647-BAL-00_001.fcs,2022,8,True,1,20220818_1647-BAL-00_18082022141905.pdf
1,SCRIPT,pneumonia,2022_08,20220802_1644-BAL-00_001.fcs,2022,8,True,1,20220802_1644-BAL-00_02082022173627.pdf
2,SCRIPT,pneumonia,2022_08,20220825_1650-BAL-07_001.fcs,2022,8,True,1,20220825_1650-BAL-07_31082022172231.pdf
3,SCRIPT,pneumonia,2022_08,20220825_1628-BAL-90_001.fcs,2022,8,True,1,20220825_1628-BAL-90_31082022174051.pdf
4,SCRIPT,pneumonia,2022_08,20220818_1648-BAL-00_001.fcs,2022,8,True,1,20220818_1648-BAL-00_18082022143036.pdf


Optional, but recommended: save to csv

In [41]:
filename = "csved_script.csv" 
path = "."
csv_ed = script.to_csv(os.path.join(path,filename))

Some folders do not have the formating suitable to run function parse_fcs_files. Especially those that came from manually downloading from RedCap (SCRIPT or Abbvie). So they require renaming of the files. For renaming the files, we need the csv report that can map the names of the files to its nonconforming names. 

In [2]:
path_abbvie_report = "AbbSSc (all available samples)/MolecularBiomarkersT-FSMFiles_DATA_2025-05-02_1210.csv" #import the RedCap csv report
root = Path('AbbSSc (all available samples)')
abbvie_report = pd.read_csv(path_abbvie_report)
abbvie_report

In [25]:
"""Function to use for renaming Abbvie Files downloaded as Zip from the RedCap report"""

from pathlib import Path
import pandas as pd

def rename_abbvie_files(abbvie_report: pd.DataFrame,
                        root: Path,
                        dry_run: bool = True) -> None:
    """
    Rename AbbVie BAL or whole‑blood files inside *root* based on filenames
    stored in `abbvie_report`.

    Parameters
    ----------
    abbvie_report : pd.DataFrame
        Must contain columns like 'bal1_sort_report', 'bal2_sort_fcs',
        and 'whole_blood_fcs_1' plus the usual metadata columns:
        'study_code' and 'redcap_event_name'.
    root : pathlib.Path
        Folder where the files live.
    dry_run : bool, default True
        If True, only prints what *would* happen.  Set False to rename for real.
    """
    for _, row in abbvie_report.iterrows():
        study = row.study_code
        event = row.redcap_event_name

        for bal in (1, 2):
            for kind, ext in (('sort_report', 'pdf'),
                              ('sort_fcs',    'fcs')):
                # grab both “new” names
                main_name  = str(row.get(f'bal{bal}_{kind}', '')).strip()
                blood_name = str(row.get('whole_blood_fcs_1', '')).strip()

                # (search‑pattern, target‑name) pairs
                candidates = [
                    (f"{study}_{event}_bal{bal}_{kind}.{ext}", main_name),
                    (f"{study}_{event}_whole_blood_fcs_1.{ext}", blood_name)
                ]

                for stub, new_name in candidates:
                    if not new_name or new_name.lower() == 'nan':
                        continue  # nothing to rename to

                    for old_path in root.glob(stub):
                        new_path = root / new_name
                        print(f"{old_path.name}  →  {new_path.name}")
                        if not dry_run:
                            old_path.rename(new_path)


In [None]:
# #to run uncomment this:

root = Path("AbbSSc (all available samples)") #specify your path
rename_abbvie_files(abbvie_report, root)          # check if the renaming is correct - dry run
rename_abbvie_files(abbvie_report, root, False)   # call this for complete renaming


In [26]:
os.rename('AbbSSc (all available samples)', "Abbvie") #rename the folder before running the parse function

In [43]:
abbvie_2 = parse_fcs_folder_no_subfolders("Abbvie")
abbvie_2

filename = "csved_abbvie.csv"
path = "."
csv_ed = abbvie_2.to_csv(os.path.join(path,filename))

In [8]:
# LT_report = pd.read_csv("LungTransplant/LungTransplantBiorep-FCSFiles_DATA_2025-05-08_1250.csv")

# LT_report.columns

# root = Path("LungTransplant")
# def rename_LT_files(LT_report: pd.DataFrame,
#                         root: Path,
#                         dry_run: bool = True) -> None:
#     """
#     Rename lung transplant files inside *root* based on filenames
#     stored in `abbvie_report`.

#     Parameters
#     ----------
#     abbvie_report : pd.DataFrame
#         Must contain columns like 'bal1_sort_report', 'bal2_sort_fcs',
#         and 'whole_blood_fcs_1' plus the usual metadata columns:
#         'study_code' and 'redcap_event_name'.
#     root : pathlib.Path
#         Folder where the files live.
#     dry_run : bool, default True
#         If True, only prints what *would* happen.  Set False to rename for real.
#     """
#     for _, row in LT_report.iterrows():
#         for bal in (1, 2):
#             for kind, ext in (('sort_report', 'pdf'),
#                               ('sort_fcs',    'fcs')):
#                 # grab both “new” names
#                 new_name  = str(row.get(f'bal{bal}_{kind}', '')).strip()
#                 old_stab = f"{row.record_id}_{row.redcap_repeat_instrument}_{row.redcap_repeat_instance}_bal{bal}_{kind}.{ext}"
#                 if new_name == 'nan':
#                     continue
#                 for old_path in root.glob(old_stab):
#                     new_path = root / new_name
#                     print(f"{old_path.name}  →  {new_path.name}")
#                     if not dry_run:
#                         old_path.rename(new_path)

# root = Path("LungTransplant")
# rename_LT_files(LT_report,root,dry_run=False)

# LT = parse_fcs_folder_no_subfolders("LungTransplant")

# LT.head()
# filename = "csved_LT.csv"
# path = "."
# csv_ed = LT.to_csv(os.path.join(path,filename))

The script below is an example how to pull the csv report from the RedCap -> Optional 

In [1]:
import requests

data = {
    'token': 'BE24D55CBF39107AD9CA7EB215E9E2E5',
    'content': 'report',
    'format': 'csv',             
    'report_id': '55649',
    'csvDelimiter': ',',          
    'rawOrLabel': 'raw',
    'rawOrLabelHeaders': 'raw',
    'exportCheckboxLabel': 'false'
}

r = requests.post('https://redcap.nubic.northwestern.edu/redcap/api/', data=data)
print('HTTP Status:', r.status_code)

if r.ok:
    # write the CSV out to disk
    with open('report_LT_redcap.csv', 'w', newline='') as f:
        f.write(r.text)
    print('Saved report to report_LT_redcap.csv')
else:
    print('Error:', r.text)

HTTP Status: 200
Saved report to report_LT_redcap.csv


Now, since we have assembled the entire inventory, we may now select at least 20 (choose your number) at random from each study. 

In [31]:
inventory = pd.read_csv("ML flow analysis - Copy of Inventory 2025.csv")
inventory.head()

Unnamed: 0,Study,Disease/Condition,Subfolder,FCS File,Year,Month,Is there a PDF report?,# of PDF reports associated with fcs file,PDF reports names:,Unnamed: 9,Unnamed: 10
0,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20210413 PASC0019,20210413_LC001_BAL_01_001.fcs,2021,4,True,1,20210413_LC001_BAL_01_13042021165347.pdf,,
1,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20210416 PASC0020,20210416_LC002_001.fcs,2021,4,True,1,20210416_LC002_16042021164810.pdf,,
2,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20210416 PASC0022,20210416_LC003_001.fcs,2021,4,True,1,20210416_LC003_16042021165838.pdf,,
3,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20210427 PASC0021,20210427_LC004_001.fcs,2021,4,True,1,20210427_LC004_27042021140712.pdf,,
4,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20210511 PASC0017,20210511_LC005_001.fcs,2021,5,True,2,"20210511_LC005_11052021145607.pdf, 20210511_LC...",,


Now get rid of the rows where fcs files dont have the pdfs. 

In [10]:
inventory_clean = inventory[inventory["Is there a PDF report?"] == True]

groups_count = inventory_clean.groupby("Study").size()

# print(groups_count)
# print(groups_count["WashU_BAL"])

Study
Abbvie              27
Duke_ozone          42
Lung Transplant    644
PASC                30
SCRIPT             988
WashU_BAL           13
dtype: int64
13


In [12]:
studies = inventory_clean['Study'].unique()
sampled_dfs = []

for study in studies:
    if groups_count[study] >= 20:
        study_sample = inventory_clean[inventory_clean['Study'] == study].sample(n=20, random_state=42) # Sample 20 rows for each study
        sampled_dfs.append(study_sample)
        
    else:
        sampled_dfs.append(inventory_clean[inventory_clean['Study'] == study])
        
sampled_df = pd.concat(sampled_dfs, ignore_index=True)# Concatenate all samples

sampled_df.columns

Index(['Study', 'Disease/Condition', 'Subfolder', 'FCS File', 'Year', 'Month',
       'Is there a PDF report?', '# of PDF reports associated with fcs file',
       'PDF reports names:', 'Unnamed: 9', 'Unnamed: 10'],
      dtype='object')

In [13]:
sampled_df.drop(columns = ['Unnamed: 9', 'Unnamed: 10'])

Unnamed: 0,Study,Disease/Condition,Subfolder,FCS File,Year,Month,Is there a PDF report?,# of PDF reports associated with fcs file,PDF reports names:
0,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20220719 PASC0175,20220719_PASC0175_001.fcs,2022,7,True,1,20220719_PASC0175_19072022165905.pdf
1,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20220131 PASC0113,20220131_PASC0113_001.fcs,2022,1,True,1,20220131_PASC0113_31012022135046.pdf
2,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20220418 PASC0145,20220418_PASC0145_001.fcs,2022,4,True,1,20220418_PASC0145_18042022154345.pdf
3,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20220209 PASC0122,20220209_PASC0122_001.fcs,2022,2,True,1,20220209_PASC0122_09022022164948.pdf
4,PASC,long COVID (Post-acute sequelae of SARS-CoV-2 ...,20210727 PASC0049,20210727_LC009_BAL01_001.fcs,2021,7,True,1,20210727_LC009_BAL01_27072021164852.pdf
...,...,...,...,...,...,...,...,...,...
108,Lung Transplant,pulmonary fibrosis,/gpfs/projects/b1042/MisharinLab/anna/all_LT_n...,20210805_LT014_001.fcs,2021,8,True,1,20210805_LT014_05082021175022.pdf
109,Lung Transplant,pulmonary fibrosis,/gpfs/projects/b1042/MisharinLab/anna/all_LT_n...,20230927_LT243 LUL-3_001.fcs,2023,9,True,1,20230927_LT243 LUL-3_27092023174017.pdf
110,Lung Transplant,pulmonary fibrosis,/gpfs/projects/b1042/MisharinLab/anna/all_LT_n...,20230310_LT175 RML03_001.fcs,2023,3,True,1,20230310_LT175 RML03_10032023174949.pdf
111,Lung Transplant,pulmonary fibrosis,/gpfs/projects/b1042/MisharinLab/anna/all_LT_n...,20230429_LT195 LUL-01_001.fcs,2023,4,True,1,20230429_LT195 LUL-01_29042023103718.pdf


In [14]:
sampled_df.to_csv(os.path.join(".","training_set_sampled.csv"))

All the scripts bellow how to tak the original folder, find the files from the training set dataset and save them to one folder. 

In [15]:
import shutil

SOURCE_FOLDER = "all_LT_new_trial"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Lung Transplant"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Copy each file
for filename in files_to_copy:
    source_path = os.path.join(SOURCE_FOLDER, filename)
    destination_path = os.path.join(DESTINATION_FOLDER, filename)
    
    try:
        shutil.copy2(source_path, destination_path)
        print(f"Copied: {filename}")
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error copying {filename}: {str(e)}")

print(f"\nFinished! Copied {len(files_to_copy)} files to {DESTINATION_FOLDER}")

Copied: 20220915_LT97 LUL_001.fcs
Copied: 20240308_LT260 LUL-1_001.fcs
Copied: 20230430_LT192 RML-04_001.fcs
Copied: 20221226_LT156 LUL DONOR_001.fcs
Copied: 20230927_LT243 LUL-1_001.fcs
Copied: 20230219_LT178 LUL-2_001.fcs
Copied: 20211111_LT017 RML_001.fcs
Copied: 20231029_LT240 LUL-01_001.fcs
Copied: 20230729_LT224 LUL-3_001.fcs
Copied: 20220901_LT093 LUL_001.fcs
Copied: 20240225_LT261-RML-25FEB2024-2_001.fcs
Copied: 20241110_LT323-LUL-3_001.fcs
Copied: 20230107_LT174 LLL_001.fcs
Copied: 2023909_LT232-RML-04_001.fcs
Copied: 20241231_LT335 LUL_001.fcs
Copied: 20210805_LT014_001.fcs
Copied: 20230927_LT243 LUL-3_001.fcs
Copied: 20230310_LT175 RML03_001.fcs
Copied: 20230429_LT195 LUL-01_001.fcs
Copied: 20240108_LT337-RML-2_001.fcs

Finished! Copied 20 files to testing_data


In [16]:
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Lung Transplant"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Copy each file
for filename in files_to_copy:
    source_path = os.path.join(SOURCE_FOLDER, filename)
    destination_path = os.path.join(DESTINATION_FOLDER, filename)
    
    try:
        shutil.copy2(source_path, destination_path)
        print(f"Copied: {filename}")
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error copying {filename}: {str(e)}")

print(f"\nFinished! Copied {len(files_to_copy)} files to {DESTINATION_FOLDER}")

Copied: 20220915_LT97 LUL_15092022181342.pdf
Copied: 20240308_LT260 LUL-1_08032024142010.pdf
Copied: 20230430_LT192 RML-04_30042023131201.pdf
Copied: 20221226_LT156 LUL DONOR_26122022204201.pdf
Copied: 20230927_LT243 LUL-1_27092023173126.pdf
Copied: 20230219_LT178 LUL-2_19022023170056.pdf
Copied: 20211111_LT017 RML_11112021171101.pdf
Copied: 20231029_LT240 LUL-01_29102023191800.pdf
Copied: 20230729_LT224 LUL-3_30072023022645.pdf
Copied: 20220901_LT093 LUL_01092022160209.pdf
Copied: 20240225_LT261-RML-25FEB2024-2_25022024165726.pdf
Copied: 20241110_LT323-LUL-3_10112024073924.pdf
Copied: 20230107_LT174 LLL_12012023150405.pdf
Copied: 2023909_LT232-RML-04_09092023144524.pdf
Copied: 20241231_LT335 LUL_31122024182312.pdf
Copied: 20210805_LT014_05082021175022.pdf
Copied: 20230927_LT243 LUL-3_27092023174017.pdf
Copied: 20230310_LT175 RML03_10032023174949.pdf
Copied: 20230429_LT195 LUL-01_29042023103718.pdf
Copied: 20240108_LT337-RML-2_08012025175016.pdf

Finished! Copied 20 files to testing_da

In [17]:
"""Repeat for SCRIPT"""

SOURCE_FOLDER = "SCRIPT"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "SCRIPT"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20241106_2131-BAL-00_001.fcs (from SCRIPT/2024_09_10_11_12)
Copied: 20250117_2145-BAL-00_001.fcs (from SCRIPT/2025_01_02_03_04_05)
Copied: 20250307_2159-BAL-00_001.fcs (from SCRIPT/2025_01_02_03_04_05)
Copied: 20250111_2140-BAL-07_001.fcs (from SCRIPT/2025_01_02_03_04_05)
Copied: 20250111_2143-BAL-00_001.fcs (from SCRIPT/2025_01_02_03_04_05)
Copied: 20240817_2112-BAL-00_001.fcs (from SCRIPT/2024_06_07_08)
Copied: 20200524_1293-BAL-09_003.fcs (from SCRIPT/2020_05)
Copied: 20220410_1619-BAL-02_001.fcs (from SCRIPT/2022_04)
Copied: 20200929_1386-BAL-00_002.fcs (from SCRIPT/2020_09)
Copied: 20210208_1472-BAL-00_001.fcs (from SCRIPT/2021_02)
Copied: 20210306_1483-BAL-00_001.fcs (from SCRIPT/2021_03)
Copied: 20230519_1714-BAL-00_001.fcs (from SCRIPT/2023_05)
Copied: 20221209_1679-BAL-00_001.fcs (from SCRIPT/2022_12)
Copied: 20210628_1529-BAL-04_001.fcs (from SCRIPT/2021_06)
Copied: 20230823_2022-BAL-07_001.fcs (from SCRIPT/2023_06_07_08)
Copied: 20210728_1539-BAL-04_001.fcs (from SCR

In [18]:
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "SCRIPT"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20241106_2131-BAL-00_06112024145829.pdf (from SCRIPT/2024_09_10_11_12)
Copied: 20250111_2140-BAL-07_11012025185416.pdf (from SCRIPT/2025_01_02_03_04_05)
Copied: 20250307_2159-BAL-00_07032025153324.pdf (from SCRIPT/2025_01_02_03_04_05)
Copied: 20250117_2145-BAL-00_17012025150930.pdf (from SCRIPT/2025_01_02_03_04_05)
Copied: 20250111_2143-BAL-00_11012025190140.pdf (from SCRIPT/2025_01_02_03_04_05)
Copied: 20240817_2112-BAL-00_17082024184946.pdf (from SCRIPT/2024_06_07_08)
Copied: 20200524_1293-BAL-09_24052020174439.pdf (from SCRIPT/2020_05)
Copied: 20220410_1619-BAL-02_10042022131259.pdf (from SCRIPT/2022_04)
Copied: 20200929_1386-BAL-00_29092020120023.pdf (from SCRIPT/2020_09)
Copied: 20210208_1472-BAL-00_09022021162816.pdf (from SCRIPT/2021_02)
Copied: 20210306_1483-BAL-00_06032021162128.pdf (from SCRIPT/2021_03)
Copied: 20230519_1714-BAL-00_19052023154815.pdf (from SCRIPT/2023_05)
Copied: 20221209_1679-BAL-00_09122022164849.pdf (from SCRIPT/2022_12)
Copied: 20210628_1529-BAL-0

In [19]:
SOURCE_FOLDER = "Abbvie"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Abbvie"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Copy each file
for filename in files_to_copy:
    source_path = os.path.join(SOURCE_FOLDER, filename)
    destination_path = os.path.join(DESTINATION_FOLDER, filename)
    
    try:
        shutil.copy2(source_path, destination_path)
        print(f"Copied: {filename}")
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error copying {filename}: {str(e)}")

print(f"\nFinished! Copied {len(files_to_copy)} files to {DESTINATION_FOLDER}")

Copied: 20231012_Abb Control-1-BAL-2_001.fcs
Copied: 20240417_AbbSSc06_RML_001.fcs
Copied: 20240918_AbbSSc03-9Mo-BAL_001.fcs
Copied: 20250122_AbbSSc06-9mo_blood_001.fcs
Copied: 20240117_Abb04_BAL_RML_001.fcs
Copied: 20241120_AbbSSc04-RML-20NOV2024_001.fcs
Copied: 20240821_AbbControl-02-RML_001.fcs
Copied: 20231006_AbbSsc02-2_001.fcs
Copied: 20240731_AbbVie_02_9mo_BAL_001.fcs
Copied: 20231012_Abb Control-1-BAL-1_001.fcs
File not found: 20240117_Abb04_Whole_Blood_001_001.fcs
File not found: 20231006_AbbSsc02 flow_001.fcs
Copied: 20231006_AbbSsc02-1_001.fcs
File not found: 20230926_AbbSsc003_RML_001.fcs
Copied: 20250311_AbbSSc_09_BAL_001.fcs
Copied: 20231006_AbbSsc02-1_001.fcs
File not found: 20230926_AbbSsc003_WholeBlood_001.fcs
Copied: 20231012_Abb Control-1-BAL-2_001.fcs
Copied: 20241106_AbbSSc-RML-05_001.fcs
Copied: 20250319_AbbSSc_11_BAL_001.fcs

Finished! Copied 20 files to testing_data


In [20]:
SOURCE_FOLDER = "Abbvie"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Abbvie"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Copy each file
for filename in files_to_copy:
    source_path = os.path.join(SOURCE_FOLDER, filename)
    destination_path = os.path.join(DESTINATION_FOLDER, filename)
    
    try:
        shutil.copy2(source_path, destination_path)
        print(f"Copied: {filename}")
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as e:
        print(f"Error copying {filename}: {str(e)}")

print(f"\nFinished! Copied {len(files_to_copy)} files to {DESTINATION_FOLDER}")

Copied: 20231012_Abb Control-1-BAL-2_13102023150141.pdf
Copied: 20240417_AbbSSc06_RML_17042024190721.pdf
Copied: 20240918_AbbSSc03-9Mo-BAL_18092024182229.pdf
Copied: 20250122_AbbSSc06-9mo_22012025184258.pdf
File not found: 20240117_Abb04_BAL_RML_17012024161829.pdf, 20240117_Abb04_Whole_Blood_001_17012024162431.pdf
Copied: 20241120_AbbSSc04-RML-20NOV2024_20112024172351.pdf
Copied: 20240821_AbbControl-02-RML_21082024173646.pdf
Copied: 20231006_AbbSsc02-2_06102023145840.pdf
Copied: 20240731_AbbVie_02_9mo_BAL_31072024153506.pdf
Copied: 20231012_Abb Control-1-BAL-1_13102023144758.pdf
File not found: 20240117_Abb04_BAL_RML_17012024161829.pdf, 20240117_Abb04_Whole_Blood_001_17012024162431.pdf
File not found: 20231006_AbbSsc02 flow_06102023150128.pdf
Copied: 20231006_AbbSsc02-1_06102023144916.pdf
File not found: 20230926_AbbSsc003_RML_26092023185702.pdf, 20230926_AbbSsc003_WholeBlood_26092023175830.pdf
Copied: 20250311_AbbSSc_09_BAL_11032025174753.pdf
Copied: 20231006_AbbSsc02-1_06102023144916

In [21]:
SOURCE_FOLDER = "Abbvie_local"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Abbvie"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20240117_Abb04_BAL_RML_001.fcs (from Abbvie_local/20230117 AbbSSc04)
Copied: 20240117_Abb04_Whole_Blood_001_001.fcs (from Abbvie_local/20230117 AbbSSc04)
Copied: 20230926_AbbSsc003_WholeBlood_001.fcs (from Abbvie_local/20230926 AbbSSc003)
Copied: 20230926_AbbSsc003_RML_001.fcs (from Abbvie_local/20230926 AbbSSc003)
Copied: 20231006_AbbSsc02 flow_001.fcs (from Abbvie_local/20231006 AbbSSc02)
Copied: 20231006_AbbSsc02-2_001.fcs (from Abbvie_local/20231006 AbbSSc02)
Copied: 20231006_AbbSsc02-1_001.fcs (from Abbvie_local/20231006 AbbSSc02)
Copied: 20231012_Abb Control-1-BAL-1_001.fcs (from Abbvie_local/20231013 abb-control-1 and 2)
Copied: 20231012_Abb Control-1-BAL-2_001.fcs (from Abbvie_local/20231013 abb-control-1 and 2)
File not found in any subfolder: 20240417_AbbSSc06_RML_001.fcs
File not found in any subfolder: 20240918_AbbSSc03-9Mo-BAL_001.fcs
File not found in any subfolder: 20250122_AbbSSc06-9mo_blood_001.fcs
File not found in any subfolder: 20241120_AbbSSc04-RML-20NOV202

In [22]:
SOURCE_FOLDER = "Abbvie_local"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Abbvie"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20231006_AbbSsc02-2_06102023145840.pdf (from Abbvie_local/20231006 AbbSSc02)
Copied: 20231006_AbbSsc02-1_06102023144916.pdf (from Abbvie_local/20231006 AbbSSc02)
Copied: 20231006_AbbSsc02 flow_06102023150128.pdf (from Abbvie_local/20231006 AbbSSc02)
Copied: 20231012_Abb Control-1-BAL-2_13102023150141.pdf (from Abbvie_local/20231013 abb-control-1 and 2)
Copied: 20231012_Abb Control-1-BAL-1_13102023144758.pdf (from Abbvie_local/20231013 abb-control-1 and 2)
File not found in any subfolder: 20240417_AbbSSc06_RML_17042024190721.pdf
File not found in any subfolder: 20240918_AbbSSc03-9Mo-BAL_18092024182229.pdf
File not found in any subfolder: 20250122_AbbSSc06-9mo_22012025184258.pdf
File not found in any subfolder: 20240117_Abb04_BAL_RML_17012024161829.pdf, 20240117_Abb04_Whole_Blood_001_17012024162431.pdf
File not found in any subfolder: 20241120_AbbSSc04-RML-20NOV2024_20112024172351.pdf
File not found in any subfolder: 20240821_AbbControl-02-RML_21082024173646.pdf
File not found in

In [24]:
SOURCE_FOLDER = "Duke_ozone"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Duke_ozone"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20220525_MOZEPH035-2_001.fcs (from Duke_ozone/20220525 MOZEPH035-2)
Copied: 20240229_MOLI005-BAL-01_001.fcs (from Duke_ozone/20240229 MOLI005-BAL-01)
Copied: 20211028_CXCL10_CXCR3_010-1_001.fcs (from Duke_ozone/20211028 CXCL10_CXCR3_010-1)
Copied: 20230831_MOLI_001_BAL_01_001.fcs (from Duke_ozone/20230831 MOLI001_BAL_1)
Copied: 20240222_MOLI006-BAL-01_001.fcs (from Duke_ozone/20240222 MOLI006-BAL-01)
Copied: 20220324_MOZEPH030-2_001.fcs (from Duke_ozone/20220324 MOZEPH030-2)
Copied: 20220519_MOZEPH034-1_001.fcs (from Duke_ozone/20220618 MOZEPH034-1)
Copied: 20220603_MOZEPH033-1_001.fcs (from Duke_ozone/20220603 MOZEPH033-1)
Copied: 20211104_MOZEPH015-2_001.fcs (from Duke_ozone/20211104 MOZEPH015-2)
Copied: 20230727_MOZEPH039-02_siglec8TEST_001.fcs (from Duke_ozone/20230727 MOZEPH039-2)
Copied: 20230727_MOZEPH039-02_001.fcs (from Duke_ozone/20230727 MOZEPH039-2)
Copied: 202101202_MOZEPH020-2_001.fcs (from Duke_ozone/20211202 MOZEPH020-2 DUKE)
Copied: 20220701_MOZEPH033-2_001.fcs

In [25]:
SOURCE_FOLDER = "Duke_ozone"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "Duke_ozone"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20220525_MOZEPH035-2_26052022151945.pdf (from Duke_ozone/20220525 MOZEPH035-2)
Copied: 20240229_MOLI005-BAL-01_29022024174300.pdf (from Duke_ozone/20240229 MOLI005-BAL-01)
Copied: 20211028_CXCL10_CXCR3_010-1_28102021123049.pdf (from Duke_ozone/20211028 CXCL10_CXCR3_010-1)
Copied: 20230831_MOLI_001_BAL_01_31082023161746.pdf (from Duke_ozone/20230831 MOLI001_BAL_1)
Copied: 20240222_MOLI006-BAL-01_22022024171339.pdf (from Duke_ozone/20240222 MOLI006-BAL-01)
Copied: 20220324_MOZEPH030-2_24032022173448.pdf (from Duke_ozone/20220324 MOZEPH030-2)
Copied: 20220519_MOZEPH034-1_22062022173858.pdf (from Duke_ozone/20220618 MOZEPH034-1)
Copied: 20220603_MOZEPH033-1_03062022162219.pdf (from Duke_ozone/20220603 MOZEPH033-1)
Copied: 202101202_MOZEPH020-2_02122021155727.pdf (from Duke_ozone/20211202 MOZEPH020-2 DUKE)
Copied: 20220701_MOZEPH033-2_01072022173031.pdf (from Duke_ozone/20220701 MOZEPH033-2)
Copied: 20240606_MOLI007-BAL-02_06062024145002.pdf (from Duke_ozone/20240606 MOLI007-BAL-2)


In [26]:
SOURCE_FOLDER = "WashU_BAL"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "WashU_BAL"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20240415_WU009-BAL-000_16042024162153.pdf (from WashU_BAL/WU009)
Copied: 20240514_WU014-BAL-000_14052024170113.pdf (from WashU_BAL/WU014)
Copied: 20240408_WU012-BAL-000_08052024181906.pdf (from WashU_BAL/WU012)
Copied: 20240120_WU006-BAL-000_20012024182416.pdf (from WashU_BAL/WU006)
Copied: 20240417_WU010-BAL-000_17042024190814.pdf (from WashU_BAL/WU010)
Copied: 20240113_WU003-BAL-001_13012024182143.pdf (from WashU_BAL/WU003)
Copied: 20240120_WU005-BAL-001_20012024180904.pdf (from WashU_BAL/WU005)
Copied: 20240111_WU002-BAL-000_11012024134612.pdf (from WashU_BAL/WU002)
Copied: 20240511_WU013-BAL-000_11052024203423.pdf (from WashU_BAL/WU013)
Copied: 20240404_WU011-BAL-000_03052024180310.pdf (from WashU_BAL/WU011)
Copied: 20240410_WU008-BAL-000_10042024172723.pdf (from WashU_BAL/WU008)
Copied: 20240104_WU001-BAL-000_04012024173621.pdf (from WashU_BAL/WU001)
Copied: 20240117_WU004-BAL-000_17012024155605 2.pdf (from WashU_BAL/WU004)

Summary:
Successfully copied: 13 files
Not found

In [27]:
SOURCE_FOLDER = "WashU_BAL"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "WashU_BAL"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20240415_WU009-BAL-000_001.fcs (from WashU_BAL/WU009)
Copied: 20240514_WU014-BAL-000_001.fcs (from WashU_BAL/WU014)
Copied: 20240408_WU012-BAL-000_001.fcs (from WashU_BAL/WU012)
Copied: 20240120_WU006-BAL-000_001.fcs (from WashU_BAL/WU006)
Copied: 20240417_WU010-BAL-000_001.fcs (from WashU_BAL/WU010)
Copied: 20240113_WU003-BAL-001_001.fcs (from WashU_BAL/WU003)
Copied: 20240120_WU005-BAL-001_001.fcs (from WashU_BAL/WU005)
Copied: 20240111_WU002-BAL-000_001.fcs (from WashU_BAL/WU002)
Copied: 20240511_WU013-BAL-000_001.fcs (from WashU_BAL/WU013)
Copied: 20240404_WU011-BAL-000_001.fcs (from WashU_BAL/WU011)
Copied: 20240410_WU008-BAL-000_001.fcs (from WashU_BAL/WU008)
Copied: 20240104_WU001-BAL-000_001.fcs (from WashU_BAL/WU001)
Copied: 20240117_WU004-BAL-000_001 2.fcs (from WashU_BAL/WU004)

Summary:
Successfully copied: 13 files
Not found: 0 files
Errors: 0 files
Destination: testing_data


In [28]:
SOURCE_FOLDER = "PASC"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "PDF reports names:"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "PASC"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20210416_LC002_16042021164810.pdf (from PASC/20210416 PASC0020)
Copied: 20220209_PASC0122_09022022164948.pdf (from PASC/20220209 PASC0122)
Copied: 20210416_LC003_16042021165838.pdf (from PASC/20210416 PASC0022)
Copied: 20220427_PASC0147_28042022152330.pdf (from PASC/20220427 PASC0147)
Copied: 20220127_PASC0115_27012022164834.pdf (from PASC/20220131 PASC0115)
Copied: 20220323_PASC0137_23032022102957.pdf (from PASC/20220322 PASC0137)
Copied: 20210727_LC009_BAL01_27072021164852.pdf (from PASC/20210727 PASC0049)
Copied: 20210805_LT039_05082021173239.pdf (from PASC/20210805 PASC0059)
Copied: 20210511_LC006_11052021145530.pdf (from PASC/20210511 PASC0018)
Copied: 202101220_PASC0107_20122021180655.pdf (from PASC/20211220 PASC0107)
Copied: 20210427_LC004_27042021140712.pdf (from PASC/20210427 PASC0021)
Copied: 20220105_PASC0111_05012022144531.pdf (from PASC/20220105 PASC0111)
Copied: 20210413_LC001_BAL_01_13042021165347.pdf (from PASC/20210413 PASC0019)
Copied: 20221004_PASC0192_041020

In [29]:
SOURCE_FOLDER = "PASC"  # Where the original files are stored
DESTINATION_FOLDER = "testing_data"  # Where to copy the files
FILE_COLUMN = "FCS File"  # Column in your DataFrame that contains filenames

# Make sure the destination folder exists
os.makedirs(DESTINATION_FOLDER, exist_ok=True)
sampled_LTs = sampled_df[sampled_df["Study"] == "PASC"]
# Get the list of files from your sampled DataFrame
files_to_copy = sampled_LTs[FILE_COLUMN].tolist()

# Create a dictionary to track found/not found files
copy_results = {
    'copied': 0,
    'not_found': [],
    'errors': []
}

# Walk through all subdirectories to find the files
for root, dirs, files in os.walk(SOURCE_FOLDER):
    for file in files:
        if file in files_to_copy:
            source_path = os.path.join(root, file)
            destination_path = os.path.join(DESTINATION_FOLDER, file)
            
            try:
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file} (from {root})")
                copy_results['copied'] += 1
                # Remove found file from list to avoid duplicate searches
                files_to_copy.remove(file)
            except Exception as e:
                print(f"Error copying {file}: {str(e)}")
                copy_results['errors'].append(file)

# Check for any files that weren't found
for missing_file in files_to_copy:
    print(f"File not found in any subfolder: {missing_file}")
    copy_results['not_found'].append(missing_file)

print(f"\nSummary:")
print(f"Successfully copied: {copy_results['copied']} files")
print(f"Not found: {len(copy_results['not_found'])} files")
print(f"Errors: {len(copy_results['errors'])} files")
print(f"Destination: {DESTINATION_FOLDER}")

Copied: 20210416_LC002_001.fcs (from PASC/20210416 PASC0020)
Copied: 20220209_PASC0122_001.fcs (from PASC/20220209 PASC0122)
Copied: 20210416_LC003_001.fcs (from PASC/20210416 PASC0022)
Copied: 20220427_PASC0147_001.fcs (from PASC/20220427 PASC0147)
Copied: 20220127_PASC0115_001.fcs (from PASC/20220131 PASC0115)
Copied: 20220323_PASC0137_001.fcs (from PASC/20220322 PASC0137)
Copied: 20210727_LC009_BAL01_001.fcs (from PASC/20210727 PASC0049)
Copied: 20210805_LT039_001.fcs (from PASC/20210805 PASC0059)
Copied: 20210511_LC006_001.fcs (from PASC/20210511 PASC0018)
Copied: 202101220_PASC0107_001.fcs (from PASC/20211220 PASC0107)
Copied: 20210427_LC004_001.fcs (from PASC/20210427 PASC0021)
Copied: 20220105_PASC0111_001.fcs (from PASC/20220105 PASC0111)
Copied: 20210511_LC005_001.fcs (from PASC/20210511 PASC0017)
Copied: 20210413_LC001_BAL_01_001.fcs (from PASC/20210413 PASC0019)
Copied: 20221004_PASC0192_001.fcs (from PASC/20221004 PASC0192)
Copied: 20220719_PASC0175_001.fcs (from PASC/20220