# Downloaded spreadsheet file inventory

In [3]:
import os
from datetime import datetime
import pandas as pd
from pathlib import Path


In [4]:
# modified Google Gemini output

path_obj = Path('/Volumes/ap180/spreadsheets_corpus')
data = []
max_depth = 0

# Iterate through all files and directories recursively
# We use rglob('*') to find all items, but only process files.
for item in path_obj.rglob('*'):
    if item.is_file(): # Only process files
        row_data = {
            'Directory': str(path_obj.name), # The name of the root directory being scanned
            'File': item.name,              # The name of the file
            'Type': 'File'                  # Type is always 'File'
        }

        current_level_parts = [] # To store the components of the subdirectory path for the current file

        # For a file, the subdirectory levels are determined by its parent's path relative to the root
        if item.parent != path_obj: # Check if the parent is not the root directory itself
            relative_parent_path = item.parent.relative_to(path_obj)
            print(relative_parent_path)
            current_level_parts = list(relative_parent_path.parts)
            print(current_level_parts)

        # Update the maximum depth found across all items to know how many level columns to create
        max_depth = max(max_depth, len(current_level_parts))

        # Assign each part of the subdirectory path to its respective 'Subdirectory_Level_X' column
        for i, part in enumerate(current_level_parts):
            row_data[f'Subdirectory_Level_{i + 1}'] = part

        data.append(row_data)

# Create the initial DataFrame from the collected data
df = pd.DataFrame(data)

# Ensure all potential 'Subdirectory_Level_X' columns exist and fill any missing values (NaNs)
# with empty strings for shallower paths (e.g., files directly in root)
for i in range(1, max_depth + 1):
    col_name = f'Subdirectory_Level_{i}'
    if col_name not in df.columns:
        df[col_name] = '' # Add column if it was not present in any row data
    df[col_name] = df[col_name].fillna('') # Fill NaN values with empty strings

# Define the desired order of columns in the final DataFrame
ordered_columns = ['Directory']
for i in range(1, max_depth + 1):
    ordered_columns.append(f'Subdirectory_Level_{i}')
ordered_columns.extend(['File', 'Type'])

# Filter the ordered_columns list to only include columns that actually exist in the DataFrame
df = df[[col for col in ordered_columns if col in df.columns]]

# Sort the DataFrame for better readability.
# The sort keys are dynamically constructed based on the available subdirectory level columns.
sort_by_cols = ['Directory']
for i in range(1, max_depth + 1):
    if f'Subdirectory_Level_{i}' in df.columns:
        sort_by_cols.append(f'Subdirectory_Level_{i}')
sort_by_cols.append('File')
df = df.sort_values(by=sort_by_cols).reset_index(drop=True)


3520917
['3520917']
3520917
['3520917']
3520917
['3520917']
3520917
['3520917']
3520917
['3520917']
3520917
['3520917']
3520917
['3520917']
10606483
['10606483']
10606483
['10606483']
10606483
['10606483']
10606483
['10606483']
7326500
['7326500']
11165918
['11165918']
11165918
['11165918']
11165918
['11165918']
11165918
['11165918']
7576974
['7576974']
5042194
['5042194']
7692555
['7692555']
7692555
['7692555']
7692555
['7692555']
7692555
['7692555']
13236575
['13236575']
10118192
['10118192']
10118192
['10118192']
10118192
['10118192']
10118192
['10118192']
10118192
['10118192']
10118192
['10118192']
5770442
['5770442']
5770442
['5770442']
5770442
['5770442']
5770442
['5770442']
5770442
['5770442']
6984998
['6984998']
6984998
['6984998']
6984998
['6984998']


In [5]:
df

Unnamed: 0,Directory,Subdirectory_Level_1,File,Type
0,spreadsheets_corpus,,.DS_Store,File
1,spreadsheets_corpus,10118192.0,Eukaryotic_virus_CV_Dataset-1.csv,File
2,spreadsheets_corpus,10118192.0,Prokaryotic_virus_CV_Dataset-1.csv,File
3,spreadsheets_corpus,10118192.0,Test_Eukaryotic_virus_Dataset-1.csv,File
4,spreadsheets_corpus,10118192.0,Test_Prokaryotic_virus_Dataset-1.csv,File
5,spreadsheets_corpus,10118192.0,gut_virome.csv,File
6,spreadsheets_corpus,10118192.0,marine_virome_id.csv,File
7,spreadsheets_corpus,10606483.0,Grain Size of Modern TX Environments.xlsx,File
8,spreadsheets_corpus,10606483.0,MAI49 and MAI35 210Pb 137Cs.xlsx,File
9,spreadsheets_corpus,10606483.0,MAI49 and MAI35 Grain Size.xlsx,File


In [8]:
df['Subdirectory_Level_1']

0             
1     10118192
2     10118192
3     10118192
4     10118192
5     10118192
6     10118192
7     10606483
8     10606483
9     10606483
10    10606483
11    11165918
12    11165918
13    11165918
14    11165918
15    13236575
16     3520917
17     3520917
18     3520917
19     3520917
20     3520917
21     3520917
22     3520917
23     5042194
24     5770442
25     5770442
26     5770442
27     5770442
28     5770442
29     6984998
30     6984998
31     6984998
32     7326500
33     7576974
34     7692555
35     7692555
36     7692555
37     7692555
Name: Subdirectory_Level_1, dtype: object

In [None]:
#df.loc[df['Subdirectory_Level_2'].str.contains('mondrian')]

In [7]:
filetime = datetime.now()
filetime = filetime.strftime('%Y-%m-%d_%I-%M_%p')
df.to_csv(f'file_inventory_{filetime}.csv',index=False)

In [None]:
# Full Google Gemini Response, Flash 2.5, 2025-06-26
# Prompts:
# 1: Can you write a short Python script using pathlib to walk through a directory and its subdirectories, creating columns in a Pandas dataframe for directory, subdirectories, and files?
# 2: I don't need to create a dummy directory, can you remove that part?
# 3: I may have two levels of subdirectories. Can you modify the script to put each level of subdirectory in its own column?
# 4: I do not need to store rows of type Directory in my dataframe. Can you remove that part of the script?

# import pandas as pd
# from pathlib import Path
# import os
# import tempfile
# import shutil

# def walk_directory_to_dataframe(directory_path: str) -> pd.DataFrame:
#     """
#     Walks through a specified directory and its subdirectories,
#     creating a Pandas DataFrame with columns for the root directory,
#     individual subdirectory levels (e.g., 'Subdirectory_Level_1', 'Subdirectory_Level_2'),
#     and files. Only files are included in the DataFrame.

#     Args:
#         directory_path (str): The path to the root directory to traverse.

#     Returns:
#         pd.DataFrame: A DataFrame containing the file structure.
#                       Columns will include 'Directory', dynamically generated
#                       'Subdirectory_Level_X' columns, 'File', and 'Type' (always 'File').
#     """
#     path_obj = Path(directory_path)
#     if not path_obj.exists():
#         print(f"Error: The specified directory '{directory_path}' does not exist.")
#         return pd.DataFrame() # Return an empty DataFrame

#     if not path_obj.is_dir():
#         print(f"Error: The specified path '{directory_path}' is not a directory.")
#         return pd.DataFrame() # Return an empty DataFrame

#     data = []
#     max_depth = 0

#     # Iterate through all files and directories recursively
#     # We use rglob('*') to find all items, but only process files.
#     for item in path_obj.rglob('*'):
#         if item.is_file(): # Only process files
#             row_data = {
#                 'Directory': str(path_obj.name), # The name of the root directory being scanned
#                 'File': item.name,              # The name of the file
#                 'Type': 'File'                  # Type is always 'File'
#             }

#             current_level_parts = [] # To store the components of the subdirectory path for the current file

#             # For a file, the subdirectory levels are determined by its parent's path relative to the root
#             if item.parent != path_obj: # Check if the parent is not the root directory itself
#                 relative_parent_path = item.parent.relative_to(path_obj)
#                 current_level_parts = list(relative_parent_path.parts)

#             # Update the maximum depth found across all items to know how many level columns to create
#             max_depth = max(max_depth, len(current_level_parts))

#             # Assign each part of the subdirectory path to its respective 'Subdirectory_Level_X' column
#             for i, part in enumerate(current_level_parts):
#                 row_data[f'Subdirectory_Level_{i + 1}'] = part

#             data.append(row_data)

#     # If no data was collected (e.g., empty directory or only directories), return an empty DataFrame
#     if not data:
#         return pd.DataFrame()

#     # Create the initial DataFrame from the collected data
#     df = pd.DataFrame(data)

#     # Ensure all potential 'Subdirectory_Level_X' columns exist and fill any missing values (NaNs)
#     # with empty strings for shallower paths (e.g., files directly in root)
#     for i in range(1, max_depth + 1):
#         col_name = f'Subdirectory_Level_{i}'
#         if col_name not in df.columns:
#             df[col_name] = '' # Add column if it was not present in any row data
#         df[col_name] = df[col_name].fillna('') # Fill NaN values with empty strings

#     # Define the desired order of columns in the final DataFrame
#     ordered_columns = ['Directory']
#     for i in range(1, max_depth + 1):
#         ordered_columns.append(f'Subdirectory_Level_{i}')
#     ordered_columns.extend(['File', 'Type'])

#     # Filter the ordered_columns list to only include columns that actually exist in the DataFrame
#     df = df[[col for col in ordered_columns if col in df.columns]]

#     # Sort the DataFrame for better readability.
#     # The sort keys are dynamically constructed based on the available subdirectory level columns.
#     if not df.empty:
#         sort_by_cols = ['Directory']
#         for i in range(1, max_depth + 1):
#             if f'Subdirectory_Level_{i}' in df.columns:
#                 sort_by_cols.append(f'Subdirectory_Level_{i}')
#         sort_by_cols.append('File')
#         df = df.sort_values(by=sort_by_cols).reset_index(drop=True)

#     return df

# if __name__ == "__main__":
#     # This block creates a temporary directory and populates it with a multi-level structure
#     # to demonstrate the script's functionality without affecting your actual file system.
#     with tempfile.TemporaryDirectory() as tmpdir:
#         base_demo_path = Path(tmpdir) / "my_multi_level_root"
#         base_demo_path.mkdir(parents=True, exist_ok=True) # Ensure the root for demo exists

#         # Creating a sample multi-level directory and file structure
#         (base_demo_path / "level1_dir_A").mkdir()
#         (base_demo_path / "level1_dir_B").mkdir()
#         (base_demo_path / "level1_dir_A" / "file_A1.txt").write_text("Content A1")
#         (base_demo_path / "level1_dir_A" / "level2_dir_AA").mkdir() # This directory itself won't be in df
#         (base_demo_path / "level1_dir_A" / "level2_dir_AA" / "file_AA1.log").write_text("Content AA1")
#         (base_demo_path / "level1_dir_A" / "level2_dir_AA" / "level3_dir_AAA").mkdir() # This directory itself won't be in df
#         (base_demo_path / "level1_dir_A" / "level2_dir_AA" / "level3_dir_AAA" / "data.csv").write_text("data,1,2,3")
#         (base_demo_path / "level1_dir_B" / "file_B1.jpg").write_text("Binary data B1")
#         (base_demo_path / "root_file.txt").write_text("Root file content")
#         (base_demo_path / "another_root_file.md").write_text("Another root file.")
#         (base_demo_path / "single_level_dir").mkdir() # This directory itself won't be in df
#         (base_demo_path / "single_level_dir" / "single_file.json").write_text("{}")
#         (base_demo_path / "level1_dir_A" / "level2_dir_AA" / "empty_dir").mkdir() # Empty dir, won't be in df

#         target_directory = str(base_demo_path)
#         print(f"\n--- Walking through the directory: {target_directory} and creating DataFrame with only files ---")
#         df_result = walk_directory_to_dataframe(target_directory)

#         if not df_result.empty:
#             print("\nDataFrame created successfully:")
#             # Using to_string() to display the full DataFrame without truncation in the console
#             print(df_result.to_string())
#         else:
#             print("\nNo data found or DataFrame is empty.")
#             print("Note: This could happen if the directory contains only subdirectories and no files.")


#         print(f"\nTemporary directory '{tmpdir}' and its contents will be removed.")


## Initial attempts

In [None]:
#https://appdividend.com/list-of-files-in-directory-and-subdirectories-in-python/
file_list=[]
for dirpath, dirnames, filenames in os.walk('outputs'):
    #print(dirpath,dirnames,filenames)
    file_dict={}
    for filename in filenames:
        print(dirpath,filename)
        file_dict['dirname'] = dirpath
        file_dict['filename'] = filename
        print(file_dict)
        file_list.append(file_dict)
        #print(os.path.join(dirpath, filename))
print(file_list)

In [None]:
file_inventory = pd.DataFrame(file_list)

In [None]:
file_inventory

In [None]:
file_inventory['dirname'].str.split('/')

In [None]:
#https://www.statology.org/pandas-split-column/
file_inventory[['dirname','subdir']] = file_inventory['dirname'].str.split('/', n=1, expand=True)
#df['A'].str.split(',', 1, expand=True)

In [None]:
file_inventory

In [None]:
#https://www.geeksforgeeks.org/python/python-list-all-files-in-directory-and-subdirectories/
from pathlib import Path

def list_files_pathlib(path=Path('.')):
    for entry in path.iterdir():
        if entry.is_file():
            print(entry)
        elif entry.is_dir():
            list_files_pathlib(entry)

# Specify the directory path you want to start from
directory_path = Path('outputs')
list_files_pathlib(directory_path)

In [None]:
#https://www.geeksforgeeks.org/python/python-list-all-files-in-directory-and-subdirectories/
from pathlib import Path
path = Path('outputs')

#def list_files_pathlib(path=Path('.')):
for entry in path.iterdir():
    print(entry.parent())
    # if entry.is_file():
    #     print(entry)
    # elif entry.is_dir():
    #     list_files_pathlib(entry)

# # Specify the directory path you want to start from
# directory_path = Path('outputs')
# list_files_pathlib(directory_path)

In [None]:
from pathlib import Path


#def list_files(directory):
path = Path('outputs')
# rglob pattern '*' matches all files and directories
for file_path in path.rglob('*'):
    if file_path.is_file():  # Check if it's a file
        print(file_path)
    if file_path.is_dir():
        print(file_path)

In [None]:
#Google Gemini Flash 2.5 version, modified

path_obj = Path('outputs')
# if not path_obj.exists():
#     print(f"Error: The specified directory '{directory_path}' does not exist.")
#     return pd.DataFrame() # Return an empty DataFrame

# if not path_obj.is_dir():
#     print(f"Error: The specified path '{directory_path}' is not a directory.")
#     return pd.DataFrame() # Return an empty DataFrame

data = []

# Iterate through all files and directories recursively
for item in path_obj.rglob('*'): # rglob('*') iterates over all contents recursively
    # if item.is_dir():
    #     dir_name = item.name
    #     parent_dir = item.parent.relative_to(path_obj) if item.parent != path_obj else Path('')
    #     data.append({
    #         'Directory': str(path_obj.name),
    #         'Subdirectory': str(parent_dir / dir_name) if parent_dir else dir_name,
    #         'File': '',
    #         'Type': 'Directory'
    #     })
    if item.is_file():
        file_name = item.name
        parent_dir = item.parent.relative_to(path_obj) if item.parent != path_obj else Path('')
        data.append({
            'dir': str(path_obj.name),
            'subdir': str(parent_dir),
            'filename': file_name,
            'type': 'File'
        })

# Sort data for better readability (optional)
df = pd.DataFrame(data)
df = df.sort_values(by=['dir', 'subdir', 'filename']).reset_index(drop=True)