In [None]:
import pandas as pd
import os
import glob

# Input folders holding the raw report exports (.csv / .xlsx) to convert.
folder_paths = ["cloudfront-top-referrers-history","cloudfront-popular-objects-history"]

# Root folder for the converted Parquet files (one sub-folder per input folder).
output_folder_path = "cloudfront-historical-data"

# Layout of a report export as assumed by this cell.
METADATA_PREVIEW_ROWS = 6            # rows read from the top of the file when looking for metadata
METADATA_ROW_INDICES = range(1, 5)   # rows 2-5 (0-based indices 1-4) are treated as key/value metadata
# Lines skipped before the data table; the next line becomes the column header.
# NOTE(review): the previous comment said "skiprows=6" while the code used 7.
# The value used by the code is kept here -- confirm against a real export.
DATA_SKIPROWS = 7


def read_report(file_path, **kwargs):
    """Read a report with the pandas reader that matches its extension.

    Files ending in ``.xlsx`` (case-insensitive, because glob may match
    upper-case extensions on case-insensitive filesystems) go through
    ``pd.read_excel``; everything else goes through ``pd.read_csv``.
    Extra keyword arguments are forwarded to the reader unchanged.
    """
    reader = pd.read_excel if file_path.lower().endswith('.xlsx') else pd.read_csv
    return reader(file_path, **kwargs)


def extract_metadata(meta_df):
    """Build a ``{key: value}`` dict from the metadata rows of a headerless preview frame.

    Column 0 supplies the key and column 1 the value, both converted to ``str``.
    A missing key becomes ``field_<row index>``; a missing value (or a frame with
    only one column) becomes an empty string. Rows beyond the end of the frame
    are ignored.
    """
    metadata = {}
    has_value_column = len(meta_df.columns) > 1
    for i in METADATA_ROW_INDICES:
        if i >= len(meta_df):
            break  # indices are ascending, so no later row exists either
        raw_key = meta_df.iloc[i, 0]
        raw_value = meta_df.iloc[i, 1] if has_value_column else None
        key = str(raw_key) if pd.notna(raw_key) else f"field_{i}"
        metadata[key] = str(raw_value) if pd.notna(raw_value) else ""
    return metadata


def clean_column_names(columns):
    """Return column labels as stripped strings with double quotes removed.

    Labels are passed through ``str()`` first so that non-string headers
    (e.g. numeric labels coming from an Excel sheet) do not break the cleaning
    step, and so every column name is a string when written to Parquet.
    """
    return [str(label).strip().replace('"', '') for label in columns]


def convert_report(file_path, output_dir):
    """Convert one report file to Parquet inside ``output_dir``.

    The file is read twice: once without a header to pick up the metadata rows,
    and once skipping ``DATA_SKIPROWS`` lines to load the data table. Each
    metadata entry is added to the table as a constant column. The output file
    keeps the input's base name with a ``.parquet`` extension.
    """
    meta_df = read_report(file_path, header=None, nrows=METADATA_PREVIEW_ROWS)
    metadata = extract_metadata(meta_df)
    print(f"Metadata: {metadata}")

    df = read_report(file_path, skiprows=DATA_SKIPROWS)
    df.columns = clean_column_names(df.columns)

    # NOTE(review): a metadata key equal to an existing data column name
    # overwrites that column with the constant metadata value.
    for key, value in metadata.items():
        df[key] = value

    output_filename = os.path.splitext(os.path.basename(file_path))[0] + ".parquet"
    os.makedirs(output_dir, exist_ok=True)
    df.to_parquet(os.path.join(output_dir, output_filename), index=False)

    print(f"Saved {len(df)} rows to {output_filename}")


# Files that raised during conversion, reported once all folders are processed.
failed_files = []

for folder_path in folder_paths:
    # Sorted so the processing order does not depend on the filesystem.
    files = sorted(
        glob.glob(os.path.join(folder_path, "*.csv"))
        + glob.glob(os.path.join(folder_path, "*.xlsx"))
    )
    print(f"Found {len(files)} files to process")

    output_dir = os.path.join(output_folder_path, folder_path)

    for file_path in files:
        print(f"\nProcessing: {os.path.basename(file_path)}")

        try:
            convert_report(file_path, output_dir)
        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch,
            # but remember it so the failure is visible in the final summary.
            print(f"Error processing {file_path}: {e}")
            failed_files.append(file_path)

if failed_files:
    print(f"\n{len(failed_files)} file(s) failed to convert:")
    for failed_path in failed_files:
        print(f"  {failed_path}")

print("\nDone!")

## Testing

In [None]:
# Sanity check: for each input folder, reload every Parquet file written under
# the output folder and display the combined table.
for folder_path in folder_paths:
    parquet_folder = os.path.join(output_folder_path, folder_path)
    # Sorted so the row order of the combined frame is reproducible.
    parquet_files = sorted(glob.glob(os.path.join(parquet_folder, "*.parquet")))

    print(f"Found {len(parquet_files)} parquet files")

    if not parquet_files:
        # pd.concat raises ValueError on an empty list, so skip folders
        # that have no converted output instead of aborting the cell.
        print(f"No parquet files in {parquet_folder}; skipping")
        continue

    # Read and combine all parquet files
    all_dfs = []
    for file_path in parquet_files:
        df = pd.read_parquet(file_path)
        print(f"Loaded {os.path.basename(file_path)}: {len(df)} rows, {len(df.columns)} columns")
        all_dfs.append(df)

    # Combine into single dataframe
    combined_df = pd.concat(all_dfs, ignore_index=True)
    display(combined_df)