In [1]:
import os
import json
import gzip
import pandas as pd

# Paths
metadata_folder = "metadata"
image_csv_path = "images.csv"

# Collect all listings_*.json.gz metadata files.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# order of loaded records (and of the CSV written later) reproducible.
metadata_files = sorted(
    f for f in os.listdir(metadata_folder)
    if f.startswith("listings_") and f.endswith(".json.gz")
)

all_metadata = []
skipped_lines = 0  # malformed JSON lines that were skipped across all files

# Load metadata line-by-line (one JSON document per line) from every file.
for file_name in metadata_files:
    file_path = os.path.join(metadata_folder, file_name)
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        try:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank lines are not JSON documents; json.loads("") would raise.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    # Skip only the offending line so that one bad record does
                    # not discard the remainder of the file.
                    skipped_lines += 1
                    print(f"⚠️ Skipping malformed line {line_number} in {file_name}: {e}")
                    continue
                all_metadata.append(data)
        except Exception as e:
            # Stream-level failure (e.g. corrupt or truncated gzip data): the rest
            # of this file cannot be read. Report it and continue with the next
            # file, keeping the original best-effort behaviour.
            print(f"❌ Error loading {file_name}: {e}")

print(f"🔍 Loaded {len(all_metadata)} metadata entries.")
if skipped_lines:
    print(f"⚠️ Skipped {skipped_lines} malformed line(s).")

# Flatten the list of loaded metadata records into a DataFrame
metadata_df = pd.json_normalize(all_metadata)

# Read the image table from disk
image_df = pd.read_csv(image_csv_path)

# Inner join: keep only the rows where a listing's main_image_id
# matches an image_id in the image table
merged_df = pd.merge(
    metadata_df,
    image_df,
    how="inner",
    left_on="main_image_id",
    right_on="image_id",
)

print(f"✅ Merged {len(merged_df)} entries with image data.")

# Write the joined table to CSV, omitting the DataFrame index
merged_df.to_csv("filtered_metadata_with_images.csv", index=False)
print("📁 Saved to 'filtered_metadata_with_images.csv'")


🔍 Loaded 147702 metadata entries.
✅ Merged 147127 entries with image data.
📁 Saved to 'filtered_metadata_with_images.csv'
