In [3]:
import pandas as pd
import glob
import pytz
from pathlib import Path

def convert_to_local(ts_utc, country_code):
    """Convert a UTC‐localized timestamp to the local timezone for a given country code."""
    try:
        tz_name = pytz.country_timezones[country_code][0]
    except (KeyError, IndexError):
        tz_name = "UTC"
    return ts_utc.tz_convert(tz_name)

def main():
    # 1. Discover files via glob
    data_dir = Path("/Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History")
    json_files = glob.glob(str(data_dir / "*.json"))
    
    # 2. Load & concatenate
    dfs = []
    for fp in json_files:
        print(f"Loading {fp}...")
        dfs.append(pd.read_json(fp))  # assumes each JSON is a list of records
    df = pd.concat(dfs, ignore_index=True)
    
    # Record initial row count
    counts = {}
    counts['initial'] = len(df)
    
     # 3. Filter out any row with a non-null value in any of these seven columns
    drop_cols = [
        "episode_name", "episode_show_name", "spotify_episode_uri",
        "audiobook_title", "audiobook_uri",
        "audiobook_chapter_uri", "audiobook_chapter_title"
    ]
    before = len(df)
    mask_has_media = df[drop_cols].notnull().any(axis=1)
    df = df[~mask_has_media]
    counts['after_media_filter'] = len(df)
    print(f"Dropped {before - counts['after_media_filter']} non-track rows (non-null media)")
    
    # 4. Drop the now-unneeded columns
    df.drop(columns=drop_cols, inplace=True)
    
    # 5. Deduplicate (byte-for-byte across all columns)
    before = len(df)
    df.drop_duplicates(keep="first", inplace=True)
    counts['after_dedup'] = len(df)
    print(f"Dropped {before - counts['after_dedup']} duplicate rows")
    
    # 6. Filter out rows with missing/blank track or artist
    before = len(df)
    track_blank = df['master_metadata_track_name'].fillna("").eq("")
    artist_blank = df['master_metadata_album_artist_name'].fillna("").eq("")
    df = df[~(track_blank | artist_blank)]
    counts['after_track_artist_filter'] = len(df)
    print(f"Dropped {before - counts['after_track_artist_filter']} rows missing track or artist")
    
    # 7. Timezone conversion
    # Parse 'ts' to UTC-aware all at once:
    df['ts_utc'] = pd.to_datetime(df['ts'], utc=True)

    # Now convert each row into its country-local timezone:
    df['ts_local'] = df.apply(
        lambda row: convert_to_local(row['ts_utc'], row.get('conn_country', '')), axis=1
    )
    
    # 8. Summary of removals
    print("\nSummary of row counts:")
    print(f"  Initial:             {counts['initial']}")
    print(f"  After media filter:  {counts['after_media_filter']}  (−{counts['initial'] - counts['after_media_filter']})")
    print(f"  After dedup:         {counts['after_dedup']}  (−{counts['after_media_filter'] - counts['after_dedup']})")
    print(f"  After track/artist:  {counts['after_track_artist_filter']}  (−{counts['after_dedup'] - counts['after_track_artist_filter']})")
    
    # 9. Export to CSV
    out_path = data_dir / "combined_cleaned_streaming_history.csv"
    df.drop(columns=['ts_utc'], inplace=True)  # optional: drop the helper column
    df.to_csv(out_path, index=False)
    print(f"\nCleaned data written to: {out_path}")

if __name__ == "__main__":
    main()

Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Audio_2021-2022_5.json...
Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Audio_2019-2020_2.json...
Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Audio_2020-2021_4.json...
Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Audio_2022-2023_7.json...
Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Audio_2014-2018_0.json...
Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Video_2020-2024.json...
Loading /Users/baptistemeynet/Downloads/Projects/spotify_dashboard/Spotify Extended Streaming History/Streaming_History_Audio_