In [1]:
from pathlib import Path
from typing import Iterable, Optional, Union
import pandas as pd
import json

## Functions

In [2]:
def spotify_history_for_month(json_path, year, month):
    """
    Load one Spotify listening-history JSON export and keep a single calendar month.

    Two export layouts are recognised by their timestamp column:
      - Classic: ``endTime`` (alongside artistName, trackName, msPlayed)
      - Extended: ``ts`` (alongside master_metadata_album_artist_name,
        master_metadata_track_name, ms_played)

    Parameters
    ----------
    json_path : path of the exported .json file to read (UTF-8).
    year : calendar year to keep.
    month : calendar month number to keep.

    Returns
    -------
    DataFrame holding the file's columns plus a parsed ``played_at`` column,
    limited to plays in the requested month, sorted by ``played_at`` and
    re-indexed from 0.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    history = pd.DataFrame(records)

    # Classic exports carry "endTime"; anything else is read as extended ("ts").
    timestamp_col = "endTime" if "endTime" in history.columns else "ts"
    history["played_at"] = pd.to_datetime(history[timestamp_col])

    # Build the two calendar conditions separately, then combine them.
    played_at = history["played_at"]
    in_year = played_at.dt.year == year
    in_month = played_at.dt.month == month
    selected = history[in_year & in_month]

    chronological = selected.sort_values("played_at")
    return chronological.reset_index(drop=True)

In [3]:
def top_15_songs_separate_rankings(df, n=15):
    """
    Rank songs two ways: by total listening time and by number of plays.

    Parameters
    ----------
    df : listening-history DataFrame in either Spotify export layout
        (classic: artistName / trackName / msPlayed; extended:
        master_metadata_album_artist_name / master_metadata_track_name /
        ms_played).
    n : how many songs to keep in each ranking (default 15).

    Returns
    -------
    (top_by_seconds, top_by_plays) : two DataFrames with the artist column,
    the track column, ``seconds_played`` and ``play_count``, each limited to
    ``n`` rows and indexed from 0.

    Ties are broken by the other metric (then by artist/track order), so the
    rankings are reproducible even when many songs share the same play count.
    """
    # Handle both Spotify export formats
    if "trackName" in df.columns:
        track_col = "trackName"
        artist_col = "artistName"
        ms_col = "msPlayed"
    else:
        track_col = "master_metadata_track_name"
        artist_col = "master_metadata_album_artist_name"
        ms_col = "ms_played"

    # One row per (artist, track); built-in aggregations avoid a Python-level
    # lambda call per group.
    summary = (
        df.groupby([artist_col, track_col])[ms_col]
          .agg(seconds_played="sum", play_count="count")
          .reset_index()
    )
    # Spotify reports milliseconds; the report works in seconds.
    summary["seconds_played"] = summary["seconds_played"] / 1000

    top_by_seconds = (
        summary.sort_values(["seconds_played", "play_count"], ascending=False)
               .head(n)
               .reset_index(drop=True)
    )

    # Play counts tie very often, so fall back to listening time to pick a
    # meaningful winner instead of whatever order the sort happens to leave.
    top_by_plays = (
        summary.sort_values(["play_count", "seconds_played"], ascending=False)
               .head(n)
               .reset_index(drop=True)
    )

    return top_by_seconds, top_by_plays

In [4]:
def songs_not_in_both(top_by_seconds, top_by_plays):
    """
    Return songs that appear in one ranking but not the other.

    Parameters
    ----------
    top_by_seconds, top_by_plays : ranking DataFrames that share the same
        artist/track columns, in either Spotify export layout
        (artistName / trackName, or master_metadata_album_artist_name /
        master_metadata_track_name).

    Returns
    -------
    dict with two keys, ``only_in_top_by_seconds`` and
    ``only_in_top_by_plays``; each value is a sorted list of
    ``(artist, track)`` tuples.
    """
    # Handle both Spotify export formats, like the rest of the pipeline
    if "trackName" in top_by_seconds.columns:
        cols = ["artistName", "trackName"]
    else:
        cols = ["master_metadata_album_artist_name", "master_metadata_track_name"]

    set_seconds = set(top_by_seconds[cols].itertuples(index=False, name=None))
    set_plays = set(top_by_plays[cols].itertuples(index=False, name=None))

    # Sets have no defined order; sort so the report is the same on every run.
    return {
        "only_in_top_by_seconds": sorted(set_seconds - set_plays),
        "only_in_top_by_plays": sorted(set_plays - set_seconds),
    }

In [5]:
def remove_christmas_songs(df):
    """
    Drop rows whose track title looks like a Christmas / holiday song.

    A title is dropped when its lower-cased text contains any of the holiday
    keywords below as a substring. Rows with a missing title are kept.
    Works on both Spotify export layouts (``trackName`` or
    ``master_metadata_track_name``).

    Returns a new DataFrame indexed from 0.
    """
    # Classic exports use "trackName"; otherwise assume the extended layout.
    title_col = "trackName" if "trackName" in df.columns else "master_metadata_track_name"

    holiday_terms = (
        "christmas", "xmas", "holiday", "santa", "reindeer",
        "snow", "winter", "jingle", "noel", "nativity", "hallelujah",
    )
    holiday_pattern = "|".join(holiday_terms)

    lowered_titles = df[title_col].str.lower()
    # na=False: a missing title counts as "not a holiday song".
    is_holiday = lowered_titles.str.contains(holiday_pattern, na=False)

    kept = df[~is_holiday]
    return kept.reset_index(drop=True)

In [6]:
def remove_soundtracks(df):
    """
    Remove soundtrack songs from a Spotify history DataFrame.

    A row is dropped when its lower-cased track title contains one of the
    title keywords, or its lower-cased artist name contains one of the
    artist keywords. Rows with a missing title or artist are kept.

    Works on both Spotify export layouts:
      - Classic: trackName / artistName
      - Extended: master_metadata_track_name / master_metadata_album_artist_name

    Returns a new DataFrame indexed from 0.
    """
    # Handle both Spotify export formats
    if "trackName" in df.columns:
        track_col = "trackName"
    else:
        track_col = "master_metadata_track_name"

    # The extended export names the artist column "...album_artist_name"
    # (same column the ranking step groups by).
    if "artistName" in df.columns:
        artist_col = "artistName"
    else:
        artist_col = "master_metadata_album_artist_name"

    # These are regex alternatives. "ost" is matched as a whole word only;
    # as a bare substring it would also drop titles such as "Ghost" or "Lost".
    title_keywords = [
        "soundtrack", "score", r"\bost\b", "original motion picture", "film music",
        "movie", "tv show", "theme", "background music", "broadway", "unknown track"
    ]

    artist_keywords = [
        "broadway", "james newton howard", "hans zimmer",
        "john williams", "nicholas britell", "howard shore", "alan menken",
        "stephen sondheim", "andrew lloyd webber", "unknown artist"
    ]

    title_mask = df[track_col].str.lower().str.contains(
        "|".join(title_keywords),
        na=False
    )

    artist_mask = df[artist_col].str.lower().str.contains(
        "|".join(artist_keywords),
        na=False
    )

    # Keep only rows flagged by neither filter.
    return df[~(title_mask | artist_mask)].reset_index(drop=True)

In [7]:
def print_month_report(top_seconds, top_plays, discrepancies):
    """
    Print the monthly report: both rankings, then the songs unique to each.

    Parameters
    ----------
    top_seconds, top_plays : ranking DataFrames, printed without their index.
    discrepancies : dict with the keys ``only_in_top_by_seconds`` and
        ``only_in_top_by_plays``, each an iterable of songs to list.

    Returns None; all output goes to stdout.
    """
    ranking_sections = (
        ("\n=== TOP BY SECONDS ===", top_seconds),
        ("\n=== TOP BY PLAYS ===", top_plays),
    )
    for banner, ranking in ranking_sections:
        print(banner)
        print(ranking.to_string(index=False))

    print("\n=== DISCREPANCIES ===")
    discrepancy_sections = (
        ("Only in top by seconds:", "only_in_top_by_seconds"),
        ("\nOnly in top by plays:", "only_in_top_by_plays"),
    )
    for label, key in discrepancy_sections:
        print(label)
        for song in discrepancies[key]:
            print(" -", song)

In [None]:
def run_all_for_month(year, month, folder="Spotify Account Data"):
    """
    Produce the printed monthly report from every
    StreamingHistory_music_*.json file found in ``folder``.

    Steps: load each file filtered to (year, month), concatenate, strip
    Christmas songs and soundtracks, build the two top-15 rankings, compute
    the songs unique to each ranking, and print the report.

    Raises FileNotFoundError when ``folder`` contains no matching file.
    Returns whatever ``print_month_report`` returns.
    """
    data_dir = Path(folder)

    # Sorted so the files are always processed in the same order.
    history_files = sorted(data_dir.glob("StreamingHistory_music_*.json"))
    if len(history_files) == 0:
        raise FileNotFoundError("No StreamingHistory_music_*.json files found")

    # Each file is filtered to the requested month before being combined.
    monthly_frames = []
    for history_file in history_files:
        monthly_frames.append(spotify_history_for_month(history_file, year, month))
    combined = pd.concat(monthly_frames, ignore_index=True)

    without_christmas = remove_christmas_songs(combined)
    filtered = remove_soundtracks(without_christmas)

    by_seconds, by_plays = top_15_songs_separate_rankings(filtered)
    mismatches = songs_not_in_both(by_seconds, by_plays)

    return print_month_report(by_seconds, by_plays, mismatches)

## November 2025

In [9]:
run_all_for_month(2025, 11)


=== TOP BY SECONDS ===
  artistName                     trackName  seconds_played  play_count
Taylor Swift                     Labyrinth        1487.772           6
Taylor Swift               Eldest Daughter        1445.159           6
Taylor Swift                 The Great War        1364.784           6
Taylor Swift                 Father Figure        1314.528           9
Taylor Swift                     Wi$h Li$t        1244.110           6
Taylor Swift                Hits Different        1195.021           6
Taylor Swift                         Paris        1177.554           6
Taylor Swift                       Opalite        1176.371           5
Taylor Swift           Ruin The Friendship        1171.000           6
Taylor Swift                     Bejeweled        1164.996           6
Taylor Swift              Elizabeth Taylor        1146.140           7
Taylor Swift           The Fate of Ophelia        1130.110           5
Taylor Swift                         Honey        108

## December 2025

In [10]:
run_all_for_month(2025, 12)


=== TOP BY SECONDS ===
  artistName           trackName  seconds_played  play_count
Taylor Swift The Fate of Ophelia         813.860           5
Taylor Swift     Eldest Daughter         738.142           3
Taylor Swift                Wood         603.506           6
Taylor Swift                loml         554.314           2
      Hozier        That You Are         513.040           2
      Hozier           Too Sweet         509.639           4
Taylor Swift           Labyrinth         495.924           2
      Jokers            오솔길 Path         485.344           3
Taylor Swift             Opalite         476.356           3
Taylor Swift      Hits Different         468.934           2
      Hozier           Work Song         461.115           4
Taylor Swift Ruin The Friendship         441.130           2
Taylor Swift       Father Figure         425.554           2
Taylor Swift          CANCELLED!         422.996           2
Taylor Swift        Question...?         421.114           2


## January 2026

In [11]:
run_all_for_month(2026, 1)


=== TOP BY SECONDS ===
            artistName                                      trackName  seconds_played  play_count
          Harry Styles                         Meet Me in the Hallway         454.400           2
              The 1975                                  Somebody Else         347.510           1
          Harry Styles                              Sign of the Times         340.715           1
              Bon Iver                                       Holocene         336.630           1
              Coldplay                                  The Scientist         309.600           1
                Hozier                                         Shrike         298.880           1
            Noah Kahan                                   Orange Juice         297.103           1
          Taylor Swift Better Man (Taylor's Version) (From The Vault)         297.013           1
              Coldplay                                        Fix You         295.533         