In [1]:
from pathlib import Path
from typing import Iterable, Optional, Union
import pandas as pd
import json

In [2]:
def spotify_history_for_month(json_path, year, month):
    """
    Read a downloaded Spotify listening history .json and return only rows in (year, month).

    Works with common Spotify exports:
      - Classic: endTime, artistName, trackName, msPlayed
      - Extended: ts, master_metadata_album_artist_name, master_metadata_track_name, ms_played
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    df = pd.DataFrame(data)

    # Pick the timestamp column Spotify used
    if "endTime" in df.columns:
        df["played_at"] = pd.to_datetime(df["endTime"])
    else:
        df["played_at"] = pd.to_datetime(df["ts"])

    # Filter to the month
    df = df[(df["played_at"].dt.year == year) & (df["played_at"].dt.month == month)]

    return df.sort_values("played_at").reset_index(drop=True)

In [3]:
def top_15_songs_separate_rankings(df):
    """
    Return two DataFrames:
      1) Top 15 songs by total seconds played
      2) Top 15 songs by discrete play count
    """
    # Handle both Spotify export formats
    if "trackName" in df.columns:
        track_col = "trackName"
        artist_col = "artistName"
        ms_col = "msPlayed"
    else:
        track_col = "master_metadata_track_name"
        artist_col = "master_metadata_album_artist_name"
        ms_col = "ms_played"

    summary = (
        df.groupby([artist_col, track_col])[ms_col]
          .agg(
              seconds_played=lambda x: x.sum() / 1000,
              play_count="count"
          )
          .reset_index()
    )

    top_by_seconds = (
        summary.sort_values("seconds_played", ascending=False)
               .head(15)
               .reset_index(drop=True)
    )

    top_by_plays = (
        summary.sort_values("play_count", ascending=False)
               .head(15)
               .reset_index(drop=True)
    )

    return top_by_seconds, top_by_plays

In [4]:
def songs_not_in_both(top_by_seconds, top_by_plays):
    """
    Return songs that appear in one list but not the other.
    """
    cols = ["artistName", "trackName"]

    set_seconds = set(map(tuple, top_by_seconds[cols].values))
    set_plays = set(map(tuple, top_by_plays[cols].values))

    only_seconds = set_seconds - set_plays
    only_plays = set_plays - set_seconds

    return {
        "only_in_top_by_seconds": list(only_seconds),
        "only_in_top_by_plays": list(only_plays),
    }

In [5]:
def remove_christmas_songs(df):
    """
    Remove Christmas / holiday songs from a Spotify history DataFrame.
    """
    # Handle both Spotify export formats
    if "trackName" in df.columns:
        track_col = "trackName"
    else:
        track_col = "master_metadata_track_name"

    christmas_keywords = [
        "christmas", "xmas", "holiday", "santa", "reindeer",
        "snow", "winter", "jingle", "noel", "nativity", "hallelujah"
    ]

    mask = df[track_col].str.lower().str.contains(
        "|".join(christmas_keywords),
        na=False
    )

    return df[~mask].reset_index(drop=True)

In [37]:
def remove_soundtracks(df):
    """
    Remove soundtrack songs from a Spotify history DataFrame.
    """
    # Handle both Spotify export formats
    if "trackName" in df.columns:
        track_col = "trackName"
    else:
        track_col = "master_metadata_track_name"

    soundtrack_keywords = [
        "soundtrack", "score", "ost", "original motion picture", "film music",
        "movie", "tv show", "theme", "background music", "broadway", "unknown"
    ]

    mask = df[track_col].str.lower().str.contains(
        "|".join(soundtrack_keywords),
        na=False
    )

    df = df[~mask].reset_index(drop=True)

    if "artistName" in df.columns:
        artist_col = "artistName"
    else:
        artist_col = "master_metadata_artist_name"

    soundtrack_keywords = [
        "broadway", "james newton howard", "hans zimmer",
        "john williams", "danny elfman", "howard shore", "alan menken",
        "stephen sondheim", "andrew lloyd webber", "unknown"
    ]

    mask = df[artist_col].str.lower().str.contains(
        "|".join(soundtrack_keywords),
        na=False
    )

    return df[~mask].reset_index(drop=True)

## November 2025

In [6]:
nov_2025 = spotify_history_for_month("Spotify Account Data/StreamingHistory_music_1.json", 2025, 11)
nov_top_seconds, nov_top_plays = top_15_songs_separate_rankings(nov_2025)


In [7]:
print(nov_top_seconds)

      artistName                      trackName  seconds_played  play_count
0   Taylor Swift                      Labyrinth        1487.772           6
1   Taylor Swift                Eldest Daughter        1445.159           6
2   Taylor Swift                  The Great War        1364.784           6
3   Taylor Swift                  Father Figure        1314.528           9
4   Taylor Swift                      Wi$h Li$t        1244.110           6
5   Taylor Swift                 Hits Different        1195.021           6
6   Taylor Swift                          Paris        1177.554           6
7   Taylor Swift                        Opalite        1176.371           5
8   Taylor Swift            Ruin The Friendship        1171.000           6
9   Taylor Swift                      Bejeweled        1164.996           6
10  Taylor Swift               Elizabeth Taylor        1146.140           7
11  Taylor Swift            The Fate of Ophelia        1130.110           5
12  Taylor S

In [8]:
print(nov_top_plays)

           artistName            trackName  seconds_played  play_count
0        Taylor Swift        Father Figure        1314.528           9
1        Taylor Swift                Honey        1089.220           7
2        Taylor Swift     Elizabeth Taylor        1146.140           7
3        Taylor Swift           CANCELLED!        1025.894           7
4        Taylor Swift    Actually Romantic         982.192           6
5        Taylor Swift      Eldest Daughter        1445.159           6
6        Taylor Swift  Ruin The Friendship        1171.000           6
7        Taylor Swift            Wi$h Li$t        1244.110           6
8        Taylor Swift                Paris        1177.554           6
9        Taylor Swift       Hits Different        1195.021           6
10       Taylor Swift            Labyrinth        1487.772           6
11       Taylor Swift        The Great War        1364.784           6
12       Taylor Swift            Bejeweled        1164.996           6
13  Ha

In [9]:
nov_diff = songs_not_in_both(nov_top_seconds, nov_top_plays)

In [10]:
nov_diff["only_in_top_by_seconds"]

[('Taylor Swift', 'The Fate of Ophelia'),
 ('Taylor Swift', "Would've, Could've, Should've")]

In [11]:
nov_diff["only_in_top_by_plays"]

[('Haley Heynderickx', 'The Bug Collector'),
 ('Taylor Swift', 'Actually Romantic')]

## December 2025

In [32]:
dec_2025 = spotify_history_for_month("Spotify Account Data/StreamingHistory_music_1.json", 2025, 12)


In [38]:
clean_dec_2025 = remove_soundtracks(remove_christmas_songs(dec_2025))

In [39]:
dec_top_seconds, dec_top_plays = top_15_songs_separate_rankings(clean_dec_2025)

In [40]:
print(dec_top_seconds)

      artistName            trackName  seconds_played  play_count
0   Taylor Swift  The Fate of Ophelia         813.860           5
1   Taylor Swift      Eldest Daughter         738.142           3
2   Taylor Swift                 Wood         603.506           6
3   Taylor Swift                 loml         554.314           2
4         Hozier         That You Are         513.040           2
5         Hozier            Too Sweet         509.639           4
6   Taylor Swift            Labyrinth         495.924           2
7         Jokers             오솔길 Path         485.344           3
8   Taylor Swift              Opalite         476.356           3
9   Taylor Swift       Hits Different         468.934           2
10        Hozier            Work Song         461.115           4
11  Taylor Swift  Ruin The Friendship         441.130           2
12  Taylor Swift        Father Figure         425.554           2
13  Taylor Swift           CANCELLED!         422.996           2
14  Taylor

In [41]:
print(dec_top_plays)

      artistName                                         trackName  \
0     Song a Day                                         Baby Yoda   
1   Taylor Swift                                              Wood   
2   Taylor Swift                               The Fate of Ophelia   
3         Hozier                                         Too Sweet   
4         Hozier                                         Work Song   
5         Hozier                                  Butchered Tongue   
6        HUNTR/X                                          Takedown   
7   Taylor Swift  The Life of a Showgirl (feat. Sabrina Carpenter)   
8        HUNTR/X                                            Golden   
9   Taylor Swift                                           Opalite   
10        Jokers                                          오솔길 Path   
11  Taylor Swift                                   Eldest Daughter   
12  Taylor Swift                                            Glitch   
13        Hozier    

In [42]:
dec_diff = songs_not_in_both(dec_top_seconds, dec_top_plays)

In [43]:
dec_diff["only_in_top_by_seconds"]

[('Taylor Swift', 'Labyrinth'),
 ('Taylor Swift', 'CANCELLED!'),
 ('Taylor Swift', 'Question...?'),
 ('Taylor Swift', 'Hits Different'),
 ('Taylor Swift', 'Ruin The Friendship'),
 ('Taylor Swift', 'loml'),
 ('Taylor Swift', 'Father Figure')]

In [44]:
dec_diff["only_in_top_by_plays"]

[('HUNTR/X', 'Golden'),
 ('Rumi', 'Free'),
 ('Hozier', 'Butchered Tongue'),
 ('HUNTR/X', 'Takedown'),
 ('Song a Day', 'Baby Yoda'),
 ('Taylor Swift', 'The Life of a Showgirl (feat. Sabrina Carpenter)'),
 ('Taylor Swift', 'Glitch')]