In [1]:
import typing

import polars as pl

In [2]:
# Show up to 40 characters of each string value when frames are rendered.
pl.Config.set_fmt_str_lengths(40)

polars.config.Config

## Streaming History as DataFrame


In [3]:
# Streaming-history JSON files taken from both export folders.
history_files = [
    "data.arv/Spotify Account Data/StreamingHistory0.json",
    "data.arv/Spotify Account Data/StreamingHistory1.json",
    "data.arv/Spotify-Data-5.Jan/StreamingHistory0.json",
    "data.arv/Spotify-Data-5.Jan/StreamingHistory1.json",
]
# Stack every file into one frame; `.unique()` then removes fully duplicated rows.
stream_history = pl.concat([pl.read_json(path) for path in history_files]).unique()
print(stream_history.shape)
stream_history.head()

(29742, 4)


endTime,artistName,trackName,msPlayed
str,str,str,i64
"""2022-11-29 16:23""","""Sonu Nigam""","""Tumse Milke Dil Ka""",305194
"""2022-12-01 16:55""","""Amit Trivedi""","""Ghodey Pe Sawaar""",172238
"""2022-12-02 03:31""","""Zack Knight""","""Armani""",108748
"""2022-12-02 03:34""","""Pritam""","""Kahani (From ""Laal Singh Chaddha"")""",208539
"""2022-12-02 08:48""","""Vishal Chandrashekhar""","""Sita Ramam Theme (Telugu)""",161121


### Drop outlier streams (too short or too long)


In [4]:
# The reasoning behind these cut-offs is explained in
# https://github.com/arv-anshul/notebooks/tree/main/spotify-analysis/StreamingHistory.ipynb
prev_height = stream_history.height

# Keep only plays strictly longer than 10 seconds and strictly shorter than 20 minutes.
played_ms = pl.col("msPlayed")
stream_history = stream_history.filter(
    (played_ms > 10_000) & (played_ms < 60_000 * 20)
)

new_height = stream_history.height
print(f"We've dropped {prev_height - new_height}.")

We've dropped 3176.


## Playlist as DataFrame


In [5]:
# Load the playlist export as-is; the nested structure is flattened in a later cell.
playlist_file = "data.arv/Spotify Account Data/Playlist1.json"
playlist = pl.read_json(playlist_file)
print(playlist.shape)
playlist.head()

(1, 1)


playlists
list[struct[5]]
"[{""miscellaneous"",""2023-11-14"",[{{""Toh Kya Ye Teh Hai"",""Zeeshan Ali"",""Toh Kya Ye Teh Hai"",""spotify:track:682n0ZYSJuHmxJ28Mxn7qK""},{null,null,null},null,""2023-10-08""}, {{""Choo Lo"",""Urban Crunch Media"",""Choo Lo"",""spotify:track:42Kn55DyO5BG9FlGUPwn2V""},{null,null,null},null,""2023-10-08""}, … {{""Bataa"",""Osho Jain"",""Permanent Roommates: Season 3 (Music from the Series)"",""spotify:track:0t5WjcLOTCLFKNc1QkXAkQ""},{null,null,null},null,""2023-11-06""}],null,0}, {""Retro"",""2023-11-02"",[{{""Teri Duniya Se Hoke Majboor Chala"",""Kishore Kumar"",""Pavitra Papi"",""spotify:track:3J3BTZ3FN5c5FsHo5N9Fdq""},{null,null,null},null,""2023-08-15""}, {{""Ghunghroo Ki Tarah Bajta Hi Raha"",""Kishore Kumar"",""Chor Machaye Shor"",""spotify:track:34B7aElZCqkX3jnLHCTTJF""},{null,null,null},null,""2023-08-15""}, … {{""Aye Dil Kisi Ki Yaad (From ""Ik Tera Sahara"")"",""Saleem Raza"",""Ik Tera Sahara / Aag Ka Daryia"",""spotify:track:2oeehqJKzBU94BcXgXdcDF""},{null,null,null},null,""2023-11-02""}],null,0}, … {""English Hai"",""2023-12-01"",[{{""Bad Habits"",""Ed Sheeran"",""Bad Habits"",""spotify:track:6PQ88X9TkUIAUIZJHW2upE""},{null,null,null},null,""2021-07-10""}, {{""Señorita"",""Shawn Mendes"",""Señorita"",""spotify:track:0TK2YIli7K1leLovkQiNik""},{null,null,null},null,""2020-06-29""}, … {{""golden hour"",""JVKE"",""this is what falling in love feels like (554Hz)"",""spotify:track:4yNk9iz9WVJikRFle3XEvn""},{null,null,null},null,""2023-12-01""}],null,0}]"


### Preprocessing


In [6]:
# Flatten the nested export level by level: playlists -> their items -> each item's track.
# Columns that are not needed for the analysis are dropped afterwards.
_dropped_columns = [
    "addedDate",
    "description",
    "episode",
    "lastModifiedDate",
    "localTrack",
    "numberOfFollowers",
]
playlist = (
    playlist
    .explode("playlists").unnest("playlists")
    .explode("items").unnest("items")
    .unnest("track")
    .drop(_dropped_columns)
    # "name" is the playlist's own name once the structs are unnested.
    .rename({"name": "playlistName"})
)
print(playlist.shape)
playlist.head()

(818, 5)


playlistName,trackName,artistName,albumName,trackUri
str,str,str,str,str
"""miscellaneous""","""Toh Kya Ye Teh Hai""","""Zeeshan Ali""","""Toh Kya Ye Teh Hai""","""spotify:track:682n0ZYSJuHmxJ28Mxn7qK"""
"""miscellaneous""","""Choo Lo""","""Urban Crunch Media""","""Choo Lo""","""spotify:track:42Kn55DyO5BG9FlGUPwn2V"""
"""miscellaneous""","""Humne Bhi""","""The Western Ghats""","""Humne Bhi - Single""","""spotify:track:78bJtdSTpHYS8u9axaismu"""
"""miscellaneous""","""Dhun""","""Navjot Ahuja""","""Dhun""","""spotify:track:1PYV3TA7E1elt9AFcQmOIZ"""
"""miscellaneous""","""Chale Sabse Dur""","""Ganeshsingh Rathore""","""Chale Sabse Dur""","""spotify:track:0912JziGoyyDFCu2WGwaIq"""


In [7]:
# Distinct playlist names present in the flattened export.
playlist["playlistName"].unique()

playlistName
str
"""InJoy"""
"""Retro"""
"""gOLD"""
"""BANOOK"""
"""Master-Pस """
"""Yeah! Beats"""
"""think nothing"""
"""Ka-riyan"""
"""think exception"""
"""PICK-UPS"""


## Merge "Streaming History" and "Playlist"

**ON**: `"trackName"` and `"artistName"`


In [8]:
# Inner join (polars' default `how`): only streams whose artist/track pair
# appears in a playlist are kept, once per matching playlist row.
df = stream_history.join(
    playlist,
    on=["artistName", "trackName"],
    how="inner",
)
print(df.shape)
df.head()

(12267, 7)


endTime,artistName,trackName,msPlayed,playlistName,albumName,trackUri
str,str,str,i64,str,str,str
"""2022-11-29 16:23""","""Sonu Nigam""","""Tumse Milke Dil Ka""",305194,"""Part-EE""","""Main Hoon Na""","""spotify:track:1ax8ZuwRVkSdzzsIqyCNWQ"""
"""2022-12-02 03:34""","""Pritam""","""Kahani (From ""Laal Singh Chaddha"")""",208539,"""HNDi""","""Kahani (From ""Laal Singh Chaddha"")""","""spotify:track:7Lcp4QlpR3RDngYejm4fmS"""
"""2022-12-02 08:48""","""Vishal Chandrashekhar""","""Sita Ramam Theme (Telugu)""",161121,"""Yeah! Beats""","""Sita Ramam (Telugu) (Extended Version)""","""spotify:track:1YxoeF51bofo4ODO9XQkTV"""
"""2022-12-04 12:14""","""Jasleen Royal""","""Kho Gaye Hum Kahan""",10445,"""think nothing""","""Baar Baar Dekho""","""spotify:track:6nZiYSBwPQ7fYnVWkkkj4g"""
"""2022-12-04 18:39""","""The Weeknd""","""Is There Someone Else?""",155847,"""Thend""","""Dawn FM""","""spotify:track:0mL82sxCRjrs3br407IdJh"""


### Preprocessing


In [9]:
# Parse the play timestamp from its string form.
_end_time = pl.col("endTime").str.to_datetime()
# Shorten titles: cut everything from the first "(" and from the first "-",
# then trim the trailing whitespace left behind.
_short_titles = (
    pl.col("trackName", "albumName")
    .str.replace(r"\(.*", "")
    .str.replace(r"-.*", "")
    .str.strip_chars_end()
)
df = df.with_columns(_end_time, _short_titles)
df.head(3)

endTime,artistName,trackName,msPlayed,playlistName,albumName,trackUri
datetime[μs],str,str,i64,str,str,str
2022-11-29 16:23:00,"""Sonu Nigam""","""Tumse Milke Dil Ka""",305194,"""Part-EE""","""Main Hoon Na""","""spotify:track:1ax8ZuwRVkSdzzsIqyCNWQ"""
2022-12-02 03:34:00,"""Pritam""","""Kahani""",208539,"""HNDi""","""Kahani""","""spotify:track:7Lcp4QlpR3RDngYejm4fmS"""
2022-12-02 08:48:00,"""Vishal Chandrashekhar""","""Sita Ramam Theme""",161121,"""Yeah! Beats""","""Sita Ramam""","""spotify:track:1YxoeF51bofo4ODO9XQkTV"""


## References

| ShortCode | Description       |
| :-------: | ----------------- |
|  **T/A**  | Track/Artist      |
| **T/As**  | Track/Artist(s)   |
|  **P/A**  | Playlist/Album    |
| **P/As**  | Playlist/Album(s) |


## Some Awesome Insights

- [x] Most streamed P/As
- [x] No. of T/As and Albums in each Playlist and calculate `"minutesPlayed"` for each
- [x] Streaming timeline of P/As (with `plot.line()`)
- [x] Check any Track present in multiple Playlists
- [x] Playlist-wise top T/As
- [x] Check any artist present in multiple Playlists
- [ ] Days when the user didn't listen to any track


### Basic Info


In [10]:
# Span of the joined history and the number of distinct days with at least one play.
played_at = df["endTime"]
first_play, last_play = played_at.min(), played_at.max()
print(f"Datetime range: ({first_play:%d %B, %Y}) — ({last_play:%d %B, %Y})")
print(f"No. of days: {played_at.dt.date().n_unique()}")

Datetime range: (02 January, 2022) — (02 December, 2023)
No. of days: 680


In [11]:
# Per-playlist summary, most-listened playlist first.
(
    df.group_by("playlistName")
    .agg(
        # Distinct URIs are counted instead of names because different tracks
        # can share the same name; the count is still reported as "trackName".
        pl.col("trackUri").n_unique().alias("trackName"),
        pl.col("artistName", "albumName").n_unique(),
        # Total listening time converted from milliseconds to whole minutes.
        (pl.col("msPlayed").sum() / 60_000).cast(int).alias("minutesPlayed"),
    )
    .sort("minutesPlayed", descending=True)
)

playlistName,trackName,artistName,albumName,minutesPlayed
str,u32,u32,u32,i64
"""think nothing""",76,41,72,8110
"""Tan-maya""",112,87,107,3916
"""feel ENGlish""",50,44,48,2654
"""Nandu-san""",46,36,44,2045
"""gOLD""",35,21,28,1622
"""PICK-UPS""",18,16,18,1532
"""Thend""",21,6,11,1485
"""gOLD-en""",29,23,27,1401
"""English Hai""",29,24,29,1339
"""miscellaneous""",42,41,42,1301


### Most streamed P/As


In [None]:
def most_streamed(
    type: typing.Literal["playlistName", "albumName", "artistName", "trackName"],  # noqa: A002
    top: int = 10,
):
    """Bar chart of the most streamed values of `<type>`, ranked by play count.

    Reads the notebook-level `df` frame, counts its rows per distinct value of
    the `type` column and plots the `top` values with the highest counts.

    Parameters
    ----------
    type
        Name of the `df` column to group by.
    top
        How many of the highest-count values to show (defaults to 10).

    Returns
    -------
    The plot object produced by `DataFrame.plot.bar`.
    """
    return (
        df.group_by(type)
        # `.len()` adds a "len" column holding the number of rows in each group.
        .len()
        .top_k(top, by="len")
        .plot.bar(
            type,
            "len",
            ylabel="No. of tracks played",
            width=1000,
            rot=45,
        )
    )

In [15]:
# Playlists with the most rows (plays) in `df`.
most_streamed("playlistName")

In [14]:
# Albums with the most rows (plays) in `df`.
most_streamed("albumName")

In [13]:
# Artists with the most rows (plays) in `df`.
most_streamed("artistName")

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


### P/As streaming timeline


In [16]:
# Albums and playlists compared in the two timeline plots that follow.
# NOTE(review): "Kassor" may be a typo of "Kasoor" (a track with that name shows up
# in the data) — confirm the album name, otherwise the filter will not match it.
albums = ["Dawn FM", "Kassor", "Faasle", "Sun Lo Na"]
playlists = ["talwiinder", "dNO", "gOLD-en", "Thend"]

In [17]:
# Monthly listening time (whole minutes) for the selected playlists.
(
    df.filter(
        pl.col("playlistName").is_in(playlists),
    )
    .group_by(
        # Truncate to the first instant of the calendar month instead of taking
        # the month number: the history spans more than one year, and
        # `dt.month()` would merge e.g. January 2022 with January 2023.
        pl.col("endTime").dt.truncate("1mo").alias("month"),
        "playlistName",
    )
    .agg(
        pl.col("msPlayed").sum().truediv(60_000).cast(int).alias("minutesPlayed"),
    )
    # group_by does not guarantee order; sort so the line is drawn chronologically.
    .sort("month")
    .plot.line("month", "minutesPlayed", by="playlistName", width=1000, height=500)
)

In [18]:
# Monthly listening time (whole minutes) for the selected albums.
(
    df.filter(
        pl.col("albumName").is_in(albums),
    )
    .group_by(
        # Truncate to the first instant of the calendar month instead of taking
        # the month number: the history spans more than one year, and
        # `dt.month()` would merge e.g. January 2022 with January 2023.
        pl.col("endTime").dt.truncate("1mo").alias("month"),
        "albumName",
    )
    .agg(
        pl.col("msPlayed").sum().truediv(60_000).cast(int).alias("minutesPlayed"),
    )
    # group_by does not guarantee order; sort so the line is drawn chronologically.
    .sort("month")
    .plot.line("month", "minutesPlayed", by="albumName", width=1000, height=500)
)

### Check any Track present in multiple Playlists


In [30]:
# For every track (URI + artist), collect the distinct playlists it was matched to
# and keep only the tracks that belong to more than one playlist.
(
    df.group_by("trackUri", "artistName")
    .agg(
        pl.first("trackName"),
        pl.col("playlistName").unique().sort(),
    )
    .with_columns(nPlaylist=pl.col("playlistName").list.len())
    .filter(pl.col("nPlaylist") > 1)
    # The URI was only needed as a grouping key.
    .select("artistName", "trackName", "playlistName", "nPlaylist")
    .sort("nPlaylist", descending=True)
)

artistName,trackName,playlistName,nPlaylist
str,str,list[str],u32
"""Vasuda Sharma""","""Mera Hai Tu""","[""Tan-maya"", ""miscellaneous"", ""think nothing""]",3
"""Taba Chake""","""Blush""","[""Tan-maya"", ""think nothing""]",2
"""Taylor Swift""","""Cruel Summer""","[""English Hai"", ""InJoy""]",2
"""Prateek Kuhad""","""Shehron Ke Raaz""","[""Nandu-san"", ""Tan-maya""]",2
"""Zodiac Wave""","""Emptiness""","[""Yeah! Beats"", ""feel ENGlish""]",2
"""Prateek Kuhad""","""Tum Jab Paas""","[""Tan-maya"", ""think nothing""]",2
"""NAALAYAK""","""Haan Pyaar Hai""","[""Tan-maya"", ""think nothing""]",2
"""Nicholas Yee""","""Paint it Black""","[""Compose It !!"", ""Yeah! Beats""]",2
"""Abhilasha Sinha""","""Jab Tum Miley""","[""Nandu-san"", ""Tan-maya""]",2
"""Prateek Kuhad""","""Kasoor""","[""Tan-maya"", ""think nothing""]",2


### Playlist-wise top T/As


In [27]:
_playlist = "Tan-maya"
# Ten tracks of the chosen playlist with the most listening time (whole minutes).
(
    df.filter(pl.col("playlistName") == _playlist)
    # Group by URI so that different tracks sharing a name stay separate.
    .group_by("trackUri")
    .agg(
        pl.first("trackName"),
        (pl.sum("msPlayed") / 60_000).cast(int).alias("minutesPlayed"),
    )
    .top_k(10, by="minutesPlayed")
    .plot.bar(
        "trackName",
        "minutesPlayed",
        title=f"playlist = {_playlist!r}",
        min_width=1000,
        rot=45,
    )
)

In [25]:
_playlist = "think nothing"
# Ten artists of the chosen playlist with the most listening time (whole minutes).
(
    df.filter(pl.col("playlistName") == _playlist)
    .group_by("artistName")
    .agg(
        (pl.sum("msPlayed") / 60_000).cast(int).alias("minutesPlayed"),
    )
    .top_k(10, by="minutesPlayed")
    .plot.bar(
        "artistName",
        "minutesPlayed",
        title=f"playlist = {_playlist!r}",
        min_width=1000,
        rot=45,
    )
)

### Check any artist present in multiple Playlists


In [29]:
# For every artist, collect the distinct playlists they were matched to and
# keep only the artists that appear in more than one playlist.
(
    df.group_by("artistName")
    .agg(
        pl.col("playlistName").unique().sort(),
    )
    # No `.drop("trackUri")` here: after this group_by/agg the frame only has
    # "artistName" and "playlistName", so dropping "trackUri" was a no-op at
    # best and raises ColumnNotFoundError where `drop` is strict.
    .with_columns(
        pl.col("playlistName").list.len().alias("nPlaylist"),
    )
    .filter(
        pl.col("nPlaylist").gt(1),
    )
    .sort("nPlaylist", descending=True)
)

artistName,playlistName,nPlaylist
str,list[str],u32
"""A.R. Rahman""","[""HNDi"", ""Master-Pस "", … ""gOLD""]",5
"""Taylor Swift""","[""English Hai"", ""InJoy"", … ""think nothing""]",5
"""Pritam""","[""HNDi"", ""Master-Pस "", … ""हींग-lish""]",4
"""Lucky Ali""","[""HNDi"", ""Master-Pस "", … ""gOLD""]",4
"""Sonu Nigam""","[""Master-Pस "", ""Part-EE"", ""gOLD""]",3
"""Kishore Kumar""","[""Part-EE"", ""Retro"", ""gOLD""]",3
"""Mohammed Rafi""","[""Retro"", ""Tan-maya"", ""gOLD""]",3
"""When Chai Met Toast""","[""PICK-UPS"", ""Tan-maya"", ""think nothing""]",3
"""Prateek Kuhad""","[""Nandu-san"", ""Tan-maya"", ""think nothing""]",3
"""Zedd""","[""InJoy"", ""Jd"", ""feel ENGlish""]",3


### Days when the user didn't listen to any track


In [31]:
# First and last calendar day on which anything in `df` was played.
_played_days = df["endTime"].dt.date()
start_date = _played_days.min()
end_date = _played_days.max()

# One entry per calendar day between the two bounds (built as datetimes, then cast to dates).
_daily_datetimes = pl.datetime_range(
    start_date,  # type: ignore
    end_date,  # type: ignore
    interval="1d",
    eager=True,
)
date_range: pl.Series = _daily_datetimes.dt.date().alias("date_range")
date_range.shape

(700,)

In [32]:
# Display the first and last listening dates used to build `date_range`.
start_date, end_date

(datetime.date(2022, 1, 2), datetime.date(2023, 12, 2))

In [33]:
# Days of the full range that never occur among the play dates in `df`.
_listened_days = df["endTime"].dt.date()
date_range.filter(~date_range.is_in(_listened_days))

date_range
date
2022-01-12
2022-03-24
2022-05-19
2022-06-20
2022-09-18
2022-10-03
2022-10-04
2022-10-05
2022-10-27
2022-10-29
