In [1]:
import polars as pl
import polars.selectors as cs

import altair as alt
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt

# Feature Engineering

In [2]:
# Locations of the two raw CSV inputs that the following cells load with
# pl.read_csv. Raw strings keep the Windows backslashes literal.
spotify_history_path = (
    r"D:\Datasets\Data Preaparations\spotify_history.csv"
)
spotify_data_dictionary_description_path = (
    r"D:\Datasets\Data Preaparations\spotify_data_dictionary Description.csv"
)

In [3]:
# Load the raw history CSV eagerly; dtypes are left to polars' inference.
spotify_history_df = pl.read_csv(source=spotify_history_path)

In [4]:
# Load the companion data-dictionary CSV eagerly.
spotify_data_dictionary_description_df = pl.read_csv(
    source=spotify_data_dictionary_description_path,
)

In [6]:
# Per-column summary of the raw history frame (count, null_count, mean, std,
# min, quartiles, max). Left as the bare last expression so the notebook
# renders the resulting table.
spotify_history_df.describe()

statistic,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
str,str,str,str,f64,str,str,str,str,str,f64,f64
"""count""","""149860""","""149860""","""149860""",149860.0,"""149860""","""149860""","""149860""","""149717""","""149743""",149860.0,149860.0
"""null_count""","""0""","""0""","""0""",0.0,"""0""","""0""","""0""","""143""","""117""",0.0,0.0
"""mean""",,,,128316.635093,,,,,,0.744582,0.052509
"""std""",,,,117840.060332,,,,,,,
"""min""","""003d3VbyJTZiiOYT2W7fnQ""","""2013-07-08 02:44:34""","""android""",0.0,"""!!!!!!!""","""""Weird Al"" Yankovic""","""!!Going Places!!""","""appload""","""appload""",0.0,0.0
"""25%""",,,,2795.0,,,,,,,
"""50%""",,,,138840.0,,,,,,,
"""75%""",,,,218502.0,,,,,,,
"""max""","""7zwWMJUV62QNNLz15hrWZl""","""2024-12-15 23:06:25""","""windows""",1561125.0,"""🪐""","""落日飛車 Sunset Rollercoaster""","""母親""","""unknown""","""unknown""",1.0,1.0


In [8]:
# Column names and dtypes of the raw history frame, as inferred on load.
spotify_history_df.schema

Schema([('spotify_track_uri', String),
        ('ts', String),
        ('platform', String),
        ('ms_played', Int64),
        ('track_name', String),
        ('artist_name', String),
        ('album_name', String),
        ('reason_start', String),
        ('reason_end', String),
        ('shuffle', Boolean),
        ('skipped', Boolean)])

In [11]:
# Estimated size of the raw frame, reported in megabytes.
spotify_history_df.estimated_size(unit='mb')

17.740535736083984

In [19]:
# Clean-up pass over the raw history frame (the result rebinds the same name):
#   * reason_start / reason_end: nulls are replaced by the literal 'unknown'
#   * ms_played: cast from the inferred Int64 down to UInt32
#   * ts: parsed from 'YYYY-MM-DD HH:MM:SS' text into a microsecond Datetime
# NOTE(review): because this cell overwrites its own input, running it a
# second time in the same kernel will presumably fail on the already-parsed
# ts column. Re-run the load cell first, or bind the result to a new name.
spotify_history_df = spotify_history_df.with_columns(
    pl.col('reason_start', 'reason_end').fill_null('unknown'),
    pl.col('ms_played').cast(pl.UInt32),
    pl.col('ts').str.to_datetime(format='%Y-%m-%d %H:%M:%S', time_unit='us'),
)

In [21]:
# Column names and dtypes of the data-dictionary frame.
spotify_data_dictionary_description_df.schema

Schema([('Field', String), ('Description', String)])

In [22]:
# Summary of the data-dictionary frame. Both columns are strings, so only
# count, null_count, min and max are populated in the rendered table.
spotify_data_dictionary_description_df.describe()

statistic,Field,Description
str,str,str
"""count""","""11""","""11"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""","""album_name""","""Name of the album"""
"""25%""",,
"""50%""",,
"""75%""",,
"""max""","""ts""","""Why the track started"""


In [23]:
# Display the data dictionary (one row per field with its description).
# Kept as a bare expression so the notebook renders the frame.
spotify_data_dictionary_description_df

Field,Description
str,str
"""spotify_track_uri""","""Spotify URI that uniquely iden…"
"""ts""","""Timestamp indicating when the …"
"""platform""","""Platform used when streaming t…"
"""ms_played""","""Number of milliseconds the str…"
"""track_name""","""Name of the track"""
…,…
"""album_name""","""Name of the album"""
"""reason_start""","""Why the track started"""
"""reason_end""","""Why the track ended"""
"""shuffle""","""TRUE or FALSE depending on if …"


In [27]:
# Persist the cleaned history frame as Parquet, compressed with zstd at
# level 22.
spotify_history_df.write_parquet(
    file=r'D:\Datasets\Data Werehouse\Top Spotify Listening History Songs in Countries\spotify_history.parquet',
    compression_level=22,
    compression='zstd',
)

In [28]:
# Persist the data-dictionary frame as Parquet with the same zstd settings
# as the history file. The path is split across two adjacent raw-string
# literals, which Python concatenates into a single path.
spotify_data_dictionary_description_df.write_parquet(
    file=(
        r'D:\Datasets\Data Werehouse\Top Spotify Listening History Songs in Countries\spotify_data_dictionary_description'
        r'.parquet'
    ),
    compression_level=22,
    compression='zstd',
)

# Data analysis

In [5]:
# Parquet file produced by the feature-engineering section; this is the
# input to the analysis below.
df_path = (
    r'D:\Datasets\Data Werehouse\Top Spotify Listening History Songs in Countries\spotify_history.parquet'
)

In [6]:
# Load the cleaned listening history back from Parquet.
df = pl.read_parquet(source=df_path)

In [9]:
# Preview the loaded frame; the rendered output shows the first and last rows
# with the middle elided. Kept as a bare expression so the notebook renders it.
df

spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
str,datetime[μs],str,u32,str,str,str,str,str,bool,bool
"""2J3n32GeLmMjwuAzyhcSNe""",2013-07-08 02:44:34,"""web player""",3185,"""Say It, Just Say It""","""The Mowgli's""","""Waiting For The Dawn""","""autoplay""","""clickrow""",false,false
"""1oHxIPqJyvAYHy0PVrDU98""",2013-07-08 02:45:37,"""web player""",61865,"""Drinking from the Bottle (feat…","""Calvin Harris""","""18 Months""","""clickrow""","""clickrow""",false,false
"""487OPlneJNni3NWC8SYqhW""",2013-07-08 02:50:24,"""web player""",285386,"""Born To Die""","""Lana Del Rey""","""Born To Die - The Paradise Edi…","""clickrow""","""unknown""",false,false
"""5IyblF777jLZj1vGHG2UD3""",2013-07-08 02:52:40,"""web player""",134022,"""Off To The Races""","""Lana Del Rey""","""Born To Die - The Paradise Edi…","""trackdone""","""clickrow""",false,false
"""0GgAAB0ZMllFhbNc3mAodO""",2013-07-08 03:17:52,"""web player""",0,"""Half Mast""","""Empire Of The Sun""","""Walking On A Dream""","""clickrow""","""nextbtn""",false,false
…,…,…,…,…,…,…,…,…,…,…
"""4Fz1WWr5o0OrlIcZxcyZtK""",2024-12-15 23:06:19,"""android""",1247,"""On The Way Home""","""John Mayer""","""Paradise Valley""","""fwdbtn""","""fwdbtn""",true,true
"""0qHMhBZqYb99yhX9BHcIkV""",2024-12-15 23:06:21,"""android""",1515,"""Magical Mystery Tour - Remaste…","""The Beatles""","""Magical Mystery Tour""","""fwdbtn""","""fwdbtn""",true,true
"""0HHdujGjOZChTrl8lJWEIq""",2024-12-15 23:06:22,"""android""",1283,"""Stop This Train - Live at the …","""John Mayer""","""Where the Light Is: John Mayer…","""fwdbtn""","""fwdbtn""",true,true
"""7peh6LUcdNPcMdrSH4JPsM""",2024-12-15 23:06:23,"""android""",1306,"""I Don't Trust Myself (With Lov…","""John Mayer""","""Continuum""","""fwdbtn""","""fwdbtn""",true,true


In [8]:
# Confirm the dtypes survived the Parquet round trip (ts as Datetime,
# ms_played as UInt32).
df.schema

Schema([('spotify_track_uri', String),
        ('ts', Datetime(time_unit='us', time_zone=None)),
        ('platform', String),
        ('ms_played', UInt32),
        ('track_name', String),
        ('artist_name', String),
        ('album_name', String),
        ('reason_start', String),
        ('reason_end', String),
        ('shuffle', Boolean),
        ('skipped', Boolean)])