In [1]:
import pandas as pd

class SpotifyDataProcessor:
    def __init__(self, url: str):
        self.url = url
        self.df = None

    # Step 1: Load Data
    def load_data(self):
        self.df = pd.read_csv(self.url)
        return self

    # Step 2: Convert date + extract year
    def process_dates(self):
        self.df["track_album_release_date"] = pd.to_datetime(self.df["track_album_release_date"], errors="coerce")
        self.df["release_year"] = self.df["track_album_release_date"].dt.year.astype("Int64")
        return self

    # Step 3: Show songs per year
    def songs_per_year(self):
        return self.df["release_year"].value_counts().sort_index()

    # Step 4: Drop unnecessary columns
    def drop_columns(self):
        self.df = self.df.drop(columns=["track_id", "track_album_id", "playlist_id", "track_album_release_date"])
        return self

    # Step 5: Filter by year (>= 2015)
    def filter_by_year(self, year=2015):
        self.df = self.df[self.df["release_year"] >= year]
        return self

    # Step 6: Fill missing values
    def handle_missing(self):
        self.df = self.df.fillna("Unknown")
        return self

    # Step 7: Convert duration to seconds
    def convert_duration(self):
        self.df["duration_sec"] = (self.df["duration_ms"] / 1000).round(2)
        self.df = self.df.drop(columns=["duration_ms"])
        return self

    # Step 8: Get duration stats
    def duration_stats(self):
        return self.df["duration_sec"].describe()

    # Step 9: Check missing values
    def null_summary(self):
        return self.df.isnull().sum()

    # Step 10: Return final cleaned data
    def get_clean_data(self):
        return self.df


# Run directly from the same file
if __name__ == "__main__":
    URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv"
    
    processor = (
        SpotifyDataProcessor(URL)
        .load_data()
        .process_dates()
    )

    print("Songs Per Year:\n", processor.songs_per_year())

    processor = (
        processor
        .drop_columns()
        .filter_by_year(2015)
        .handle_missing()
        .convert_duration()
    )

    clean_df = processor.get_clean_data()

    print("\nMissing Values:\n", processor.null_summary())
    print("\nDuration Stats:\n", processor.duration_stats())
    print("\nCleaned Data (Top 5 Rows):")
    print(clean_df.head())


Songs Per Year:
 release_year
1957       1
1958       1
1961       1
1963       4
1964       8
        ... 
2016    2114
2017    2426
2018    3312
2019    9079
2020     785
Name: count, Length: 61, dtype: Int64

Missing Values:
 track_name           0
track_artist         0
track_popularity     0
track_album_name     0
playlist_name        0
playlist_genre       0
playlist_subgenre    0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
release_year         0
duration_sec         0
dtype: int64

Duration Stats:
 count    19477.000000
mean       207.140670
std         51.194689
min         31.430000
25%        177.120000
50%        201.390000
75%        227.870000
max        517.810000
Name: duration_sec, dtype: float64

Cleaned Data (Top 5 Rows):
                                         