In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import joblib

df = pd.read_csv('/Users/sa25/Desktop/music-rec-algorithm-TLAB/data/train.csv')

df.shape

(28362, 24)

In [2]:
df.columns

Index(['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre',
       'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'sadness', 'feelings', 'topic', 'age'],
      dtype='object')

In [3]:
# Drop columns that do not help define clusters

df_cleaned = df[[
    #'Unnamed: 0', 'artist_name', 'track_name', 
    'release_date', 'genre',
    #'lyrics', 
    'len', 'dating', 'violence', 'world/life', 'night/time',
    'shake the audience', 'family/gospel', 'romantic', 'communication',
    'obscene', 'music', 'movement/places', 'light/visual perceptions',
    'family/spiritual', 'sadness', 'feelings', 'topic', 
    #'age'
    ]].copy()

In [4]:
df_cleaned.columns

Index(['release_date', 'genre', 'len', 'dating', 'violence', 'world/life',
       'night/time', 'shake the audience', 'family/gospel', 'romantic',
       'communication', 'obscene', 'music', 'movement/places',
       'light/visual perceptions', 'family/spiritual', 'sadness', 'feelings',
       'topic'],
      dtype='object')

In [5]:
df_cleaned.dtypes

release_date                  int64
genre                        object
len                           int64
dating                      float64
violence                    float64
world/life                  float64
night/time                  float64
shake the audience          float64
family/gospel               float64
romantic                    float64
communication               float64
obscene                     float64
music                       float64
movement/places             float64
light/visual perceptions    float64
family/spiritual            float64
sadness                     float64
feelings                    float64
topic                        object
dtype: object

In [6]:
# Check for % of Null/missing values

df_cleaned.isnull().mean()

release_date                0.0
genre                       0.0
len                         0.0
dating                      0.0
violence                    0.0
world/life                  0.0
night/time                  0.0
shake the audience          0.0
family/gospel               0.0
romantic                    0.0
communication               0.0
obscene                     0.0
music                       0.0
movement/places             0.0
light/visual perceptions    0.0
family/spiritual            0.0
sadness                     0.0
feelings                    0.0
topic                       0.0
dtype: float64

In [7]:
# Flag outliers using IQR filtering
df_flagged = df_cleaned.copy()

# Loop through all numeric columns
for col in df_flagged.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df_flagged[col].quantile(0.25)
    Q3 = df_flagged[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

# Create new column that flags outliers (1 = outlier, 0 = normal)
df_flagged[f'{col}_outlier'] = ((df_flagged[col] < lower_bound) | (df_flagged[col] > upper_bound)).astype(int)

In [8]:
# Select only numeric features for clustering
numeric_cols = df_flagged.select_dtypes(include=['float64', 'int64']).columns

# Normalize data
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_flagged[numeric_cols]), columns=numeric_cols)

In [9]:
# Save the scaler model
joblib.dump(scaler, 'scaler_model.joblib')

['scaler_model.joblib']

In [10]:
# List all outlier flags
outlier_flags = [col for col in df_flagged.columns if '_outlier' in col]
print("Outlier flags:", outlier_flags)

# Check how many outliers were flagged per feature
for flag in outlier_flags:
    count = df_flagged[flag].sum()
    print(f"{flag}: {count} outliers")

Outlier flags: ['feelings_outlier']
feelings_outlier: 3140 outliers


In [11]:
df_model = df_flagged.copy()

In [12]:
joblib.dump(numeric_cols, 'numeric_cols.joblib')

['numeric_cols.joblib']

In [13]:
# Save final DataFrame as new CSV file
df_model.to_csv('cleaned_dataset.csv', index=False)