In [2]:
import pandas as pd

# Load the raw dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='dataset.csv')


ModuleNotFoundError: No module named 'pandas'

In [8]:
# Parse the album release date column into datetimes; values that cannot be
# parsed are coerced to NaT instead of raising.
release_dates = pd.to_datetime(data['track_album_release_date'], errors='coerce')
data['track_album_release_date'] = release_dates

# Keep only the release year, stored as a nullable integer so rows with a
# missing date show up as <NA>.
data['year'] = release_dates.dt.year.astype('Int64')
data['year']

0        2001
1        2017
2        2005
3        2012
4        2019
         ... 
41064    <NA>
41065    <NA>
41066    <NA>
41067    <NA>
41068    <NA>
Name: year, Length: 41069, dtype: Int64

In [2]:
# Discard every row whose target value (track_popularity) is missing, then
# show how many rows remain.
data = data.dropna(axis=0, how='any', subset=['track_popularity'])
len(data)

40721

In [3]:
from sklearn.model_selection import train_test_split # splits training from real data 
from sklearn.pipeline import Pipeline
# Pipeline chains processing steps; ColumnTransformer applies transformations to different column subsets
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from  sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor  # You can swap with any regressor
from sklearn.metrics import mean_squared_error


# Replace missing lyrics with empty strings so the TF-IDF step only sees text.
data['lyrics'] = data['lyrics'].fillna('')

# Feature groups used to predict track_popularity.
numerical_features = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
categorical_features = 'track_album_release_date'
text_feature = 'lyrics'
target = 'track_popularity'
encode_artist = 'track_artist'

# Lyrics -> TF-IDF matrix (vocabulary capped at 5000 terms) -> dense array.
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_features),
        # The text column is selected by bare name (not a list) so the
        # vectorizer receives a 1-D sequence of documents.
        ('text', text_pipeline, text_feature),
        # handle_unknown='ignore': a release date that appears only in the test
        # split is encoded as all zeros instead of raising at predict time
        # (the default, handle_unknown='error', raises on unseen categories).
        ('categorical', OneHotEncoder(handle_unknown='ignore'), [categorical_features]),
        ('artist', TargetEncoder(), [encode_artist])
    ],
    # Always stack the transformer outputs into a dense array: the one-hot
    # block is sparse, and HistGradientBoostingRegressor needs dense input.
    sparse_threshold=0,
)

X = data[numerical_features + [text_feature, categorical_features, encode_artist]]
y = data[target]

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # Alternative regressor:
        #('regressor', RandomForestRegressor(n_estimators=1000, random_state=42))
        ('regressor', HistGradientBoostingRegressor(random_state=42))
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fitting on the training split only keeps the TF-IDF vocabulary, the scaler
# statistics and the artist target encoding from seeing the held-out rows.
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.2f}")


: 