# 1.0 | Imports


## 1.1.0 | Import libraries

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sys
import time
import joblib



## 1.2.0 | Import dataset

In [None]:
# Song-level feature table; its MYCS column is later used as the regression target.
df = pd.read_csv('final_clean_to_do_interaction_features.csv')
# Genre/Word pairs used below to derive the per-genre intensity features.
top_words = pd.read_csv('top_words.csv')

In [None]:
# Plain-text listing of each column's dtype (pandas abbreviates the middle of the wide frame).
print(df.dtypes)

song                                  object
artist                                object
year                                  object
ranking                               object
MYCS                                 float64
                                      ...   
section_11                             int64
chorus_sentiment_shift               float64
repeated_phrase_intensity            float64
bigram_repeated_phrase_intensity     float64
trigram_repeated_phrase_intensity    float64
Length: 68, dtype: object


In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned', 'lyrics',
       'word_count', 'char_count', 'unique_word_count', 'unique_words',
       'distinct_word_count', 'avg_word_length', 'stopword_ratio', 'pos_ratio',
       'neg_ratio', 'neu_ratio', 'compound', 'lexical_diversity',
       'syllable_count', 'rhyme_pairs', 'rhyme_density', 'stopword_count',
       'distinct_stopword_count', 'stopword_repetition_ratio',
       'true_repetition_ratio', 'vocab_redundancy_ratio', 'first_person',
       'second_person', 'third_person', 'male_pronouns', 'female_pronouns',
       'total_pronouns', 'pronoun_word_ratio', 'first_person_ratio',
       'second_person_ratio', 'third_person_ratio', 'male_pronoun_ratio',
       'female_pronoun_ratio', 'verse_count', 'intro_count', 'outro_count',
       'bridge_count', 'chorus_count', 'prechorus_count', 'postchorus_count',
       'total_section_count', 'chorus_ratio', 'prechorus_ratio', 'verse_ratio',
       'bridge_ratio', 'pattern', 'se

In [None]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,song,artist,year,ranking,MYCS,lyrics_cleaned,lyrics,word_count,char_count,unique_word_count,...,section_6,section_7,section_8,section_9,section_10,section_11,chorus_sentiment_shift,repeated_phrase_intensity,bigram_repeated_phrase_intensity,trigram_repeated_phrase_intensity
0,Blinding Lights,The Weeknd,"[2020, 2021]","[1, 3]",2.376,yeah ive been tryna call ive been on my own fo...,247 ContributorsTranslationsTürkçeSvenskaEspañ...,261,1174,43,...,3,4,3,5,6,0,-0.152558,0.226027,0.228669,0.226027
1,how do i live,leann rimes,"[1997, 1998]","[9, 5]",2.256,how do i get through one night without you if...,,279,1212,27,...,0,0,0,0,0,0,,,,
2,Stay,Justin Bieber,"[2021, 2022]","[12, 3]",2.244,i do the same thing i told you that i never wo...,166 ContributorsTranslationsTürkçeEspañolPortu...,423,1790,42,...,3,6,0,0,0,0,0.191633,0.145125,0.133484,0.145125
3,All I Want for Christmas Is You,Mariah Carey,"[2020, 2021, 2022, 2023]","[67, 78, 65, 55]",2.224,i dont want a lot for christmas there is just ...,186 ContributorsTranslationsEspañolTürkçeDeuts...,391,1843,61,...,1,3,6,0,0,0,0.86392,0.151442,0.179856,0.151442
4,Heat Waves,Glass Animals,"[2021, 2022]","[16, 1]",2.22,last night all i think about is you dont stop ...,171 ContributorsTranslationsItalianoDeutschFra...,398,2065,68,...,4,3,6,0,0,0,-1.313375,0.095672,0.106818,0.095672


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(5618, 68)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5618 entries, 0 to 5617
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   song                               5618 non-null   object 
 1   artist                             5618 non-null   object 
 2   year                               5618 non-null   object 
 3   ranking                            5618 non-null   object 
 4   MYCS                               5618 non-null   float64
 5   lyrics_cleaned                     5436 non-null   object 
 6   lyrics                             5395 non-null   object 
 7   word_count                         5618 non-null   int64  
 8   char_count                         5618 non-null   int64  
 9   unique_word_count                  5618 non-null   int64  
 10  unique_words                       5618 non-null   object 
 11  distinct_word_count                5618 non-null   int64

Unnamed: 0,song,artist,year,ranking,MYCS,lyrics_cleaned,lyrics,word_count,char_count,unique_word_count,...,section_6,section_7,section_8,section_9,section_10,section_11,chorus_sentiment_shift,repeated_phrase_intensity,bigram_repeated_phrase_intensity,trigram_repeated_phrase_intensity
0,Blinding Lights,The Weeknd,"[2020, 2021]","[1, 3]",2.376,yeah ive been tryna call ive been on my own fo...,247 ContributorsTranslationsTürkçeSvenskaEspañ...,261,1174,43,...,3,4,3,5,6,0,-0.152558,0.226027,0.228669,0.226027
1,how do i live,leann rimes,"[1997, 1998]","[9, 5]",2.256,how do i get through one night without you if...,,279,1212,27,...,0,0,0,0,0,0,,,,
2,Stay,Justin Bieber,"[2021, 2022]","[12, 3]",2.244,i do the same thing i told you that i never wo...,166 ContributorsTranslationsTürkçeEspañolPortu...,423,1790,42,...,3,6,0,0,0,0,0.191633,0.145125,0.133484,0.145125
3,All I Want for Christmas Is You,Mariah Carey,"[2020, 2021, 2022, 2023]","[67, 78, 65, 55]",2.224,i dont want a lot for christmas there is just ...,186 ContributorsTranslationsEspañolTürkçeDeuts...,391,1843,61,...,1,3,6,0,0,0,0.86392,0.151442,0.179856,0.151442
4,Heat Waves,Glass Animals,"[2021, 2022]","[16, 1]",2.22,last night all i think about is you dont stop ...,171 ContributorsTranslationsItalianoDeutschFra...,398,2065,68,...,4,3,6,0,0,0,-1.313375,0.095672,0.106818,0.095672


In [None]:
# Preview the genre/word lookup table.
top_words.head(5)

Unnamed: 0,Genre,Word
0,rap,shit
1,rap,bitch
2,rap,fuck
3,rap,nigga
4,rap,back


# 2.0 | Preprocessing

In [None]:
# Work on a copy so the loaded frame `df` is not modified when new feature columns are added.
df2 = df.copy()

## 2.1.0 | Update dataset with new features

Gets the genres and adds one `<genre>_intensity` feature per genre to the dataset. A song's intensity for a genre measures how often that genre's top 100 words occur in the song's lyrics, relative to the number of words in the lyrics.

In [None]:
# Distinct genre labels, in order of first appearance in top_words
unique_top_words = top_words['Genre'].unique().tolist()

unique_top_words


['rap', 'misc', 'pop', 'rock', 'rb', 'country']

In [None]:
# Function to get the genre intensity
def get_genre_intensity(lyrics, genre_words):
    """Return the share of a song's words that belong to a genre's word list.

    The lyrics are split on whitespace and every token that exactly equals one
    of ``genre_words`` is counted; the count is divided by the total number of
    tokens. Matching is done on whole tokens, so "back" does not match inside
    "backward" (a plain ``str.count`` would count such substrings and inflate
    the score).

    Parameters
    ----------
    lyrics : str or missing value
        Lyrics text. ``None``/NaN (missing lyrics) yield an intensity of 0
        instead of being scored as the literal text "nan". Other non-string
        values are converted with ``str()``.
    genre_words : iterable of str
        Words associated with the genre. Entries are expected to be single
        tokens; they are compared as-is (no case folding is applied here).

    Returns
    -------
    float
        ``matches / number_of_tokens``, or 0.0 when there are no tokens.
    """
    # Missing lyrics arrive as None/NaN; NaN is the only float that differs from itself.
    if lyrics is None or (isinstance(lyrics, float) and lyrics != lyrics):
        return 0.0

    tokens = str(lyrics).split()
    if not tokens:
        return 0.0

    # Tally each token once so every genre word is an O(1) lookup.
    token_counts = {}
    for token in tokens:
        token_counts[token] = token_counts.get(token, 0) + 1

    genre_word_count = sum(token_counts.get(word, 0) for word in genre_words)
    return genre_word_count / len(tokens)

In [None]:
# Creates the new features based on genre intensity: one '<genre>_intensity' column per genre
for genre in unique_top_words:
    # Build the genre's word list once per genre; it does not depend on the row.
    genre_words = top_words.loc[top_words['Genre'] == genre, 'Word'].tolist()
    col_header = f'{genre}_intensity'
    # Only the lyrics column is needed, so apply over that Series instead of every full row.
    df2[col_header] = df2['lyrics_cleaned'].apply(get_genre_intensity, args=(genre_words,))

In [None]:
# Inspect the column labels after adding the genre intensity features.
df2.columns

Index(['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned', 'lyrics',
       'word_count', 'char_count', 'unique_word_count', 'unique_words',
       'distinct_word_count', 'avg_word_length', 'stopword_ratio', 'pos_ratio',
       'neg_ratio', 'neu_ratio', 'compound', 'lexical_diversity',
       'syllable_count', 'rhyme_pairs', 'rhyme_density', 'stopword_count',
       'distinct_stopword_count', 'stopword_repetition_ratio',
       'true_repetition_ratio', 'vocab_redundancy_ratio', 'first_person',
       'second_person', 'third_person', 'male_pronouns', 'female_pronouns',
       'total_pronouns', 'pronoun_word_ratio', 'first_person_ratio',
       'second_person_ratio', 'third_person_ratio', 'male_pronoun_ratio',
       'female_pronoun_ratio', 'verse_count', 'intro_count', 'outro_count',
       'bridge_count', 'chorus_count', 'prechorus_count', 'postchorus_count',
       'total_section_count', 'chorus_ratio', 'prechorus_ratio', 'verse_ratio',
       'bridge_ratio', 'pattern', 'se

In [None]:
# Substring match on '_intensity': besides the new genre columns this also selects
# the existing *repeated_phrase_intensity columns, as the displayed table shows.
intensity_columns = [col for col in df2.columns if '_intensity' in col]
df2[intensity_columns]


Unnamed: 0,repeated_phrase_intensity,bigram_repeated_phrase_intensity,trigram_repeated_phrase_intensity,rap_intensity,misc_intensity,pop_intensity,rock_intensity,rb_intensity,country_intensity
0,0.226027,0.228669,0.226027,0.157088,0.145594,0.187739,0.187739,0.168582,0.195402
1,,,,0.218638,0.247312,0.279570,0.279570,0.272401,0.272401
2,0.145125,0.133484,0.145125,0.210402,0.191489,0.219858,0.217494,0.238771,0.217494
3,0.151442,0.179856,0.151442,0.097187,0.092072,0.122762,0.104859,0.117647,0.117647
4,0.095672,0.106818,0.095672,0.128141,0.097990,0.110553,0.108040,0.103015,0.120603
...,...,...,...,...,...,...,...,...,...
5613,0.149466,0.152482,0.149466,0.137097,0.181452,0.237903,0.213710,0.209677,0.189516
5614,0.247619,0.228029,0.247619,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5615,0.121662,0.130178,0.121662,0.169184,0.154079,0.244713,0.238671,0.232628,0.238671
5616,0.151365,0.153465,0.151365,0.175000,0.171875,0.187500,0.171875,0.178125,0.178125


In [None]:
# Column labels once more before modelling (same expression as the earlier df2.columns cell).
df2.columns

Index(['song', 'artist', 'year', 'ranking', 'MYCS', 'lyrics_cleaned', 'lyrics',
       'word_count', 'char_count', 'unique_word_count', 'unique_words',
       'distinct_word_count', 'avg_word_length', 'stopword_ratio', 'pos_ratio',
       'neg_ratio', 'neu_ratio', 'compound', 'lexical_diversity',
       'syllable_count', 'rhyme_pairs', 'rhyme_density', 'stopword_count',
       'distinct_stopword_count', 'stopword_repetition_ratio',
       'true_repetition_ratio', 'vocab_redundancy_ratio', 'first_person',
       'second_person', 'third_person', 'male_pronouns', 'female_pronouns',
       'total_pronouns', 'pronoun_word_ratio', 'first_person_ratio',
       'second_person_ratio', 'third_person_ratio', 'male_pronoun_ratio',
       'female_pronoun_ratio', 'verse_count', 'intro_count', 'outro_count',
       'bridge_count', 'chorus_count', 'prechorus_count', 'postchorus_count',
       'total_section_count', 'chorus_ratio', 'prechorus_ratio', 'verse_ratio',
       'bridge_ratio', 'pattern', 'se

# 3.0 | Run Model

## 3.1.0 | Separate numeric/non-numeric columns

In [None]:
# Partition df2's column labels by dtype: numeric vs. everything else
numeric_cols = list(df2.select_dtypes(include=np.number).columns)
non_numeric_cols = list(df2.select_dtypes(exclude=np.number).columns)

# One print call, newline-separated, emitting the same text as before
print("Non-numeric columns:", non_numeric_cols, "\nNumeric columns:", sep="\n")
numeric_cols


Non-numeric columns:
['song', 'artist', 'year', 'ranking', 'lyrics_cleaned', 'lyrics', 'unique_words', 'pattern']

Numeric columns:


['MYCS',
 'word_count',
 'char_count',
 'unique_word_count',
 'distinct_word_count',
 'avg_word_length',
 'stopword_ratio',
 'pos_ratio',
 'neg_ratio',
 'neu_ratio',
 'compound',
 'lexical_diversity',
 'syllable_count',
 'rhyme_pairs',
 'rhyme_density',
 'stopword_count',
 'distinct_stopword_count',
 'stopword_repetition_ratio',
 'true_repetition_ratio',
 'vocab_redundancy_ratio',
 'first_person',
 'second_person',
 'third_person',
 'male_pronouns',
 'female_pronouns',
 'total_pronouns',
 'pronoun_word_ratio',
 'first_person_ratio',
 'second_person_ratio',
 'third_person_ratio',
 'male_pronoun_ratio',
 'female_pronoun_ratio',
 'verse_count',
 'intro_count',
 'outro_count',
 'bridge_count',
 'chorus_count',
 'prechorus_count',
 'postchorus_count',
 'total_section_count',
 'chorus_ratio',
 'prechorus_ratio',
 'verse_ratio',
 'bridge_ratio',
 'section_0',
 'section_1',
 'section_2',
 'section_3',
 'section_4',
 'section_5',
 'section_6',
 'section_7',
 'section_8',
 'section_9',
 'section

In [None]:
# Keep only the numeric columns for training (drops text columns such as song,
# artist and lyrics). MYCS is still part of df3 here and is split off as the target later.
df3 = df2.select_dtypes(include=np.number)


## 3.2.0 | Run Models

In this instance, 3 models are compared:
- LightGBM
- Linear Regression
- Random Forest

In [None]:
# Compare LightGBM, Linear Regression and Random Forest with 5-fold cross-validation.
# Features are every numeric column except the target MYCS; the metric is RMSE.
X = df3.drop('MYCS', axis=1)
y = df3['MYCS']

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

lgb_rmse_scores = []
lr_rmse_scores = []
rf_rmse_scores = []

for train_index, test_index in kf.split(X):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Learn the imputation means from the training fold only and reuse them on the
    # held-out fold. Fitting the imputer on the full dataset before splitting would
    # leak test-fold statistics into training.
    imputer = SimpleImputer(strategy='mean')
    X_train = pd.DataFrame(
        imputer.fit_transform(X.iloc[train_index]),
        index=X.index[train_index],
        columns=X.columns,
    )
    X_test = pd.DataFrame(
        imputer.transform(X.iloc[test_index]),
        index=X.index[test_index],
        columns=X.columns,
    )

    # Fresh, unfitted estimators for every fold.
    lgb_clf = lgb.LGBMRegressor()
    lr_clf = LinearRegression()
    rf_clf = RandomForestRegressor(random_state=42)

    # Same fit / predict / score routine for each model; scores go to that model's list.
    for label, model, fold_scores in (
        ("LightGBM", lgb_clf, lgb_rmse_scores),
        ("Linear Regression", lr_clf, lr_rmse_scores),
        ("Random Forest", rf_clf, rf_rmse_scores),
    ):
        model.fit(X_train, y_train)
        fold_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
        fold_scores.append(fold_rmse)
        print(f"{label} RMSE: {fold_rmse}")

    print("Completed Fold!!!")

print(f"LightGBM RMSE scores: {lgb_rmse_scores}")
print(f"LightGBM Mean RMSE: {np.mean(lgb_rmse_scores)}")
print(f"Linear Regression RMSE scores: {lr_rmse_scores}")
print(f"Linear Regression Mean RMSE: {np.mean(lr_rmse_scores)}")
print(f"Random Forest RMSE scores: {rf_rmse_scores}")
print(f"Random Forest Mean RMSE: {np.mean(rf_rmse_scores)}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7019
[LightGBM] [Info] Number of data points in the train set: 4494, number of used features: 59
[LightGBM] [Info] Start training from score 0.536377
LightGBM RMSE: 0.3143634644144405
Linear Regression RMSE: 0.31836922289888486
Random Forest RMSE: 0.3142622635764495
Completed Fold!!!
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001808 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7022
[LightGBM] [Info] Number of data points in the train set: 4494, number of used features: 59
[LightGBM] [Info] Start training from score 0.539738
LightGBM RMSE: 0.3024066764759681
Linear Regression RMS

Random Forest performed the best and as such training will be primarily performed on this model.

In [None]:
# Cross-validate a RandomForest on TF-IDF lyric features plus the numeric features.
# All preprocessing lives inside the pipeline, so it is re-fitted on each training fold.
lyrics_column = 'lyrics_cleaned'

X = df2.drop('MYCS', axis=1)
y = df2['MYCS']

# TfidfVectorizer rejects NaN documents, and lyrics_cleaned has missing entries
# (see the df.info() output), so represent missing lyrics as an empty document.
if lyrics_column in X.columns:
    X = X.assign(**{lyrics_column: X[lyrics_column].fillna('')})

numerical_features = X.select_dtypes(include=np.number).columns.tolist()

# Numeric columns only need mean imputation; tree models do not require scaling.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

tfidf_vectorizer = TfidfVectorizer(
    max_features=500,
    stop_words='english',
    ngram_range=(1,1)
)


transformers_list = []

# Check the feature frame X itself, which is what the pipeline (and the message) refer to.
if lyrics_column in X.columns:
    # A string (not a list) column selector hands the vectorizer a 1-D column of documents.
    transformers_list.append(('tfidf', tfidf_vectorizer, lyrics_column))
else:
    print(f"Warning: Lyrics column '{lyrics_column}' not found in X. TF-IDF will not be applied.")

if numerical_features:
    transformers_list.append(('num', numerical_pipeline, numerical_features))
else:
    print("Warning: No numerical features to apply numerical pipeline.")


if not transformers_list:
    # Raise instead of sys.exit(): in a notebook an ordinary exception stops the cell
    # with a readable traceback rather than requesting interpreter exit.
    raise ValueError("Error: No features to process (neither lyrics nor numerical features found/specified).")

preprocessor = ColumnTransformer(
    transformers=transformers_list,
    remainder='drop'  # remaining non-numeric columns (song, artist, ...) are not used
)


rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    max_depth=10,
    min_samples_leaf=5
)

full_pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

print("\n--- Evaluating RandomForestRegressor with 5-Fold Cross-Validation for 'MYCS' ---")

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


# Built-in scorer: returns the negated RMSE (higher is better), same value as the
# hand-written make_scorer(..., greater_is_better=False) variant.
rmse_scorer = 'neg_root_mean_squared_error'


# Negate to get positive RMSE values. cross_val_score fits clones of the pipeline,
# so full_pipeline_rf itself is still unfitted after this call.
rf_rmse_scores_cv = -cross_val_score(full_pipeline_rf, X, y, cv=kf, scoring=rmse_scorer, n_jobs=-1)


print(f"RandomForestRegressor RMSE scores for each fold: {rf_rmse_scores_cv}")
print(f"RandomForestRegressor Mean RMSE: {np.mean(rf_rmse_scores_cv):.4f}")
print(f"RandomForestRegressor Std Dev of RMSE: {np.std(rf_rmse_scores_cv):.4f}")




--- Evaluating RandomForestRegressor with 5-Fold Cross-Validation for 'MYCS' ---
RandomForestRegressor RMSE scores for each fold: [0.307412   0.29334683 0.28014747 0.31538068 0.28855666]
RandomForestRegressor Mean RMSE: 0.2970
RandomForestRegressor Std Dev of RMSE: 0.0128


In [None]:
# Spread of the target, as a yardstick for the RMSE values above
mycs_std = df3['MYCS'].std()
print(f"Standard Deviation of MYCS: {mycs_std}")


Standard Deviation of MYCS: 0.3429838006042616


## 3.3.0 | Export model

Now that the model has been trained, the model has to be exported

In [None]:
# Export the model that was actually evaluated: the TF-IDF + numeric RandomForest pipeline.
# cross_val_score only fits clones, so the pipeline is fitted here on the full dataset first.
# (rf_clf is merely the default-parameter forest left over from the last fold of the
# model comparison loop, trained on 4/5 of the data and on numeric features only.)
# Missing lyrics are passed as empty documents because TfidfVectorizer rejects NaN.
full_pipeline_rf.fit(X.fillna({lyrics_column: ''}), y)
joblib.dump(full_pipeline_rf, 'rf_model.joblib')

['rf_model.joblib']