In [1]:
import pandas as pd
data = pd.read_csv('./dataset/cleaned_music_reviews.csv')
data.isna().sum()

data.head()

Unnamed: 0,Review,Rating,Cleaned_Review
0,i think i actually under-rate ok computer if a...,5.0,think actually underrate ok computer anything ...
1,i get why radiohead rub a lot of people the wr...,5.0,get radiohead rub lot people wrong way lot peo...
2,i would like to think i am good about not lett...,4.5,would like think good letting wider critical w...
3,there are radiohead devotees like there were o...,4.0,radiohead devotee like bowie devotee find unex...
4,i wrote a shining excellent review for this al...,5.0,wrote shining excellent review album browser w...


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    max_df=0.8,
    min_df=5,
    sublinear_tf=True 
)

X = tfidf_vectorizer.fit_transform(data['Cleaned_Review'])
vocab = tfidf_vectorizer.get_feature_names_out()


In [5]:
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Original unbalanced data
# X = tfidf_vectorizer.fit_transform(data['Cleaned_Review'])  # Use original unbalanced data
y = data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inverse frequency weighting
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train, sample_weight=sample_weights)
y_pred = model.predict(X_test)
print("MSE : ",mean_squared_error(y_test,y_pred))
print("r2 : ",r2_score(y_test,y_pred))


MSE :  0.640363531210197
r2 :  0.1451905572738471


In [6]:
from collections import Counter
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Print class distribution of training and testing data
print("Training set class distribution:")
print(Counter(y_train))
print("\nTesting set class distribution:")
print(Counter(y_test))

# Bin ratings for SMOTE (since SMOTE is for classification, not regression)
# We'll use a regression-friendly approach: bin the ratings, oversample, then train on original y
from imblearn.over_sampling import SMOTE

# Bin ratings for SMOTE
y_train_binned = pd.cut(y_train, bins=[0,2,3,4,5], labels=[0,1,2,3], include_lowest=True)
sm = SMOTE(random_state=42)
X_res, y_res_binned = sm.fit_resample(X_train, y_train_binned)

# Map binned labels back to original y_train values for regression
# We'll use the mean rating in each bin as the target
bin_means = y_train.groupby(y_train_binned).mean()
y_res = y_res_binned.map(lambda b: bin_means.loc[b])

# Train a Ridge regressor for improved r2 score
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(X_res, y_res)
y_pred_ridge = ridge.predict(X_test)

print("\nRidge Regression Results after SMOTE oversampling:")
print("MSE : ", mean_squared_error(y_test, y_pred_ridge))
print("r2 : ", r2_score(y_test, y_pred_ridge))


Training set class distribution:
Counter({5.0: 23509, 4.5: 14155, 4.0: 11301, 3.5: 5617, 3.0: 3559, 2.5: 1765, 2.0: 1121, 1.5: 503, 1.0: 438, 0.5: 304})

Testing set class distribution:
Counter({5.0: 5886, 4.5: 3573, 4.0: 2852, 3.5: 1394, 3.0: 857, 2.5: 436, 2.0: 266, 1.5: 131, 0.5: 91, 1.0: 83})


  bin_means = y_train.groupby(y_train_binned).mean()



Ridge Regression Results after SMOTE oversampling:
MSE :  0.7029892708414007
r2 :  0.061592614878076324
