## Exploraty Data Analysis

In [38]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt_tab')

## Loading Data

In [2]:
df = pd.read_csv('./dataset/music_album_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,i think i actually under-rate ok computer if a...,5.0
1,i get why radiohead rub a lot of people the wr...,5.0
2,i would like to think i am good about not lett...,4.5
3,there are radiohead devotees like there were o...,4.0
4,i wrote a shining excellent review for this al...,5.0


In [8]:
print(f"Dataset size: {len(df)}")
print(f"Rating distribution:\n{df['Rating'].value_counts()}")

Dataset size: 80271
Rating distribution:
Rating
5.0    29534
4.5    17793
4.0    14213
3.5     7048
3.0     4430
2.5     2210
2.0     1396
1.5      640
1.0      525
0.5      398
Name: count, dtype: int64


In [9]:
df.isna().sum()

Review      26
Rating    2084
dtype: int64

In [10]:
#drop null values
df.dropna(inplace=True)

In [46]:
#function which tokenizes the text,lowercase the text, remove stopwords, and lemmatize the text 
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Keep !? for sentiment
    words = word_tokenize(text)
    stop_words = list(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [12]:
df['Cleaned_Review'] = df['Review'].apply(preprocess_text)
df.head()

Unnamed: 0,Review,Rating,Cleaned_Review
0,i think i actually under-rate ok computer if a...,5.0,think actually underrate ok computer anything ...
1,i get why radiohead rub a lot of people the wr...,5.0,get radiohead rub lot people wrong way lot peo...
2,i would like to think i am good about not lett...,4.5,would like think good letting wider critical w...
3,there are radiohead devotees like there were o...,4.0,radiohead devotee like bowie devotee find unex...
4,i wrote a shining excellent review for this al...,5.0,wrote shining excellent review album browser w...


In [35]:
(df['Review'] == '').sum()

0

In [36]:
(df['Cleaned_Review'] == '').sum()

321

In [37]:
#remove the empty strings in cleaned_review column
df = df[df['Cleaned_Review'] != '']

In [38]:
(df['Cleaned_Review'] == '').sum()

0

In [2]:
cleaned_csv = df.to_csv('./dataset/cleaned_music_reviews.csv',index=False)

NameError: name 'df' is not defined

## Loading Cleaned Data

In [39]:
data = pd.read_csv('./dataset/cleaned_music_reviews.csv')
data.isna().sum()

Review            0
Rating            0
Cleaned_Review    0
dtype: int64

In [53]:
print("Class distribution of the cleaned dataset (before balancing):")
print(data['Rating'].value_counts())

# Handle class imbalance by oversampling reviews with lower ratings
# Find the maximum class count
max_count = data['Rating'].value_counts().max()

# For each unique rating, resample to match the max_count
from sklearn.utils import resample

balanced_data = []
for rating in data['Rating'].unique():
    subset = data[data['Rating'] == rating]
    if len(subset) < max_count:
        # Oversample (with replacement) to match max_count
        subset_upsampled = resample(subset, 
                                    replace=True, 
                                    n_samples=max_count, 
                                    random_state=42)
        balanced_data.append(subset_upsampled)
    else:
        balanced_data.append(subset)

# Concatenate all upsampled subsets
data_balanced = pd.concat(balanced_data)

print("\nClass distribution after balancing (oversampling lower ratings):")
print(data_balanced['Rating'].value_counts())

# Use data_balanced for further processing
data = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)


Class distribution of the cleaned dataset (before balancing):
Rating
5.0    29395
4.5    17728
4.0    14153
3.5     7011
3.0     4416
2.5     2201
2.0     1387
1.5      634
1.0      521
0.5      395
Name: count, dtype: int64

Class distribution after balancing (oversampling lower ratings):
Rating
5.0    29395
4.5    29395
4.0    29395
2.5    29395
2.0    29395
3.0    29395
1.5    29395
3.5    29395
0.5    29395
1.0    29395
Name: count, dtype: int64


In [62]:
data_balanced['Rating'].value_counts()

Rating
5.0    29395
4.5    29395
4.0    29395
2.5    29395
2.0    29395
3.0    29395
1.5    29395
3.5    29395
0.5    29395
1.0    29395
Name: count, dtype: int64

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    max_df=0.8,
    min_df=5,
    sublinear_tf=True 
)

X2 = tfidf_vectorizer.fit_transform(data_balanced['Cleaned_Review'])
vocab2 = tfidf_vectorizer.get_feature_names_out()

In [65]:
y2 = data_balanced['Rating']

In [66]:
from sklearn.model_selection import train_test_split
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [71]:
from sklearn.linear_model import Ridge
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import mean_squared_error, r2_score


model2 = Ridge(alpha=1.0)
model2.fit(X_train_2,y_train_2)
y_pred = model2.predict(X_test_2)

mse = mean_squared_error(y_test_2,y_pred)
r2 = r2_score(y_test_2,y_pred)

print(f"MSE: {mse}")
print(f"R2: {r2}")


MSE: 0.09264052681406838
R2: 0.955095409038368


In [None]:
def predict_rating(review):
    # Preprocess
    processed_review = preprocess_text(review)

    # Transform
    review_vector = tfidf_vectorizer.transform([processed_review])

    # Predict
    rating = model2.predict(review_vector)[0]

    # Ensure rating is within original bounds
    min_rating, max_rating = data['Rating'].min(), data['Rating'].max()
    rating = np.clip(rating, min_rating, max_rating)

    return f"Predicted rating: {rating:.2f} (scale: {min_rating}-{max_rating})"

# Test cases
test_reviews = [
    "The album was a masterpiece from start to finish",
    "Some good tracks but overall disappointing",
    "The vocals were amazing, though the production quality ruined it",
    "Mediocre at best - nothing special",
    "This album changed my life! Perfect in every way",
    "This album was the worst thing I heard in my life, Death to the artist and the producer, disgusting, awful, bad , waste of time",
    "A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.",
    "Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.",
    "The production is lush and detailed, but the lyrics are pretentious and the melodies forgettable, making for a frustrating listen."
]
# Additional test cases expected to be below 1 rating
test_reviews += [
    "Absolutely terrible album, not a single redeeming quality. I regret listening to it.",
    "Horrible in every way, the worst music I've ever heard.",
    "Unbearable noise, couldn't finish a single track.",
    "A complete disaster, avoid at all costs.",
    "Painful to listen to, a total waste of time.",
    "Zero talent, zero effort, zero enjoyment.",
    "This album is an insult to music.",
    "If I could give it a zero, I would.",
    "The most disappointing and awful release of the year.",
    "Disgusting, offensive, and unlistenable."
]

for review in test_reviews:
    print(f"\nReview: {review}")
    print(predict_rating(review))


Review: The album was a masterpiece from start to finish
Predicted rating: 5.00 (scale: 0.5-5.0)

Review: Some good tracks but overall disappointing
Predicted rating: 3.02 (scale: 0.5-5.0)

Review: The vocals were amazing, though the production quality ruined it
Predicted rating: 2.98 (scale: 0.5-5.0)

Review: Mediocre at best - nothing special
Predicted rating: 1.87 (scale: 0.5-5.0)

Review: This album changed my life! Perfect in every way
Predicted rating: 5.00 (scale: 0.5-5.0)

Review: This album was the worst thing I heard in my life, Death to the artist and the producer, disgusting, awful, bad , waste of time
Predicted rating: 1.50 (scale: 0.5-5.0)

Review: A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.
Predicted rating: 3.33 (scale: 0.5-5.0)

Review: Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.
Predicted ra

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    max_df=0.8,
    min_df=5,
    sublinear_tf=True 
)

X = tfidf_vectorizer.fit_transform(data['Cleaned_Review'])
vocab = tfidf_vectorizer.get_feature_names_out()


In [41]:
vocab

array(['aa', 'aaaahs', 'aade', ..., 'zyskuje', 'zz', 'zz top'],
      dtype=object)

In [42]:
# Create bins that make sense for your distribution
rating_bins = pd.cut(data['Rating'],
                    bins=[0, 2.0, 3.5, 5.0],
                    labels=['negative', 'neutral', 'positive'])

print("Class distribution before resampling:")
print(rating_bins.value_counts())

Class distribution before resampling:
Rating
positive    61276
neutral     13628
negative     2937
Name: count, dtype: int64


## word 2 vec

In [4]:
from gensim.models import Word2Vec
import numpy as np

# Assuming 'data' is your DataFrame and 'Cleaned_Review' is the column with tokenized text (as lists of words)
# If not tokenized, tokenize first:
data['tokens'] = data['Cleaned_Review'].apply(lambda x: x.split())

# Train Word2Vec model (or load a pre-trained one)
w2v_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=5, workers=4)

def document_vector(word_list):
    # Remove out-of-vocabulary words
    word_list = [word for word in word_list if word in w2v_model.wv.index_to_key]
    if len(word_list) == 0:
        return np.zeros(100)
    return np.mean(w2v_model.wv[word_list], axis=0)

    

In [5]:
w2v_model.wv.index_to_key

['album',
 'song',
 'one',
 'like',
 'de',
 'track',
 'time',
 'music',
 'sound',
 'really',
 'best',
 'would',
 'la',
 '!',
 'great',
 'good',
 'que',
 '?',
 'band',
 'rock',
 'even',
 'first',
 'much',
 'get',
 'record',
 'still',
 'make',
 'love',
 'guitar',
 'way',
 'ever',
 'e',
 'feel',
 'well',
 'think',
 'thing',
 'also',
 'listen',
 'come',
 'en',
 'could',
 'know',
 'lyric',
 'un',
 'vocal',
 'say',
 'never',
 'something',
 'year',
 'work',
 'every',
 'el',
 'many',
 'two',
 'though',
 'go',
 'favorite',
 'better',
 'metal',
 'life',
 'people',
 'back',
 'heard',
 'lot',
 'end',
 'part',
 'classic',
 'pretty',
 'little',
 'le',
 'take',
 'going',
 'bit',
 'side',
 'another',
 'new',
 'long',
 'made',
 'day',
 'minute',
 'probably',
 'man',
 'got',
 'kind',
 'perfect',
 'quite',
 'solo',
 'always',
 'pop',
 'give',
 'point',
 'want',
 'blue',
 'see',
 'whole',
 'hard',
 'listening',
 'world',
 'almost',
 'moment',
 'start',
 'second',
 'yet',
 'jazz',
 'right',
 'fan',
 'una',

In [None]:
# X_w2v = data['Cleaned_Review'].apply(lambda x: document_vector(x.split()))

In [None]:
# X_w2v
# #save in a csv
# # Convert the Series of numpy arrays to a DataFrame
# X_w2v_df = X_w2v.apply(pd.Series)
# # Optionally, add the index or an identifier if needed
# X_w2v_df.to_csv('./dataset/X_w2v_vectors.csv', index=False)


## splitting

In [43]:
y = data['Rating']

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

## Model Training

In [45]:
from sklearn.linear_model import Ridge
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import mean_squared_error, r2_score

# Higher weights for rare ratings

weights = compute_sample_weight(
    class_weight={
        'negative': 10.0,
        'neutral': 3.0,
        'positive': 1.0
    },
    y=rating_bins
)

model = Ridge(alpha=1.0)
model.fit(X,y,sample_weight=weights)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

print(f"MSE: {mse}")
print(f"R2: {r2}")


MSE: 0.16347329822553325
R2: 0.7817825154835283


## hide

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
# from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
# from sklearn.svm import SVR
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor

In [22]:
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(alpha=1.0,solver='lsqr')
    # 'RandomForestRegressor':RandomForestRegressor(),
    # 'GradientBoostingRegressor':GradientBoostingRegressor(),
    # 'SVR':SVR(),
    # 'KNeighborsRegressor':KNeighborsRegressor(),
    # 'DecisionTreeRegressor':DecisionTreeRegressor(),
    # 'XGBRegressor':XGBRegressor(),
    # 'LGBMRegressor':LGBMRegressor(),
    # 'CatBoostRegressor':CatBoostRegressor()
}

In [14]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_absolute_percentage_error
metrics_list = []

for model in models.values():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    mape = mean_absolute_percentage_error(y_test,y_pred)
    metrics_list.append({
        'Model':model.__class__.__name__,
        'MSE':mse,
        'RMSE':rmse,
        'R2':r2,
        'MAE':mae,
        'MAPE':mape})
        
metrics_df = pd.DataFrame(metrics_list)


In [15]:
metrics_df

Unnamed: 0,Model,MSE,RMSE,R2,MAE,MAPE
0,LinearRegression,0.817187,0.903984,-0.090847,0.690256,0.216402
1,Lasso,0.749184,0.865554,-7.1e-05,0.672071,0.238454
2,Ridge,0.476026,0.689946,0.364562,0.51005,0.177092


## testing

In [47]:
# Get user input
text = input("Enter review: ")

# Clean the review text
clean_data = preprocess_text(text)  # This should return a cleaned string

# Vectorize using the already trained vectorizer (do NOT use fit_transform)
X = tfidf_vectorizer.transform([clean_data])  # Wrap in a list to avoid error

# Predict using the trained model
predicted_rating = model.predict(X)

# Output the result
print(f"The predicted rating for the review '{text}' is: {predicted_rating[0]:.2f}")

The predicted rating for the review 'mid' is: 3.15


In [51]:
def predict_rating(review):
    # Preprocess
    processed_review = preprocess_text(review)

    # Transform
    review_vector = tfidf_vectorizer.transform([processed_review])

    # Predict
    rating = model.predict(review_vector)[0]

    # Ensure rating is within original bounds
    min_rating, max_rating = data['Rating'].min(), data['Rating'].max()
    rating = np.clip(rating, min_rating, max_rating)

    return f"Predicted rating: {rating:.2f} (scale: {min_rating}-{max_rating})"

# Test cases
test_reviews = [
    "The album was a masterpiece from start to finish",
    "Some good tracks but overall disappointing",
    "The vocals were amazing, though the production quality ruined it",
    "Mediocre at best - nothing special",
    "This album changed my life! Perfect in every way",
    "This album was the worst thing I heard in my life, Death to the artist and the producer, disgusting, awful, bad , waste of time",
    "A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.",
    "Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.",
    "The production is lush and detailed, but the lyrics are pretentious and the melodies forgettable, making for a frustrating listen."
]
# Additional test cases expected to be below 1 rating
test_reviews += [
    "Absolutely terrible album, not a single redeeming quality. I regret listening to it.",
    "Horrible in every way, the worst music I've ever heard.",
    "Unbearable noise, couldn't finish a single track.",
    "A complete disaster, avoid at all costs.",
    "Painful to listen to, a total waste of time.",
    "Zero talent, zero effort, zero enjoyment.",
    "This album is an insult to music.",
    "If I could give it a zero, I would.",
    "The most disappointing and awful release of the year.",
    "Disgusting, offensive, and unlistenable."
]

for review in test_reviews:
    print(f"\nReview: {review}")
    print(predict_rating(review))


Review: The album was a masterpiece from start to finish
Predicted rating: 4.98 (scale: 0.5-5.0)

Review: Some good tracks but overall disappointing
Predicted rating: 3.13 (scale: 0.5-5.0)

Review: The vocals were amazing, though the production quality ruined it
Predicted rating: 3.16 (scale: 0.5-5.0)

Review: Mediocre at best - nothing special
Predicted rating: 2.73 (scale: 0.5-5.0)

Review: This album changed my life! Perfect in every way
Predicted rating: 5.00 (scale: 0.5-5.0)

Review: This album was the worst thing I heard in my life, Death to the artist and the producer, disgusting, awful, bad , waste of time
Predicted rating: 1.05 (scale: 0.5-5.0)

Review: A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.
Predicted rating: 3.20 (scale: 0.5-5.0)

Review: Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.
Predicted ra