## EDA

In [2]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load original data
original_df = pd.read_csv('./dataset/cleaned_music_reviews.csv') 
original_df['is_synthetic'] = False
synth_df = pd.read_csv('./dataset/synthetic_low_ratings_4000.csv')

In [4]:
# Create a new dataframe with 'Cleaned_Review' and 'Rating' from both original and synthetic data
cleaned_df = pd.concat([
    original_df[['Cleaned_Review', 'Rating']],
    synth_df[['Cleaned_Review', 'Rating']]
], ignore_index=True)

In [5]:
print(f"Dataset size: {len(cleaned_df)}")
print(f"Rating distribution:\n{cleaned_df['Rating'].value_counts()}")

Dataset size: 97841
Rating distribution:
Rating
5.0    29395
4.5    17728
4.0    14153
3.5     7011
2.5     6201
2.0     5387
1.5     4634
1.0     4521
3.0     4416
0.5     4395
Name: count, dtype: int64


In [7]:
cleaned_df.isna().sum()

Cleaned_Review    0
Rating            0
dtype: int64

In [8]:
#drop null values
cleaned_df.dropna(inplace=True)

In [9]:
(cleaned_df['Cleaned_Review'] == '').sum()

0

In [2]:
cleaned_csv = cleaned_df.to_csv('./dataset/cleaned_music_reviews2.csv',index=False)

NameError: name 'cleaned_df' is not defined

In [3]:
import pandas as pd
cleaned_df = pd.read_csv('./dataset/cleaned_music_reviews2.csv')
cleaned_df.shape

(97841, 2)

## TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    max_df=0.8,
    min_df=5,
    sublinear_tf=True 
)

X = tfidf_vectorizer.fit_transform(cleaned_df['Cleaned_Review'])
vocab = tfidf_vectorizer.get_feature_names_out()

In [7]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, just what I needed - another generic pop album!")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...


{'neg': 0.0, 'neu': 0.631, 'pos': 0.369, 'compound': 0.6239}

In [4]:
y = cleaned_df['Rating']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Higher weights for rare ratings)

model = Ridge(alpha=1.0)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

print(f"MSE: {mse}")
print(f"R2: {r2}")

MSE: 0.3880349426888212
R2: 0.7999400794691092


In [15]:
#function which tokenizes the text,lowercase the text, remove stopwords, and lemmatize the text 
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Keep !? for sentiment
    words = word_tokenize(text)
    stop_words = list(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [16]:
def predict_rating(review):
    # Preprocess
    processed_review = preprocess_text(review)

    # Transform
    review_vector = tfidf_vectorizer.transform([processed_review])
    # Predict
    rating = model.predict(review_vector)[0]

    # Ensure rating is within original bounds
    min_rating, max_rating = cleaned_df['Rating'].min(), cleaned_df['Rating'].max()
    rating = np.clip(rating, min_rating, max_rating)

    return f"Predicted rating: {rating:.2f} (scale: {min_rating}-{max_rating})"

# Test cases
test_reviews = [
    # Positive
    "The album was a masterpiece from start to finish",
    "This album changed my life! Perfect in every way",
    # Mixed
    "Some good tracks but overall disappointing",
    "The vocals were amazing, though the production quality ruined it",
    "A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.",
    "Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.",
    "The production is lush and detailed, but the lyrics are pretentious and the melodies forgettable, making for a frustrating listen.",
    "There are a few catchy songs, but most of the album is forgettable.",
    "The instrumentation is solid, but the songwriting leaves much to be desired.",
    "Some tracks are fantastic, others are just filler.",
    # Sarcastic
    "Wow, what an album. I totally needed another hour of generic pop songs in my life.",
    "Groundbreaking stuffΓÇöI've never heard such originality in a song called 'Love Tonight' before.",
    "If boredom was an art form, this album would be a masterpiece.",
    "Oh great, another autotuned ballad. Just what the world was missing.",
    "Truly inspiring how they managed to make every track sound exactly the same.",
    "I laughed, I cried, mostly because I couldn't believe I paid for this.",
    "This album really redefines the word 'mediocre'.",
    "So innovative, I almost didn't fall asleep halfway through.",
    # Negative
    "Mediocre at best - nothing special",
    "This album was the worst thing I heard in my life, Death to the artist and the producer, disgusting, awful, bad , waste of time",
    "Absolutely terrible album, not a single redeeming quality. I regret listening to it.",
    "Horrible in every way, the worst music I've ever heard.",
    "Unbearable noise, couldn't finish a single track.",
    "A complete disaster, avoid at all costs.",
    "Painful to listen to, a total waste of time.",
    "Zero talent, zero effort, zero enjoyment.",
    "This album is an insult to music.",
    "If I could give it a zero, I would.",
    "The most disappointing and awful release of the year.",
    "Disgusting, offensive, and unlistenable."
]

for review in test_reviews:
    print(f"\nReview: {review}")
    print(predict_rating(review))


Review: The album was a masterpiece from start to finish
Predicted rating: 4.98 (scale: 0.5-5.0)

Review: This album changed my life! Perfect in every way
Predicted rating: 5.00 (scale: 0.5-5.0)

Review: Some good tracks but overall disappointing
Predicted rating: 3.00 (scale: 0.5-5.0)

Review: The vocals were amazing, though the production quality ruined it
Predicted rating: 3.19 (scale: 0.5-5.0)

Review: A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.
Predicted rating: 3.50 (scale: 0.5-5.0)

Review: Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.
Predicted rating: 2.64 (scale: 0.5-5.0)

Review: The production is lush and detailed, but the lyrics are pretentious and the melodies forgettable, making for a frustrating listen.
Predicted rating: 2.85 (scale: 0.5-5.0)

Review: There are a few catchy songs, but most of th

In [17]:
# Get user input
text = input("Enter review: ")

# Clean the review text
clean_data = preprocess_text(text)  # This should return a cleaned string

# Vectorize using the already trained vectorizer (do NOT use fit_transform)
X = tfidf_vectorizer.transform([clean_data])  # Wrap in a list to avoid error

# Predict using the trained model
predicted_rating = model.predict(X)

# Output the result
print(f"The predicted rating for the review '{text}' is: {predicted_rating[0]:.2f}")

The predicted rating for the review 'bad' is: 2.05


In [7]:
import joblib

# Save trained models
joblib.dump(model, 'models/model_ridge.pkl')
joblib.dump(tfidf_vectorizer, 'models/tfidf.pkl')

['models/tfidf.pkl']

## Word2VEC

In [3]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize the cleaned reviews for Word2Vec training
tokenized_reviews = [review.split() for review in cleaned_df['Cleaned_Review']]

# Train a Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=5, workers=4, sg=1, seed=42)

# Function to get average word2vec embedding for a review
def get_review_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    else:
        return np.mean(vectors, axis=0)

# Create feature matrix using average word2vec embeddings
X_w2v = np.vstack([
    get_review_vector(tokens, w2v_model, w2v_model.vector_size)
    for tokens in tokenized_reviews
])

In [5]:
from sklearn.model_selection import train_test_split
y = cleaned_df['Rating']
# Use the Word2Vec feature matrix for train-test split
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42
)

In [7]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

ridge_w2v = Ridge(alpha=1.0)
ridge_w2v.fit(X_w2v_train, y_w2v_train)
y_w2v_pred = ridge_w2v.predict(X_w2v_test)

mse_w2v = mean_squared_error(y_w2v_test, y_w2v_pred)
r2_w2v = r2_score(y_w2v_test, y_w2v_pred)

print(f"Word2Vec Ridge MSE: {mse_w2v}")
print(f"Word2Vec Ridge R2: {r2_w2v}")

Word2Vec Ridge MSE: 0.5583304323041101
Word2Vec Ridge R2: 0.7121405068761708


In [22]:
#function which tokenizes the text,lowercase the text, remove stopwords, and lemmatize the text 
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Keep !? for sentiment
    words = word_tokenize(text)
    stop_words = list(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [9]:
import joblib

# Save trained models
joblib.dump(ridge_w2v, 'models/model_w2v.pkl')
joblib.dump(w2v_model, 'models/w2v.pkl')

['models/w2v.pkl']

In [23]:
def predict_rating(review):
    # Preprocess
    processed_review = preprocess_text(review)

    # Transform
    # Use Word2Vec average embedding instead of tf-idf
    tokens = processed_review.split()
    review_vector = get_review_vector(tokens, w2v_model, w2v_model.vector_size).reshape(1, -1)

    # Predict
    rating = ridge_w2v.predict(review_vector)[0]

    # Ensure rating is within original bounds
    min_rating, max_rating = cleaned_df['Rating'].min(), cleaned_df['Rating'].max()
    rating = np.clip(rating, min_rating, max_rating)

    return f"Predicted rating: {rating:.2f} (scale: {min_rating}-{max_rating})"

# Test cases
test_reviews = [
    # Positive
    "The album was a masterpiece from start to finish",
    "This album changed my life! Perfect in every way",
    # Mixed
    "Some good tracks but overall disappointing",
    "The vocals were amazing, though the production quality ruined it",
    "A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.",
    "Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.",
    "The production is lush and detailed, but the lyrics are pretentious and the melodies forgettable, making for a frustrating listen.",
    "There are a few catchy songs, but most of the album is forgettable.",
    "The instrumentation is solid, but the songwriting leaves much to be desired.",
    "Some tracks are fantastic, others are just filler.",
    # Sarcastic
    "Wow, what an album. I totally needed another hour of generic pop songs in my life.",
    "Groundbreaking stuffΓÇöI've never heard such originality in a song called 'Love Tonight' before.",
    "If boredom was an art form, this album would be a masterpiece.",
    "Oh great, another autotuned ballad. Just what the world was missing.",
    "Truly inspiring how they managed to make every track sound exactly the same.",
    "I laughed, I cried, mostly because I couldn't believe I paid for this.",
    "This album really redefines the word 'mediocre'.",
    "So innovative, I almost didn't fall asleep halfway through.",
    # Negative
    "Mediocre at best - nothing special",
    "This album was the worst thing I heard in my life, Death to the artist and the producer, disgusting, awful, bad , waste of time",
    "Absolutely terrible album, not a single redeeming quality. I regret listening to it.",
    "Horrible in every way, the worst music I've ever heard.",
    "Unbearable noise, couldn't finish a single track.",
    "A complete disaster, avoid at all costs.",
    "Painful to listen to, a total waste of time.",
    "Zero talent, zero effort, zero enjoyment.",
    "This album is an insult to music.",
    "If I could give it a zero, I would.",
    "The most disappointing and awful release of the year.",
    "Disgusting, offensive, and unlistenable."
]

for review in test_reviews:
    print(f"\nReview: {review}")
    print(predict_rating(review))


Review: The album was a masterpiece from start to finish
Predicted rating: 5.00 (scale: 0.5-5.0)

Review: This album changed my life! Perfect in every way
Predicted rating: 5.00 (scale: 0.5-5.0)

Review: Some good tracks but overall disappointing
Predicted rating: 2.92 (scale: 0.5-5.0)

Review: The vocals were amazing, though the production quality ruined it
Predicted rating: 2.47 (scale: 0.5-5.0)

Review: A genre-defying record that blends jazz, electronica, and rock seamlessly, though some tracks feel unnecessarily long and meandering.
Predicted rating: 3.88 (scale: 0.5-5.0)

Review: Despite the hype, the album lacks originality and feels like a rehash of the band's previous work, with only a few standout moments.
Predicted rating: 2.91 (scale: 0.5-5.0)

Review: The production is lush and detailed, but the lyrics are pretentious and the melodies forgettable, making for a frustrating listen.
Predicted rating: 2.93 (scale: 0.5-5.0)

Review: There are a few catchy songs, but most of th

In [24]:
# Get user input
text = input("Enter review: ")

# Clean the review text
clean_data = preprocess_text(text)  # This should return a cleaned string

# Vectorize using the already trained vectorizer (do NOT use fit_transform)
X = tfidf_vectorizer.transform([clean_data])  # Wrap in a list to avoid error

# Predict using the trained model
predicted_rating = model.predict(X)

# Output the result
print(f"The predicted rating for the review '{text}' is: {predicted_rating[0]:.2f}")

The predicted rating for the review 'hh' is: 3.64
