## Exploraty Data Analysis

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt_tab')

In [2]:
df = pd.read_csv('./dataset/music_album_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,i think i actually under-rate ok computer if a...,5.0
1,i get why radiohead rub a lot of people the wr...,5.0
2,i would like to think i am good about not lett...,4.5
3,there are radiohead devotees like there were o...,4.0
4,i wrote a shining excellent review for this al...,5.0


In [8]:
print(f"Dataset size: {len(df)}")
print(f"Rating distribution:\n{df['Rating'].value_counts()}")

Dataset size: 80271
Rating distribution:
Rating
5.0    29534
4.5    17793
4.0    14213
3.5     7048
3.0     4430
2.5     2210
2.0     1396
1.5      640
1.0      525
0.5      398
Name: count, dtype: int64


In [9]:
df.isna().sum()

Review      26
Rating    2084
dtype: int64

In [10]:
#drop null values
df.dropna(inplace=True)

In [4]:
#function which tokenizes the text,lowercase the text, remove stopwords, and lemmatize the text 
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Keep !? for sentiment
    words = word_tokenize(text)
    stop_words = list(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [12]:
df['Cleaned_Review'] = df['Review'].apply(preprocess_text)
df.head()

Unnamed: 0,Review,Rating,Cleaned_Review
0,i think i actually under-rate ok computer if a...,5.0,think actually underrate ok computer anything ...
1,i get why radiohead rub a lot of people the wr...,5.0,get radiohead rub lot people wrong way lot peo...
2,i would like to think i am good about not lett...,4.5,would like think good letting wider critical w...
3,there are radiohead devotees like there were o...,4.0,radiohead devotee like bowie devotee find unex...
4,i wrote a shining excellent review for this al...,5.0,wrote shining excellent review album browser w...


In [35]:
(df['Review'] == '').sum()

0

In [36]:
(df['Cleaned_Review'] == '').sum()

321

In [37]:
#remove the empty strings in cleaned_review column
df = df[df['Cleaned_Review'] != '']

In [38]:
(df['Cleaned_Review'] == '').sum()

0

In [2]:
cleaned_csv = df.to_csv('./dataset/cleaned_music_reviews.csv',index=False)

NameError: name 'df' is not defined

In [2]:
data = pd.read_csv('./dataset/cleaned_music_reviews.csv')
data.isna().sum()

Review            0
Rating            0
Cleaned_Review    0
dtype: int64

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    max_df=0.8,
    min_df=5,
)

X = tfidf_vectorizer.fit_transform(data['Cleaned_Review'])
vocab = tfidf_vectorizer.get_feature_names_out()


In [4]:
from gensim.models import Word2Vec
import numpy as np

# Assuming 'data' is your DataFrame and 'Cleaned_Review' is the column with tokenized text (as lists of words)
# If not tokenized, tokenize first:
data['tokens'] = data['Cleaned_Review'].apply(lambda x: x.split())

# Train Word2Vec model (or load a pre-trained one)
w2v_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=5, workers=4)

def document_vector(word_list):
    # Remove out-of-vocabulary words
    word_list = [word for word in word_list if word in w2v_model.wv.index_to_key]
    if len(word_list) == 0:
        return np.zeros(100)
    return np.mean(w2v_model.wv[word_list], axis=0)

    

In [5]:
w2v_model.wv.index_to_key

['album',
 'song',
 'one',
 'like',
 'de',
 'track',
 'time',
 'music',
 'sound',
 'really',
 'best',
 'would',
 'la',
 '!',
 'great',
 'good',
 'que',
 '?',
 'band',
 'rock',
 'even',
 'first',
 'much',
 'get',
 'record',
 'still',
 'make',
 'love',
 'guitar',
 'way',
 'ever',
 'e',
 'feel',
 'well',
 'think',
 'thing',
 'also',
 'listen',
 'come',
 'en',
 'could',
 'know',
 'lyric',
 'un',
 'vocal',
 'say',
 'never',
 'something',
 'year',
 'work',
 'every',
 'el',
 'many',
 'two',
 'though',
 'go',
 'favorite',
 'better',
 'metal',
 'life',
 'people',
 'back',
 'heard',
 'lot',
 'end',
 'part',
 'classic',
 'pretty',
 'little',
 'le',
 'take',
 'going',
 'bit',
 'side',
 'another',
 'new',
 'long',
 'made',
 'day',
 'minute',
 'probably',
 'man',
 'got',
 'kind',
 'perfect',
 'quite',
 'solo',
 'always',
 'pop',
 'give',
 'point',
 'want',
 'blue',
 'see',
 'whole',
 'hard',
 'listening',
 'world',
 'almost',
 'moment',
 'start',
 'second',
 'yet',
 'jazz',
 'right',
 'fan',
 'una',

In [6]:
X_w2v = data['Cleaned_Review'].apply(lambda x: document_vector(x.split()))

KeyboardInterrupt: 

In [None]:
X_w2v
#save in a csv
# Convert the Series of numpy arrays to a DataFrame
X_w2v_df = X_w2v.apply(pd.Series)
# Optionally, add the index or an identifier if needed
X_w2v_df.to_csv('./dataset/X_w2v_vectors.csv', index=False)


In [9]:
y = data['Rating']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

## Model Training

In [7]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [11]:
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(alpha=1.0),
    # 'RandomForestRegressor':RandomForestRegressor(),
    # 'GradientBoostingRegressor':GradientBoostingRegressor(),
    # 'SVR':SVR(),
    # 'KNeighborsRegressor':KNeighborsRegressor(),
    # 'DecisionTreeRegressor':DecisionTreeRegressor(),
    # 'XGBRegressor':XGBRegressor(),
    # 'LGBMRegressor':LGBMRegressor(),
    # 'CatBoostRegressor':CatBoostRegressor()
}

In [12]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_absolute_percentage_error
metrics_list = []

for model in models.values():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    mape = mean_absolute_percentage_error(y_test,y_pred)
    metrics_list.append({
        'Model':model.__class__.__name__,
        'MSE':mse,
        'RMSE':rmse,
        'R2':r2,
        'MAE':mae,
        'MAPE':mape})
        
metrics_df = pd.DataFrame(metrics_list)


In [13]:
metrics_df

Unnamed: 0,Model,MSE,RMSE,R2,MAE,MAPE
0,LinearRegression,0.549724,0.741434,0.266183,0.555106,0.186065
1,Lasso,0.749184,0.865554,-7.1e-05,0.672071,0.238454
2,Ridge,0.501959,0.708491,0.329944,0.525581,0.181037


In [None]:
models = {
    # 'RandomForestRegressor':RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoostingRegressor':GradientBoostingRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    # 'SVR':SVR()
}

In [None]:
for model in models.values():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    mape = mean_absolute_percentage_error(y_test,y_pred)
    metrics_list.append({
        'Model':model.__class__.__name__,
        'MSE':mse,
        'RMSE':rmse,
        'R2':r2,
        'MAE':mae,
        'MAPE':mape})
        
metrics_df = pd.DataFrame(metrics_list)


In [11]:
models = {
    'KNeighborsRegressor':KNeighborsRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
}

In [12]:
for model in models.values():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    mape = mean_absolute_percentage_error(y_test,y_pred)
    metrics_list.append({
        'Model':model.__class__.__name__,
        'MSE':mse,
        'RMSE':rmse,
        'R2':r2,
        'MAE':mae,
        'MAPE':mape})
        
metrics_df = pd.DataFrame(metrics_list)


In [13]:
metrics_df

Unnamed: 0,Model,MSE,RMSE,R2,MAE,MAPE
0,LinearRegression,0.549724,0.741434,0.266183,0.555106,0.186065
1,Lasso,0.749184,0.865554,-7.1e-05,0.672071,0.238454
2,Ridge,0.501959,0.708491,0.329944,0.525581,0.181037
3,KNeighborsRegressor,0.874224,0.934999,-0.166985,0.694065,0.245467
4,DecisionTreeRegressor,1.096813,1.047289,-0.464116,0.713785,0.238664


In [None]:
models = {
    'XGBRegressor':XGBRegressor(),
    'LGBMRegressor':LGBMRegressor(),
    # 'CatBoostRegressor':CatBoostRegressor()
}

In [16]:
for model in models.values():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    mape = mean_absolute_percentage_error(y_test,y_pred)
    metrics_list.append({
        'Model':model.__class__.__name__,
        'MSE':mse,
        'RMSE':rmse,
        'R2':r2,
        'MAE':mae,
        'MAPE':mape})
        
metrics_df = pd.DataFrame(metrics_list)


KeyboardInterrupt: 

In [14]:
from sklearn.linear_model import Ridge

model = Ridge()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

print(f"MSE: {mse}")
print(f"R2: {r2}")


MSE: 0.5019594967014778
R2: 0.3299435449804736


In [15]:
# Get user input
text = input("Enter review: ")

# Clean the review text
clean_data = preprocess_text(text)  # This should return a cleaned string

# Vectorize using the already trained vectorizer (do NOT use fit_transform)
X = tfidf_vectorizer.transform([clean_data])  # Wrap in a list to avoid error

# Predict using the trained model
predicted_rating = model.predict(X)

# Output the result
print(f"The predicted rating for the review '{text}' is: {predicted_rating[0]:.2f}")

NameError: name 'preprocess_text' is not defined

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression

# Enhanced pipeline with feature selection
pipeline = make_pipeline(
    StandardScaler(with_mean=False),  # Important for sparse matrices
    SelectKBest(f_regression, k=5000),
    GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("\nEnhanced Pipeline Performance:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"R2: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")


Enhanced Pipeline Performance:
MSE: 0.5941
MAE: 0.5827
R2: 0.2069
RMSE: 0.7708
