# Exploratory Data Analysis

In [3]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
# Read the raw album reviews from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='./dataset/music_album_reviews.csv')
df.head(n=5)

Unnamed: 0,Review,Rating
0,i think i actually under-rate ok computer if a...,5.0
1,i get why radiohead rub a lot of people the wr...,5.0
2,i would like to think i am good about not lett...,4.5
3,there are radiohead devotees like there were o...,4.0
4,i wrote a shining excellent review for this al...,5.0


In [8]:
# Report the number of rows and how many reviews carry each rating value.
# value_counts() leaves out missing ratings, so the counts can sum to less
# than the dataset size.
print(f"Dataset size: {len(df)}")
print(f"Rating distribution:\n{df['Rating'].value_counts()}")

Dataset size: 80271
Rating distribution:
Rating
5.0    29534
4.5    17793
4.0    14213
3.5     7048
3.0     4430
2.5     2210
2.0     1396
1.5      640
1.0      525
0.5      398
Name: count, dtype: int64


In [9]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

Review      26
Rating    2084
dtype: int64

In [10]:
# Drop every row that has a missing value in any column.
# Reassigning the result (instead of inplace=True) leaves the originally
# loaded object untouched and follows the usual pandas idiom.
df = df.dropna()

In [11]:
# Built once, when this cell runs: reloading the stop-word corpus and creating
# a new lemmatizer on every call is wasted work when the function is applied
# row by row. A frozenset also gives constant-time membership tests, where the
# original list was scanned once per token.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text; delete every character that is not a
    letter, whitespace, ``!`` or ``?``; tokenize; drop English stop words;
    lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. The result is an empty
        string when nothing survives cleaning, or when ``text`` is not a
        string (for example a missing value).
    """
    if not isinstance(text, str):
        # Missing or non-text values have nothing to clean.
        return ''

    text = text.lower()
    # Characters are deleted, not replaced by a space, so hyphenated words
    # are fused together (e.g. "under-rate" -> "underrate").
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Keep !? for sentiment
    words = word_tokenize(text)
    words = [
        _LEMMATIZER.lemmatize(word)
        for word in words
        if word not in _STOP_WORDS
    ]
    return ' '.join(words)

In [12]:
# Clean every review element-wise and keep the result next to the raw text.
df['Cleaned_Review'] = df['Review'].map(preprocess_text)
df.head(n=5)

Unnamed: 0,Review,Rating,Cleaned_Review
0,i think i actually under-rate ok computer if a...,5.0,think actually underrate ok computer anything ...
1,i get why radiohead rub a lot of people the wr...,5.0,get radiohead rub lot people wrong way lot peo...
2,i would like to think i am good about not lett...,4.5,would like think good letting wider critical w...
3,there are radiohead devotees like there were o...,4.0,radiohead devotee like bowie devotee find unex...
4,i wrote a shining excellent review for this al...,5.0,wrote shining excellent review album browser w...


In [35]:
# Number of raw reviews that are exactly the empty string.
df['Review'].eq('').sum()

0

In [36]:
# Number of reviews left with no tokens at all after cleaning.
df['Cleaned_Review'].eq('').sum()

321

In [37]:
# Keep only the rows whose cleaned review still contains some text.
has_text = df['Cleaned_Review'].ne('')
df = df.loc[has_text]

In [38]:
# Sanity check: count the cleaned reviews that are still empty.
df['Cleaned_Review'].eq('').sum()

0

In [2]:
# Persist the cleaned frame (without the index) so later cells can reload it.
# to_csv returns None when it is given a path, so the result is not bound to a
# name.
df.to_csv('./dataset/cleaned_music_reviews.csv', index=False)

NameError: name 'df' is not defined

In [4]:
# Reload the cleaned reviews from disk and confirm no column has missing values.
data = pd.read_csv(filepath_or_buffer='./dataset/cleaned_music_reviews.csv')
data.isnull().sum()

Review            0
Rating            0
Cleaned_Review    0
dtype: int64

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams. Terms found in more than 80% of the
# reviews or in fewer than 5 reviews are ignored, and the vocabulary is capped
# at 10,000 features.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the vocabulary and IDF weights also see the held-out reviews.
# Consider fitting on the training rows only.
tfidf_settings = dict(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    max_features=10000,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

X = tfidf_vectorizer.fit_transform(data['Cleaned_Review'])
vocab = tfidf_vectorizer.get_feature_names_out()


In [7]:
# The regression target is the review rating; also save it on its own.
y = data.loc[:, 'Rating']
y.to_csv(path_or_buf='./dataset/y.csv', index=False)

In [11]:
import pandas as pd

# Wrap the TF-IDF matrix in a sparse-backed DataFrame instead of calling
# X.toarray(). A dense copy holds one value for every (review, feature) pair,
# with up to 10,000 features per review, and almost all of those values are
# zero. The frame keeps the default integer column labels.
# NOTE(review): the intermediate dense array is no longer created. If code
# outside this notebook section relied on it, build it there explicitly.
X_df = pd.DataFrame.sparse.from_spmatrix(X)

In [None]:
# Write the feature frame to disk without the row index.
X_df.to_csv(path_or_buf='./dataset/X.csv', index=False)


In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

## Model Training

In [46]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [47]:
# First batch: the three linear regressors with default hyperparameters, each
# keyed by its class name.
# NOTE(review): in the recorded run Lasso at its defaults scored R2 of about 0.
# Its regularisation strength (alpha) probably needs tuning.
models = {
    estimator.__name__: estimator()
    for estimator in (LinearRegression, Lasso, Ridge)
}

In [48]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

# Start a fresh results table: one row of hold-out metrics per fitted model.
metrics_list = []

for model in models.values():
    # Train on the training split, then score predictions on the test split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    metrics_list.append(
        {
            'Model': type(model).__name__,
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'MAPE': mean_absolute_percentage_error(y_test, y_pred),
        }
    )

metrics_df = pd.DataFrame(metrics_list)


In [49]:
metrics_df

Unnamed: 0,Model,MSE,RMSE,R2,MAE,MAPE
0,LinearRegression,0.549724,0.741434,0.266183,0.555106,0.186065
1,Lasso,0.749184,0.865554,-7.1e-05,0.672071,0.238454
2,Ridge,0.501959,0.708491,0.329944,0.525581,0.181037


In [50]:
# Second batch: two tree ensembles and a support-vector regressor.
# The ensembles are seeded with the same value used for the train/test split,
# so their scores are reproducible from run to run.
models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
}

In [None]:
# Fit and score every estimator in the current `models` batch on the hold-out
# split, adding one row of metrics per model to the shared results list.
# The list is created by an earlier cell and only appended to here, so rows
# already recorded for these model classes are dropped first. Re-running the
# cell then replaces them instead of duplicating them.
batch_names = {model.__class__.__name__ for model in models.values()}
metrics_list = [row for row in metrics_list if row['Model'] not in batch_names]

for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    metrics_list.append({
        'Model': model.__class__.__name__,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAE': mae,
        'MAPE': mape})

# Rebuild the comparison table from all rows collected so far.
metrics_df = pd.DataFrame(metrics_list)


In [None]:
# Third batch: a nearest-neighbour regressor and a single decision tree.
# The tree is seeded with the same value used for the train/test split, so its
# score is reproducible from run to run.
models = {
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
}

In [None]:
# Fit and score every estimator in the current `models` batch on the hold-out
# split, adding one row of metrics per model to the shared results list.
# The list is created by an earlier cell and only appended to here, so rows
# already recorded for these model classes are dropped first. Re-running the
# cell then replaces them instead of duplicating them.
batch_names = {model.__class__.__name__ for model in models.values()}
metrics_list = [row for row in metrics_list if row['Model'] not in batch_names]

for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    metrics_list.append({
        'Model': model.__class__.__name__,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAE': mae,
        'MAPE': mape})

# Rebuild the comparison table from all rows collected so far.
metrics_df = pd.DataFrame(metrics_list)


In [None]:
# Fourth batch: the three gradient-boosting libraries with default
# hyperparameters, each keyed by its class name.
models = {
    estimator.__name__: estimator()
    for estimator in (XGBRegressor, LGBMRegressor, CatBoostRegressor)
}

In [None]:
# Fit and score every estimator in the current `models` batch on the hold-out
# split, adding one row of metrics per model to the shared results list.
# The list is created by an earlier cell and only appended to here, so rows
# already recorded for these model classes are dropped first. Re-running the
# cell then replaces them instead of duplicating them.
batch_names = {model.__class__.__name__ for model in models.values()}
metrics_list = [row for row in metrics_list if row['Model'] not in batch_names]

for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    metrics_list.append({
        'Model': model.__class__.__name__,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAE': mae,
        'MAPE': mape})

# Rebuild the comparison table from all rows collected so far.
metrics_df = pd.DataFrame(metrics_list)
