In [4]:
!pip install numPy pandas matplotlib seaborn scikit-learn




In [5]:
import os, re, string, time, gc, warnings, pickle
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path

from scipy import sparse
from collections import defaultdict, Counter

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, RidgeClassifierCV, RidgeClassifier
from sklearn.naive_bayes import ComplementNB

In [6]:
# Input locations: every CSV lives directly under BASE_DIR.
BASE_DIR = Path("data")
TRAIN_FILE, TEST_FILE, SAMPLE_SUB = (
    BASE_DIR / filename
    for filename in ("train.csv", "test.csv", "sample_submission.csv")
)



In [7]:
# Read the three CSV inputs (train, test, sample submission) in that order.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE, SAMPLE_SUB)
)

In [8]:
# Preview the first rows of the training frame to check its columns and values.
display(df_train.head())


Unnamed: 0,id,reviewerID,album_mbid,artist_mbid,unixReviewTime,VotedHelpful,TotalVotes,summary,reviewText,genres,Score
0,1,A0001624UKLQG4OFIM8X,B000002KIC,8c90ad8c-9150-4c51-a1eb-342232e99d06,1361059200,0,0,very good listening,Ive liked the band since first heard them. Fig...,"Folk Rock,Country Rock,Country,Rock,Pop,Singer...",5.0
1,2,A00082583JGF0RURTDN8A,B000007T1M,cc0b7089-c08d-4c10-b6b0-873582c17fd6,1393632000,0,0,Best album ever!!!!,I love this album sents it came out!!! This is...,"Alternative Metal,Metal,Pop Metal,Pop,Rock",5.0
2,3,A00162161QSZVJYMHX0T4,B0000001T0,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,1402358400,0,0,"A great Album , good seller",Bought this used. An awesome country rock albu...,"Smooth Jazz,Jazz,Pop,Jazz Fusion",5.0
3,4,A00162161QSZVJYMHX0T4,B0000001UU,0c361ea5-98c6-4947-900b-201833f2dd84,1402358400,0,0,Larry and Lee = a future Classic!,This album is sure to become a future classic....,"Smooth Jazz,Jazz,Pop,Easy Listening",
4,5,A00162161QSZVJYMHX0T4,B0000001SB,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,1402358400,0,0,Wow! Where be Mosada?,"I heard this album a few times on youtube.com,...","Adult Contemporary,Jazz Fusion,Smooth Jazz,Jaz...",5.0


In [9]:
# Preview the first rows of the test frame to check its columns and values.
display(df_test.head())


Unnamed: 0,id,Score
0,4,
1,6,
2,15,
3,26,
4,31,


In [10]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the report.
df_train.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,447583.0,,,,223792.0,129206.227105,1.0,111896.5,223792.0,335687.5,447583.0
reviewerID,447583.0,276853.0,A9Q28YTLYREO7,699.0,,,,,,,
album_mbid,447583.0,31471.0,B0000AGWEC,1958.0,,,,,,,
artist_mbid,447583.0,12934.0,89ad4ac3-39f7-470e-963a-56509c546377,17575.0,,,,,,,
unixReviewTime,447583.0,,,,1162858668.844885,137163010.809733,874800000.0,1056585600.0,1148688000.0,1269388800.0,1406073600.0
VotedHelpful,447583.0,,,,2.663251,6.990395,0.0,0.0,1.0,3.0,724.0
TotalVotes,447583.0,,,,3.964116,8.821499,0.0,0.0,2.0,4.0,827.0
summary,447545.0,368718.0,Great CD,1207.0,,,,,,,
reviewText,447568.0,447405.0,sweat was a pretty good cd from nelly. that ha...,14.0,,,,,,,
genres,447583.0,12903.0,"Pop,Rock",4971.0,,,,,,,


In [11]:
# Target Distribution

# Tally each rating once (missing scores are excluded by value_counts), then
# derive every rating's share of the labelled rows from those same tallies.
score_overview = (
    df_train["Score"]
    .value_counts()
    .sort_index()
    .to_frame("count")
    .assign(proportion=lambda tally: tally["count"] / tally["count"].sum())
)
score_overview


Unnamed: 0_level_0,count,proportion
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,39353,0.107059
2.0,23476,0.063866
3.0,44933,0.122239
4.0,80708,0.219564
5.0,179113,0.487272


In [12]:
# Missing Values
# Share of missing values per column (a fraction between 0 and 1, despite the
# "_pct" suffix in the column names), largest first, for both frames.
missing_train, missing_test = (
    frame.isna().mean().sort_values(ascending=False)
    for frame in (df_train, df_test)
)

# Columns that exist in only one frame come out as NaN when the two Series are
# combined; report those as 0.
missing_df = pd.DataFrame(
    {"missing_train_pct": missing_train, "missing_test_pct": missing_test}
).fillna(0)

missing_df.head(15)


Unnamed: 0,missing_train_pct,missing_test_pct
Score,0.178738,1.0
TotalVotes,0.0,0.0
VotedHelpful,0.0,0.0
album_mbid,0.0,0.0
artist_mbid,0.0,0.0
genres,0.0,0.0
id,0.0,0.0
reviewText,3.4e-05,0.0
reviewerID,0.0,0.0
summary,8.5e-05,0.0


In [13]:
# Helpful Votes

def helpful_ratio(df):
    """Return VotedHelpful / TotalVotes for each row of ``df``.

    Rows whose TotalVotes is 0 yield NaN instead of a division by zero.
    If either vote column is absent, an empty float Series is returned.
    """
    required = {"VotedHelpful", "TotalVotes"}
    if not required.issubset(df.columns):
        return pd.Series(dtype=float)
    total_votes = df["TotalVotes"]
    # Blank out zero totals so the division produces NaN for those rows.
    safe_total = total_votes.mask(total_votes == 0)
    return df["VotedHelpful"] / safe_total

train_help_ratio = helpful_ratio(df_train)
if not train_help_ratio.empty:
    # Only the last top-level expression of a cell is rendered automatically;
    # a bare expression inside an `if` body is evaluated and thrown away, so
    # the summary has to be displayed explicitly.
    display(train_help_ratio.describe(percentiles=[.25, .5, .75, .9, .95]))


In [14]:
# Text Field Exploration
# Whitespace-token counts for each free-text column, stored on df_train as
# "<column>_len" and summarised with the usual percentiles.
for col in ("summary", "reviewText"):
    if col not in df_train.columns:
        continue
    col_len = f"{col}_len"
    # Missing text is treated as an empty string, i.e. zero tokens.
    df_train[col_len] = df_train[col].fillna("").str.split().map(len)
    print(f"{col} - token length summary:")
    display(df_train[col_len].describe(percentiles=[.25, .5, .75, .9, .95]))


summary - token length summary:


count    447583.000000
mean          4.510529
std           2.764297
min           0.000000
25%           2.000000
50%           4.000000
75%           6.000000
90%           8.000000
95%          10.000000
max          32.000000
Name: summary_len, dtype: float64

reviewText - token length summary:


count    447583.000000
mean        128.942100
std         144.899165
min           0.000000
25%          41.000000
50%          84.000000
75%         160.000000
90%         283.000000
95%         396.000000
max        4972.000000
Name: reviewText_len, dtype: float64

In [15]:
# Genre Insights

if "genres" in df_train.columns:
    # Each row stores its genres as one comma-separated string.
    genres_split = df_train["genres"].fillna("").str.split(",")

    # Per-row number of genres, ignoring entries that are blank after trimming.
    df_train["genre_count"] = genres_split.apply(
        lambda parts: sum(1 for part in parts if part.strip())
    )

    # One row per (review, genre) pair, with blank entries removed.
    expanded = (
        genres_split
        .explode()
        .str.strip()
        .replace("", np.nan)
        .dropna()
    )
    print("Top 15 Genres:\n", expanded.value_counts().head(15))
    print("\nGenre count summary:")
    display(df_train["genre_count"].describe(percentiles=[.25, .5, .75, .9, .95]))


Top 15 Genres:
 genres
Pop                   415611
Rock                  286223
Alternative Rock      144209
Metal                  80187
Adult Alternative      67763
R&B                    60248
World Music            53492
Classic Rock           53259
Dance Pop              50753
Vocal Pop              49810
Folk                   49713
Indie & Lo-Fi          45265
Singer-Songwriters     44669
Dance & Electronic     44488
Adult Contemporary     41306
Name: count, dtype: int64

Genre count summary:


count    447583.000000
mean          6.765789
std           2.854074
min           1.000000
25%           5.000000
50%           6.000000
75%           8.000000
90%          10.000000
95%          12.000000
max          52.000000
Name: genre_count, dtype: float64

In [16]:
# Reviewer & Album Activity
def summarize_activity(frame, key, singular):
    """Display how many reviews each entity identified by ``key`` has.

    Prints a "<Singular> summary:" heading, displays a one-row table with the
    number of distinct entities, the median and the maximum review count, and
    then displays the ten most-reviewed entities.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with one row per review.
    key : str
        Column identifying the entity (for example "reviewerID").
    singular : str
        Lower-case singular label used in the heading and the column names
        (for example "reviewer").

    Returns
    -------
    pandas.Series
        Review count per entity, sorted from most to least reviewed.
    """
    freq = frame.groupby(key).size().sort_values(ascending=False)
    plural = f"{singular}s"
    print(f"{singular.capitalize()} summary:")
    display(pd.DataFrame({
        f"unique_{plural}": [freq.shape[0]],
        f"median_reviews_per_{singular}": [freq.median()],
        # freq is sorted descending, so the first entry is the maximum.
        f"max_reviews_per_{singular}": [freq.iloc[0]],
    }))
    display(freq.head(10))
    return freq


if "reviewerID" in df_train.columns:
    reviewer_freq = summarize_activity(df_train, "reviewerID", "reviewer")

if "album_mbid" in df_train.columns:
    album_freq = summarize_activity(df_train, "album_mbid", "album")


Reviewer summary:


Unnamed: 0,unique_reviewers,median_reviews_per_reviewer,max_reviews_per_reviewer
0,276853,1.0,699


reviewerID
A9Q28YTLYREO7     699
A8IFUOL8S9BZC     496
A2U49LUUY4IKQQ    441
A3HU0B9XUEVHIM    441
A2WQY1B8ZS7QRZ    369
A3KJ6JAZPH382D    333
A1J5KCZC8CMW9I    331
A3QS1EPDZTLPWS    318
A1RJD10TTI568L    299
A2X7NBUETXC19E    281
dtype: int64

Album summary:


Unnamed: 0,unique_albums,median_reviews_per_album,max_reviews_per_album
0,31471,4.0,1958


album_mbid
B0000AGWEC    1958
B000002UJQ    1569
B000BGR18W    1386
B0002GMSC0    1380
B000IY04RC     962
B000066HQC     815
B0000A0WKG     814
B00000JCOV     811
B0007WZUMO     703
B00004U9MS     675
dtype: int64

In [17]:
# Temporal Analysis
def add_datetime_features(df):
    """Add review_dt, review_year and review_month columns derived from unixReviewTime.

    The frame is modified in place and also returned. unixReviewTime is read
    as seconds since the Unix epoch; values that cannot be converted become
    NaT. If the column is absent, a notice is printed and the frame is
    returned unchanged.
    """
    if "unixReviewTime" in df.columns:
        stamps = pd.to_datetime(df["unixReviewTime"], unit="s", errors="coerce")
        df["review_dt"] = stamps
        df["review_year"] = stamps.dt.year
        df["review_month"] = stamps.dt.month
    else:
        print("Missing unixReviewTime column.")
    return df

# Enrich both frames (train first, then test); a frame without a
# unixReviewTime column is passed through unchanged with a printed notice.
df_train, df_test = (
    add_datetime_features(frame) for frame in (df_train, df_test)
)

print("Review year span:")
display(df_train["review_dt"].agg(["min", "max"]))


Missing unixReviewTime column.
Review year span:


min   1997-09-21
max   2014-07-23
Name: review_dt, dtype: datetime64[ns]

In [18]:
# Feature Engineering
# Work on a copy so the columns added below do not appear on df_train.
engineered = df_train.copy()

# Clean text: missing text/genre values become empty strings.
for tcol in ("summary", "reviewText", "genres"):
    if tcol in engineered.columns:
        engineered[tcol] = engineered[tcol].fillna("")

# Helpful ratios: missing vote counts count as 0, and rows whose ratio is
# undefined (no votes at all) get a ratio of 0.
for vote_col in ("TotalVotes", "VotedHelpful"):
    engineered[vote_col] = engineered[vote_col].fillna(0)
engineered["help_ratio"] = helpful_ratio(engineered).fillna(0)

# Log transforms of the vote counts.
for vote_col, log_col in (
    ("TotalVotes", "total_votes_log1p"),
    ("VotedHelpful", "voted_helpful_log1p"),
):
    engineered[log_col] = np.log1p(engineered[vote_col])

# Text signals: whitespace-token count of each text field and its log1p.
for text_col, len_col in (("reviewText", "review_len"), ("summary", "summary_len")):
    engineered[len_col] = engineered[text_col].str.split().map(len)
    engineered[f"{len_col}_log"] = np.log1p(engineered[len_col])

# Genre diversity: number of non-blank entries in the comma-separated list.
engineered["genre_count"] = engineered["genres"].str.split(",").apply(
    lambda parts: sum(1 for part in parts if part.strip())
)

print(f"Engineered dataset shape: {engineered.shape}")


Engineered dataset shape: (447583, 23)


In [19]:
# Model Preparation
from sklearn.model_selection import train_test_split

# Only rows that carry a Score can be used for fitting and validation.
data_ready = engineered.dropna(subset=["Score"]).copy()
data_ready["Score"] = data_ready["Score"].astype(int)

text_cols = ["summary", "reviewText", "genres"]
num_cols = [
    "review_year", "review_month",
    "TotalVotes", "VotedHelpful",
    "help_ratio", "total_votes_log1p", "voted_helpful_log1p",
    "review_len", "summary_len", "review_len_log", "summary_len_log",
    "genre_count",
]

X, y = data_ready[text_cols + num_cols], data_ready["Score"]

# 80/20 split, stratified so both parts keep the same rating distribution.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train size: {X_train.shape[0]}, Val size: {X_val.shape[0]}")


Train size: 294066, Val size: 73517


In [20]:
# Modeling Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Numeric branch: fill gaps with the column median, then scale each feature by
# its maximum absolute value.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MaxAbsScaler())
])

# One TF-IDF vocabulary per text column, plus the numeric branch; any column
# not listed here is dropped.
preproc = ColumnTransformer([
    ("tfidf_summary", TfidfVectorizer(max_features=1500, ngram_range=(1,2)), "summary"),
    ("tfidf_review", TfidfVectorizer(max_features=4000, ngram_range=(1,2), min_df=4), "reviewText"),
    ("tfidf_genre", TfidfVectorizer(max_features=400, ngram_range=(1,1)), "genres"),
    ("num", num_pipe, num_cols)
], remainder="drop", sparse_threshold=0.3)

# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases and scheduled for removal, and the lbfgs solver
# already fits a multinomial model for multi-class targets by default. Since
# the install cell does not pin a scikit-learn version, passing it could make
# this cell fail on a fresh environment.
model_pipe = Pipeline([
    ("prep", preproc),
    ("clf", LogisticRegression(solver="lbfgs",
                               max_iter=300))
])

# Fit on the training split only, then predict the held-out validation split.
y_pred = model_pipe.fit(X_train, y_train).predict(X_val)

print(f"Val Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"Val Macro F1: {f1_score(y_val, y_pred, average='macro'):.4f}")
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Val Accuracy: 0.6493
Val Macro F1: 0.4845

Classification Report:
               precision    recall  f1-score   support

           1       0.67      0.73      0.70      7870
           2       0.30      0.11      0.16      4695
           3       0.34      0.22      0.27      8987
           4       0.46      0.46      0.46     16142
           5       0.78      0.89      0.83     35823

    accuracy                           0.65     73517
   macro avg       0.51      0.48      0.48     73517
weighted avg       0.61      0.65      0.62     73517



In [21]:
# Hyperparameter Search
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Tune on a random sample of at most 60,000 labelled rows
# (presumably to keep the 18 fits affordable).
subset = data_ready.sample(n=min(60000, len(data_ready)), random_state=42)
X_cv, y_cv = subset[text_cols + num_cols], subset["Score"]

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# "clf__" routes each parameter to the "clf" step of model_pipe.
grid_params = {
    "clf__C": [0.5, 1.0, 2.0],
    "clf__class_weight": [None, "balanced"],
}

search = GridSearchCV(
    estimator=model_pipe,
    param_grid=grid_params,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_cv, y_cv)

print("Best parameters:", search.best_params_)
print("Best Macro F1 (CV):", search.best_score_)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters: {'clf__C': 0.5, 'clf__class_weight': 'balanced'}
Best Macro F1 (CV): 0.4706652926721282
Best parameters: {'clf__C': 0.5, 'clf__class_weight': 'balanced'}
Best Macro F1 (CV): 0.4706652926721282


In [22]:
# Final Training & Submission
# Fall back to the untuned pipeline when the grid-search cell was not run.
best_model = search.best_estimator_ if "search" in globals() else model_pipe

# Refit the chosen configuration on every labelled row before predicting.
best_model.fit(X, y)
unlabeled = engineered["Score"].isna()
X_test_final = engineered.loc[unlabeled, text_cols + num_cols]
test_preds = best_model.predict(X_test_final)

# Key the predictions by review id so they are matched to df_test by id rather
# than by row position; positional assignment would silently attach scores to
# the wrong ids if the two frames were ordered differently.
preds_by_id = pd.Series(test_preds, index=engineered.loc[unlabeled, "id"].to_numpy())
if not preds_by_id.index.is_unique:
    raise ValueError("Duplicate ids among unlabeled rows; cannot align predictions to df_test.")

submission = df_test[["id"]].copy()
submission["Score"] = submission["id"].map(preds_by_id)
if submission["Score"].isna().any():
    raise ValueError("Some df_test ids have no prediction among the unlabeled rows.")
submission["Score"] = submission["Score"].astype(int)
submission.to_csv("submission_final.csv", index=False)

print("submission_final.csv saved successfully!")
display(submission.head())


submission_final.csv saved successfully!


Unnamed: 0,id,Score
0,4,5
1,6,4
2,15,5
3,26,5
4,31,3


In [23]:
# Save Preprocessed Dataset
import pickle
# Persist the engineered frame (labelled and unlabelled rows alike) as a pickle.
# NOTE: only unpickle files from trusted sources; loading a pickle can run arbitrary code.
with open("processed_dataset.pkl", mode="wb") as sink:
    pickle.dump(engineered, sink)
print("Processed dataset saved as processed_dataset.pkl")


Processed dataset saved as processed_dataset.pkl
