In [1]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
ratings_path = "dataset.csv"
movies_path  = "movieIdTitles.csv"

def robust_read(path):
    for sep in ['\t', ',', ';', '|']:
        try:
            df = pd.read_csv(path, sep=sep, engine='python')
            # if looks like MovieLens with 4 columns without header, set names
            if df.shape[1] == 4 and 'rating' not in df.columns:
                df.columns = ['userId','movieId','rating','timestamp']
            return df
        except Exception:
            pass
    return pd.read_csv(path)

print("Loading ratings...")
ratings = robust_read(ratings_path)
print("Ratings shape:", ratings.shape)
ratings.head()

Loading ratings...
Ratings shape: (100002, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,0,172,5,881250949
1,0,133,1,881250949
2,196,242,3,881250949
3,186,302,3,891717742
4,22,377,1,878887116


In [3]:
print("Columns:", ratings.columns.tolist())
print(ratings.info())
print("Rating value counts:")
print(ratings['rating'].value_counts())

if os.path.exists(movies_path):
    movies = robust_read(movies_path)
    # If two-column file (movieId,title) ensure names
    if movies.shape[1] == 2 and list(movies.columns) != ['movieId','title']:
        movies.columns = ['movieId','title']
    print("Movies shape:", movies.shape)
    display(movies.head())
else:
    movies = None
    print("movieIdTitles.csv not found.")


Columns: ['userId', 'movieId', 'rating', 'timestamp']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100002 entries, 0 to 100001
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   userId     100002 non-null  int64
 1   movieId    100002 non-null  int64
 2   rating     100002 non-null  int64
 3   timestamp  100002 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None
Rating value counts:
rating
4    34174
3    27145
5    21202
2    11370
1     6111
Name: count, dtype: int64
Movies shape: (1682, 1)


Unnamed: 0,"item_id,title"
0,"1,Toy Story (1995)"
1,"2,GoldenEye (1995)"
2,"3,Four Rooms (1995)"
3,"4,Get Shorty (1995)"
4,"5,Copycat (1995)"


In [4]:
import re
import numpy as np
import pandas as pd

df = ratings.copy()

if 'movies' not in globals() or movies is None:
    movies_path = "movieIdTitles.csv"
    if os.path.exists(movies_path):
        try:
            movies_try = pd.read_csv(movies_path, engine='python')
        except:
            movies_try = pd.read_csv(movies_path, header=None, engine='python')
        movies = movies_try.copy()
    else:
        movies = None

print("movies is None?:", movies is None)
if movies is not None:
    print("movies.columns:", list(movies.columns))
    display(movies.head(8))

if movies is not None:
    if 'movieId' in movies.columns and 'title' in movies.columns:
        pass
    else:
        if movies.shape[1] == 2:
            movies.columns = ['movieId', 'title']
            print("Renamed movies columns -> ['movieId','title']")
        else:
            id_col = None
            for c in movies.columns:
                try:
                    frac_numeric = pd.to_numeric(movies[c], errors='coerce').notna().mean()
                except:
                    frac_numeric = 0
                if frac_numeric > 0.9:
                    id_col = c
                    break
            title_col = None
            for c in movies.columns:
                s = movies[c].astype(str).str.contains(r'\(\d{4}\)', regex=True).mean()
                if s > 0.1: 
                    title_col = c
                    break
            if id_col is not None and title_col is not None:
                movies = movies.rename(columns={id_col: 'movieId', title_col: 'title'})
                print(f"Renamed columns detected: id_col={id_col}, title_col={title_col}")
            else:
                print("Could not confidently find movieId/title columns. Trying header=None read and heuristics...")
                movies2 = pd.read_csv(movies_path, header=None, engine='python')
                if movies2.shape[1] == 2:
                    movies2.columns = ['movieId','title']
                    movies = movies2
                    print("After header=None, renamed to ['movieId','title']")
                else:
                    movies.columns = [f"col{i}" for i in range(movies.shape[1])]
                    print("Movies columns renamed generically:", movies.columns.tolist())
    print("Final movies.columns:", list(movies.columns))
    display(movies.head(8))

if movies is not None and 'movieId' in movies.columns:
    try:
        df['movieId'] = df['movieId'].astype(int)
    except Exception:
        df['movieId'] = df['movieId'].astype(str).str.strip().astype(int, errors='ignore')
    try:
        movies['movieId'] = movies['movieId'].astype(int)
    except Exception:
        try:
            movies['movieId'] = pd.to_numeric(movies['movieId'], errors='coerce').astype('Int64')
        except:
            pass
    before = df.shape[0]
    df = df.merge(movies[['movieId','title']].drop_duplicates(), on='movieId', how='left')
    after = df.shape[0]
    print(f"Merged movies on movieId (rows before={before}, after={after}).")
else:
    print("No movies merge performed (movieId not found in movies).")

if 'timestamp' in df.columns:
    try:
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')
    except:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

def extract_year(title):
    if pd.isna(title): return np.nan
    m = re.search(r'\((\d{4})\)', str(title))
    return int(m.group(1)) if m else np.nan

if 'title' in df.columns:
    df['movie_year'] = df['title'].apply(extract_year)
    df['title_clean'] = df['title'].fillna('').str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
else:
    df['movie_year'] = np.nan
    df['title_clean'] = ''

user_stats = df.groupby('userId')['rating'].agg(user_count='count', user_mean='mean', user_std='std').reset_index()
df = df.merge(user_stats, on='userId', how='left')

movie_stats = df.groupby('movieId')['rating'].agg(movie_count='count', movie_mean='mean', movie_std='std').reset_index()
df = df.merge(movie_stats, on='movieId', how='left')

if 'timestamp' in df.columns:
    df['ts_year'] = df['timestamp'].dt.year
    df['ts_month'] = df['timestamp'].dt.month
    df['ts_dayofweek'] = df['timestamp'].dt.dayofweek
else:
    df['ts_year'] = np.nan; df['ts_month'] = np.nan; df['ts_dayofweek'] = np.nan

df['user_movie_mean_diff'] = df['rating'] - df['movie_mean']

df['user_std'] = df['user_std'].fillna(0)
df['movie_std'] = df['movie_std'].fillna(0)

print("Resulting df shape:", df.shape)
display(df.head(10))

df.to_csv("merged_ratings_movies.csv", index=False)
print("Merged dataset saved as merged_ratings_movies.csv")

movies is None?: False
movies.columns: ['item_id,title']


Unnamed: 0,"item_id,title"
0,"1,Toy Story (1995)"
1,"2,GoldenEye (1995)"
2,"3,Four Rooms (1995)"
3,"4,Get Shorty (1995)"
4,"5,Copycat (1995)"
5,"6,Shanghai Triad (Yao a yao yao dao waipo qiao..."
6,"7,Twelve Monkeys (1995)"
7,"8,Babe (1995)"


Could not confidently find movieId/title columns. Trying header=None read and heuristics...
After header=None, renamed to ['movieId','title']
Final movies.columns: ['movieId', 'title']


Unnamed: 0,movieId,title
0,item_id,title
1,1,Toy Story (1995)
2,2,GoldenEye (1995)
3,3,Four Rooms (1995)
4,4,Get Shorty (1995)
5,5,Copycat (1995)
6,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
7,7,Twelve Monkeys (1995)


Merged movies on movieId (rows before=100002, after=100002).
Resulting df shape: (100002, 17)


Unnamed: 0,userId,movieId,rating,timestamp,title,movie_year,title_clean,user_count,user_mean,user_std,movie_count,movie_mean,movie_std,ts_year,ts_month,ts_dayofweek,user_movie_mean_diff
0,0,172,5,1997-12-04 15:55:49,"Empire Strikes Back, The (1980)",1980.0,"Empire Strikes Back, The",2,3.0,2.828427,368,4.206522,0.922478,1997,12,3,0.793478
1,0,133,1,1997-12-04 15:55:49,Gone with the Wind (1939),1939.0,Gone with the Wind,2,3.0,2.828427,172,3.854651,1.085164,1997,12,3,-2.854651
2,196,242,3,1997-12-04 15:55:49,Kolya (1996),1996.0,Kolya,39,3.615385,1.016065,117,3.991453,0.995643,1997,12,3,-0.991453
3,186,302,3,1998-04-04 19:22:22,L.A. Confidential (1997),1997.0,L.A. Confidential,92,3.413043,1.223867,297,4.161616,0.854721,1998,4,5,-1.161616
4,22,377,1,1997-11-07 07:18:36,Heavyweights (1994),1994.0,Heavyweights,128,3.351562,1.493239,13,2.153846,1.068188,1997,11,4,-1.153846
5,244,51,2,1997-11-27 05:02:03,Legends of the Fall (1994),1994.0,Legends of the Fall,238,3.651261,1.071406,81,3.45679,1.162426,1997,11,3,-1.45679
6,166,346,1,1998-02-02 05:33:16,Jackie Brown (1997),1997.0,Jackie Brown,20,3.55,1.431782,126,3.642857,1.084172,1998,2,0,-2.642857
7,298,474,4,1998-01-07 14:20:06,Dr. Strangelove or: How I Learned to Stop Worr...,1963.0,Dr. Strangelove or: How I Learned to Stop Worr...,127,4.031496,0.815884,194,4.252577,0.853593,1998,1,2,-0.252577
8,115,265,2,1997-12-03 17:51:28,"Hunt for Red October, The (1990)",1990.0,"Hunt for Red October, The",92,3.934783,1.174868,227,3.863436,0.889383,1997,12,2,-1.863436
9,253,465,5,1998-04-03 18:34:27,"Jungle Book, The (1994)",1994.0,"Jungle Book, The",97,3.979381,0.957203,85,3.564706,1.06287,1998,4,4,1.435294


Merged dataset saved as merged_ratings_movies.csv


In [5]:
# Option A: multi-class rating (1..5)
target = 'rating'

# Option B (recommended if you want a binary problem): liked = rating >= 4
# Uncomment to use binary target
#df['liked'] = (df['rating'] >= 4).astype(int)
#target = 'liked'

print("Using target:", target)
print(df[target].value_counts())


Using target: rating
rating
4    34174
3    27145
5    21202
2    11370
1     6111
Name: count, dtype: int64


In [6]:
features = [
    'user_count','user_mean','user_std',
    'movie_count','movie_mean','movie_std',
    'movie_year','ts_year','ts_month','ts_dayofweek',
    'user_movie_mean_diff','userId','movieId'
]
features = [f for f in features if f in df.columns]
X = df[features].copy()
y = df[target].copy()

print("X shape:", X.shape, "y shape:", y.shape)
X.head()


X shape: (100002, 13) y shape: (100002,)


Unnamed: 0,user_count,user_mean,user_std,movie_count,movie_mean,movie_std,movie_year,ts_year,ts_month,ts_dayofweek,user_movie_mean_diff,userId,movieId
0,2,3.0,2.828427,368,4.206522,0.922478,1980.0,1997,12,3,0.793478,0,172
1,2,3.0,2.828427,172,3.854651,1.085164,1939.0,1997,12,3,-2.854651,0,133
2,39,3.615385,1.016065,117,3.991453,0.995643,1996.0,1997,12,3,-0.991453,196,242
3,92,3.413043,1.223867,297,4.161616,0.854721,1997.0,1998,4,5,-1.161616,186,302
4,128,3.351562,1.493239,13,2.153846,1.068188,1994.0,1997,11,4,-1.153846,22,377


In [8]:
cat_cols = [c for c in ['userId','movieId'] if c in X.columns]
num_cols = [c for c in X.columns if c not in cat_cols]

from sklearn.preprocessing import OrdinalEncoder
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
high_card_tr = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('high', high_card_tr, cat_cols)
], remainder='drop')

print("Numeric columns:", num_cols)
print("Categorical columns (treated ordinal):", cat_cols)


Numeric columns: ['user_count', 'user_mean', 'user_std', 'movie_count', 'movie_mean', 'movie_std', 'movie_year', 'ts_year', 'ts_month', 'ts_dayofweek', 'user_movie_mean_diff']
Categorical columns (treated ordinal): ['userId', 'movieId']


In [9]:
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
except Exception as e:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train distribution:", Counter(y_train))


Train shape: (80001, 13) Test shape: (20001, 13)
Train distribution: Counter({4: 27339, 3: 21716, 5: 16961, 2: 9096, 1: 4889})


In [10]:
try:
    import lightgbm as lgb
    HAS_LIGHTGBM = True
except ImportError:
    HAS_LIGHTGBM = False

print("LightGBM available:", HAS_LIGHTGBM)

def make_pipeline(estimator):
    return Pipeline([('pre', preprocessor), ('clf', estimator)])

rf = make_pipeline(RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1))
et = make_pipeline(ExtraTreesClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1))

if HAS_LIGHTGBM:
    lgb_clf = make_pipeline(lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, num_leaves=31, objective='multiclass', random_state=42, n_jobs=-1))
else:
    lgb_clf = None

hgb = make_pipeline(HistGradientBoostingClassifier(max_iter=300, learning_rate=0.05, random_state=42))

estimators_list = []
if lgb_clf:
    estimators_list.append(('lgb', lgb_clf))
estimators_list += [('rf', rf), ('et', et), ('hgb', hgb)]

print("Estimators prepared:", [name for name, _ in estimators_list])

LightGBM available: False
Estimators prepared: ['rf', 'et', 'hgb']


In [11]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, VotingClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
et = ExtraTreesClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
hgb = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.05, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('et', et), ('hgb', hgb)],
    voting='soft',
    n_jobs=-1
)

final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', voting_clf)
])

print(" Training ensemble model...")
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report
print(f"\n Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

 Training ensemble model...

 Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1222
           2       1.00      1.00      1.00      2274
           3       1.00      1.00      1.00      5429
           4       1.00      1.00      1.00      6835
           5       1.00      1.00      1.00      4241

    accuracy                           1.00     20001
   macro avg       1.00      1.00      1.00     20001
weighted avg       1.00      1.00      1.00     20001



In [12]:
voting_estimators = []
if lgb_clf:
    voting_estimators.append(('lgb', lgb_clf))
voting_estimators.append(('rf', rf))
voting_estimators.append(('hgb', hgb))

voting_clf = VotingClassifier(estimators=voting_estimators, voting='soft', n_jobs=-1)
voting_clf = make_pipeline(voting_clf)  

from sklearn.base import clone
clf1 = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
clf2 = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.05, random_state=42)
clf3 = ExtraTreesClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
voting = Pipeline([('pre', preprocessor), ('voting', VotingClassifier(estimators=[('rf',clf1),('hgb',clf2),('et',clf3)], voting='soft', n_jobs=-1))])

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print("Voting accuracy:", accuracy_score(y_test, y_pred),)
print(classification_report(y_test, y_pred))

Voting accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1222
           2       1.00      1.00      1.00      2274
           3       1.00      1.00      1.00      5429
           4       1.00      1.00      1.00      6835
           5       1.00      1.00      1.00      4241

    accuracy                           1.00     20001
   macro avg       1.00      1.00      1.00     20001
weighted avg       1.00      1.00      1.00     20001



In [13]:
base_clf1 = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
base_clf2 = ExtraTreesClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
base_clf3 = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.05, random_state=42)

stacking = Pipeline([
    ('pre', preprocessor),
    ('stack', StackingClassifier(estimators=[('rf', base_clf1), ('et', base_clf2), ('hgb', base_clf3)],
                                 final_estimator=LogisticRegression(max_iter=1000),
                                 n_jobs=-1, passthrough=False))
])

print("Fitting stacking (this may take time)...")
stacking.fit(X_train, y_train)
y_pred_stack = stacking.predict(X_test)
print("Stacking accuracy:", accuracy_score(y_test, y_pred_stack))
print(classification_report(y_test, y_pred_stack))

Fitting stacking (this may take time)...
Stacking accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1222
           2       1.00      1.00      1.00      2274
           3       1.00      1.00      1.00      5429
           4       1.00      1.00      1.00      6835
           5       1.00      1.00      1.00      4241

    accuracy                           1.00     20001
   macro avg       1.00      1.00      1.00     20001
weighted avg       1.00      1.00      1.00     20001



In [14]:
param_dist = {
    'clf__n_estimators': [100,200,400],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2,5,10]
}
rf_pipeline = Pipeline([('pre', preprocessor), ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))])

rs = RandomizedSearchCV(rf_pipeline, param_distributions=param_dist, n_iter=6, scoring='accuracy', cv=3, random_state=42, n_jobs=-1, verbose=1)
rs.fit(X_train, y_train)
print("Best params:", rs.best_params_)
best_rf = rs.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print("Tuned RF accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best params: {'clf__n_estimators': 100, 'clf__min_samples_split': 5, 'clf__max_depth': 30}
Tuned RF accuracy: 0.999850007499625
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1222
           2       1.00      1.00      1.00      2274
           3       1.00      1.00      1.00      5429
           4       1.00      1.00      1.00      6835
           5       1.00      1.00      1.00      4241

    accuracy                           1.00     20001
   macro avg       1.00      1.00      1.00     20001
weighted avg       1.00      1.00      1.00     20001



In [15]:
best_model = stacking  
save_path = "best_ensemble_model.joblib"
joblib.dump(best_model, save_path)
print("Saved model to", save_path)


Saved model to best_ensemble_model.joblib
