In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from scipy import sparse
import warnings

# Install a global "ignore" filter so no warning category is printed in cell output
warnings.simplefilter('ignore')

In [26]:
# ====================
# Load Data
# ====================
# Read the five CSV inputs from the working directory, in this order:
# training ratings, test rows, item data, user data, genre data
train_df, test_df, item_df, user_df, genre_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train (1).csv", "test (1).csv", "item_.csv",
                     "user.csv", "genre.csv")
)


In [27]:
# ====================
# Occupation Mapping
# ====================
# Read the occupation names, one stripped line per entry, from the lookup file
with open("occupation.txt") as occupation_file:
    occupation_list = [raw_line.strip() for raw_line in occupation_file]

# Numeric occupation values are treated as positions in occupation_list;
# non-numeric values are kept as they are. Missing values become 'unknown'
# in both cases.
occupation_is_coded = pd.api.types.is_numeric_dtype(user_df['occupation'])
if not occupation_is_coded:
    user_df['occupation'] = user_df['occupation'].fillna('unknown')
else:
    user_df['occupation'] = user_df['occupation'].map(
        lambda code: 'unknown' if pd.isnull(code) else occupation_list[int(code)]
    )

In [28]:
# ====================
# Data Preparation
# ====================
# Normalise the item table's header: trim whitespace around the column names,
# then rename 'movie_id' to 'item_id', the key used in the item merges below
item_df.columns = item_df.columns.str.strip()
item_df = item_df.rename(columns={'movie_id': 'item_id'})

# Left-join the user attributes and then the item columns onto both rating
# tables, so every original row is kept even when no match exists
user_attributes = user_df[['user_id', 'age', 'gender', 'occupation']]
train_df = (
    train_df
    .merge(user_attributes, on='user_id', how='left')
    .merge(item_df, on='item_id', how='left')
)
test_df = (
    test_df
    .merge(user_attributes, on='user_id', how='left')
    .merge(item_df, on='item_id', how='left')
)

In [29]:
# ====================
# Advanced Feature Engineering
# ====================

# Function to extract the release year from the release date
def extract_year(x):
    """Return the release year parsed from a dash-separated date value.

    The value is converted to text and the part after the last "-" is parsed
    as an integer, e.g. "01-Jan-1995" -> 1995. This assumes the year is the
    last dash-separated field of the date.

    Args:
        x: Release date value. Any object is accepted because it is passed
            through ``str`` before parsing.

    Returns:
        The year as an ``int``, or ``np.nan`` when the last field is not an
        integer (missing values, empty strings, free text).
    """
    try:
        return int(str(x).split("-")[-1])
    except (ValueError, TypeError):
        # Only a failed parse means "no year". Anything else (for example a
        # KeyboardInterrupt) must propagate instead of being turned into NaN.
        return np.nan

# Apply the extract_year function to get the release year for each movie
train_df['release_year'] = train_df['release_date'].apply(extract_year)
test_df['release_year'] = test_df['release_date'].apply(extract_year)

# Fill missing values for release year with the median value of the training set.
# The filled column is assigned back rather than using
# `df[col].fillna(..., inplace=True)`: that form acts on a temporary selection
# and leaves the frame unchanged under pandas Copy-on-Write, and the warning
# pandas emits about it is hidden by this notebook's global warnings filter.
median_year = train_df['release_year'].median()
train_df['release_year'] = train_df['release_year'].fillna(median_year)
test_df['release_year'] = test_df['release_year'].fillna(median_year)

# Calculate movie age at the time of rating (assuming data is from 1998)
train_df['movie_age'] = 1998 - train_df['release_year']
test_df['movie_age'] = 1998 - test_df['release_year']

# Generate features based on the timestamp (year, month, day of week, hour).
# The epoch-seconds column is parsed once and reused for all four features.
rating_time = pd.to_datetime(train_df['timestamp'], unit='s')
train_df['rating_year'] = rating_time.dt.year
train_df['rating_month'] = rating_time.dt.month
train_df['rating_dayofweek'] = rating_time.dt.dayofweek
train_df['rating_hour'] = rating_time.dt.hour

# For the test set, use the median values of these features from the training set,
# so each of these columns is a single constant in test_df.
# NOTE(review): presumably the test file has no 'timestamp' column - confirm.
for col in ['rating_year', 'rating_month', 'rating_dayofweek', 'rating_hour']:
    test_df[col] = train_df[col].median()

In [30]:
# ====================
# User Statistics
# ====================
# Per-user aggregates over the training ratings: mean, standard deviation,
# count and median of 'rating', plus the earliest and latest 'timestamp'.
# Named aggregation yields the flat column names directly.
# NOTE(review): the aggregates include each training row's own rating and are
# merged back onto that row, so these features carry target information for
# training rows and CV scores may be optimistic - confirm this is intended.
user_stats = train_df.groupby('user_id').agg(
    user_avg_rating=('rating', 'mean'),
    user_std_rating=('rating', 'std'),
    user_rating_count=('rating', 'count'),
    user_median_rating=('rating', 'median'),
    user_first_rating=('timestamp', 'min'),
    user_last_rating=('timestamp', 'max'),
).reset_index()

# Calculate the time span between first and last ratings for each user
user_stats['user_rating_span'] = user_stats['user_last_rating'] - user_stats['user_first_rating']

# Fill missing values for standard deviation with 0. The result is assigned
# back because `df[col].fillna(..., inplace=True)` acts on a temporary
# selection and is a no-op under pandas Copy-on-Write.
user_stats['user_std_rating'] = user_stats['user_std_rating'].fillna(0)

# Merge user statistics into the training and testing datasets
train_df = train_df.merge(user_stats, on='user_id', how='left')
test_df = test_df.merge(user_stats, on='user_id', how='left')


In [31]:
# ====================
# Movie Statistics
# ====================
# Per-movie aggregates of the training ratings (mean, standard deviation,
# count, median). Named aggregation produces the flat column names directly.
movie_stats = (
    train_df.groupby('item_id')
    .agg(
        movie_avg_rating=('rating', 'mean'),
        movie_std_rating=('rating', 'std'),
        movie_rating_count=('rating', 'count'),
        movie_median_rating=('rating', 'median'),
    )
    .reset_index()
)

In [32]:
# Fill missing values for standard deviation with 0. The result is assigned
# back because `df[col].fillna(..., inplace=True)` acts on a temporary
# selection and is a no-op under pandas Copy-on-Write.
movie_stats['movie_std_rating'] = movie_stats['movie_std_rating'].fillna(0)

# Merge movie statistics into the training and testing datasets
train_df = train_df.merge(movie_stats, on='item_id', how='left')
test_df = test_df.merge(movie_stats, on='item_id', how='left')

In [33]:
# ====================
# User-Genre Interaction
# ====================
# Every item column whose name contains "genre" (case-insensitive) is treated
# as a genre flag. For each one, compute the user's mean rating and rating
# count over training rows where the flag equals 1, and left-join both
# numbers onto the train and test frames. With no matching columns the loop
# simply does not run.
genre_columns = [name for name in item_df.columns if 'genre' in name.lower()]
for genre_col in genre_columns:
    rated_in_genre = train_df.loc[train_df[genre_col] == 1]
    genre_user_stats = (
        rated_in_genre.groupby('user_id')['rating']
        .agg(['mean', 'count'])
        .reset_index()
    )
    genre_user_stats.columns = ['user_id', f'user_{genre_col}avg', f'user{genre_col}_count']
    train_df = train_df.merge(genre_user_stats, on='user_id', how='left')
    test_df = test_df.merge(genre_user_stats, on='user_id', how='left')


In [34]:
# ====================
# Gender-Occupation Interaction
# ====================
# Build one "<gender>_<occupation>" string per row in both frames so the
# pair can be modelled as a single categorical value
for frame in (train_df, test_df):
    frame['gender_occupation'] = (
        frame['gender'].astype(str) + '_' + frame['occupation'].astype(str)
    )


In [35]:
# ====================
# Age Bins
# ====================
# Bucket 'age' into five groups with edges 0/18/25/35/50/100 (pd.cut's default
# right-closed intervals), labelled '0'..'4' and stored as strings
age_bin_edges = [0, 18, 25, 35, 50, 100]
age_bin_labels = ['0', '1', '2', '3', '4']
for frame in (train_df, test_df):
    frame['age_bin'] = pd.cut(
        frame['age'], bins=age_bin_edges, labels=age_bin_labels
    ).astype(str)

In [36]:
# ====================
# User and Movie Deviation from Average Rating
# ====================
# Offset of each user's and each movie's average rating from the overall mean
# training rating. The overall mean is taken from the training set for both frames.
global_mean_rating = train_df['rating'].mean()
for frame in (train_df, test_df):
    frame['user_deviation'] = frame['user_avg_rating'] - global_mean_rating
    frame['movie_deviation'] = frame['movie_avg_rating'] - global_mean_rating

In [37]:
# ====================
# Target Encoding with K-Fold
# ====================
# Function for target encoding using K-Fold cross-validation to prevent overfitting
def target_encode_kfold(train, test, col, target, n_splits=5):
    """Mean-target encode a categorical column, out-of-fold for the train set.

    Each training row is encoded with the per-category target mean computed
    on the other folds, so a row's own target never contributes to its own
    encoding. Test rows are encoded with per-category means from the whole
    training frame. Categories without a mean fall back to the global
    training mean.

    Args:
        train: Training DataFrame containing ``col`` and ``target``.
        test: Test DataFrame containing ``col``.
        col: Name of the categorical column to encode.
        target: Name of the target column. It is cast to ``int`` to form the
            stratification classes for the fold split.
        n_splits: Number of stratified folds (shuffled, ``random_state=42``).

    Returns:
        Tuple ``(train_encoded, test_encoded)``: a NumPy float array in the
        positional row order of ``train``, and a pandas Series indexed like
        ``test``.
    """
    # Every position is overwritten below because the folds partition the rows
    train_encoded = np.zeros(len(train))

    # Global mean for unseen categories
    global_mean = train[target].mean()

    # K-Fold encoding for train
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for tr_idx, val_idx in kf.split(train, train[target].astype(int)):
        target_mean = train.iloc[tr_idx].groupby(col)[target].mean()
        # Categories absent from the fitting folds map to NaN here
        train_encoded[val_idx] = train.iloc[val_idx][col].map(target_mean).to_numpy()

    # Fill NaN with global mean
    train_encoded = np.where(np.isnan(train_encoded), global_mean, train_encoded)

    # Encode test using all training data
    target_mean = train.groupby(col)[target].mean()
    test_encoded = test[col].map(target_mean).fillna(global_mean)

    return train_encoded, test_encoded

# Target-encode 'occupation' and 'gender_occupation' against 'rating',
# writing each result pair to the matching column in train and test
for source_col, encoded_col in (
    ('occupation', 'occupation_encoded'),
    ('gender_occupation', 'gender_occ_encoded'),
):
    train_df[encoded_col], test_df[encoded_col] = target_encode_kfold(
        train_df, test_df, source_col, 'rating')


In [38]:
# ====================
# Label Encoding for Categorical Features
# ====================
# Convert categorical columns like gender, occupation, and age_bin into numerical values.
# The encoder is fitted on the training values only; test values that were not
# seen in training are encoded as -1.
for col in ['gender', 'occupation', 'age_bin']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    # A label's code is its position in le.classes_. Building the lookup once
    # replaces a per-row le.transform() call plus a linear scan of le.classes_
    # for every test row, and yields the same codes.
    class_to_code = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = test_df[col].map(lambda x: class_to_code.get(str(x), -1))

In [39]:
# ====================
# Fill Missing Values in Numeric Feature Columns
# ====================
# Replace remaining NaNs with the sentinel -999 in every column that is
# numeric in the training frame, and in the same-named test columns where
# they exist. The filled column is assigned back because
# `df[col].fillna(..., inplace=True)` acts on a temporary selection and
# is a no-op under pandas Copy-on-Write.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    train_df[col] = train_df[col].fillna(-999)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(-999)

In [40]:
# ====================
# Feature Selection
# ====================
# Drop the target, raw text/date columns and helper columns; every remaining
# training column is a model feature
drop_cols = ['rating', 'timestamp', 'title', 'release_date', 'imdb_url', 
             'user_first_rating', 'user_last_rating', 'gender_occupation']
feature_cols = [c for c in train_df.columns if c not in drop_cols]

test_drop_cols = ['id', 'title', 'release_date', 'imdb_url', 
                  'user_first_rating', 'user_last_rating', 'gender_occupation']
test_feature_cols = [c for c in test_df.columns if c not in test_drop_cols]

# The two lists come from separate drop lists, so check that they name the
# same features before any model is trained, and fail with a readable message
# if they have drifted apart.
missing_in_test = [c for c in feature_cols if c not in test_feature_cols]
extra_in_test = [c for c in test_feature_cols if c not in feature_cols]
if missing_in_test or extra_in_test:
    raise ValueError(
        f"Train/test feature mismatch: missing in test={missing_in_test}, "
        f"unexpected in test={extra_in_test}"
    )

# Prepare the feature matrix and target vector
X = train_df[feature_cols]
y = train_df['rating']
# Select test columns with the training feature list so the column order of
# X_test is the same as that of X
X_test = test_df[feature_cols]


In [None]:
# ====================
# Stratified K-Fold Cross Validation
# ====================
# Train XGBoost and LightGBM on each of 7 folds stratified by the integer
# rating. Out-of-fold predictions are stored for every training row, and each
# model's test-set predictions are averaged over the folds.
n_splits = 7
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds_xgb = np.zeros(len(X))
oof_preds_lgb = np.zeros(len(X))
test_preds_xgb = np.zeros(len(X_test))
test_preds_lgb = np.zeros(len(X_test))

# The hyper-parameters and the test DMatrix are the same for every fold, so
# they are built once here instead of once per iteration.
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 7,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'reg_alpha': 0.5,
    'reg_lambda': 2,
    'gamma': 0.1,
    'random_state': 42
}

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'reg_alpha': 0.5,
    'reg_lambda': 2,
    'verbose': -1,
    'random_state': 42
}

dtest = xgb.DMatrix(X_test)

# Cross-validation loop for both models (XGBoost and LightGBM)
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y.astype(int)), start=1):
    print(f"\n{'='*50}")
    print(f"Fold {fold}/{n_splits}")
    print(f"{'='*50}")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # ========== XGBoost ==========
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=1000,
        early_stopping_rounds=50,
        evals=[(dval, 'eval')],
        verbose_eval=False
    )
    
    # Predict with the trees up to and including the best early-stopping
    # round, as the LightGBM branch does with num_iteration. This avoids
    # relying on the default tree range, which differs between XGBoost versions.
    best_rounds = (0, xgb_model.best_iteration + 1)
    val_preds_xgb = xgb_model.predict(dval, iteration_range=best_rounds)
    oof_preds_xgb[val_idx] = val_preds_xgb
    test_preds_xgb += xgb_model.predict(dtest, iteration_range=best_rounds) / n_splits
    
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated and later removed from scikit-learn.
    rmse_xgb = np.sqrt(mean_squared_error(y_val, val_preds_xgb))
    print(f"XGBoost RMSE: {rmse_xgb:.4f}")
    
    # ========== LightGBM ==========
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
    
    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    val_preds_lgb = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
    oof_preds_lgb[val_idx] = val_preds_lgb
    test_preds_lgb += lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration) / n_splits
    
    rmse_lgb = np.sqrt(mean_squared_error(y_val, val_preds_lgb))
    print(f"LightGBM RMSE: {rmse_lgb:.4f}")
    
    # Ensemble predictions: equal-weight average of the two models
    val_preds_ensemble = 0.5 * val_preds_xgb + 0.5 * val_preds_lgb
    rmse_ensemble = np.sqrt(mean_squared_error(y_val, val_preds_ensemble))
    print(f"Ensemble RMSE: {rmse_ensemble:.4f}")

# ====================
# Overall CV Scores
# ====================
print(f"\n{'='*50}")
print("OVERALL CROSS-VALIDATION SCORES")
print(f"{'='*50}")


Fold 1/7
XGBoost RMSE: 0.8867
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 0.88946
LightGBM RMSE: 0.8895
Ensemble RMSE: 0.8856

Fold 2/7
XGBoost RMSE: 0.8877
Training until validation scores don't improve for 50 rounds


In [None]:
# Calculate overall CV RMSE for each model and ensemble from the out-of-fold
# predictions. RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated and later removed from scikit-learn.
cv_rmse_xgb = np.sqrt(mean_squared_error(y, oof_preds_xgb))
print(f"XGBoost CV RMSE: {cv_rmse_xgb:.4f}")

cv_rmse_lgb = np.sqrt(mean_squared_error(y, oof_preds_lgb))
print(f"LightGBM CV RMSE: {cv_rmse_lgb:.4f}")

# Equal-weight average of the two models' out-of-fold predictions
oof_preds_ensemble = 0.5 * oof_preds_xgb + 0.5 * oof_preds_lgb
cv_rmse_ensemble = np.sqrt(mean_squared_error(y, oof_preds_ensemble))
print(f"Ensemble CV RMSE: {cv_rmse_ensemble:.4f}")

In [None]:
# ====================
# Final Predictions
# ====================
# Equal-weight blend of the fold-averaged test predictions of both models
test_preds_ensemble = 0.5 * test_preds_xgb + 0.5 * test_preds_lgb

# Round to the nearest whole rating, then clamp into the range 1..5
rounded_ratings = np.round(test_preds_ensemble)
y_test_pred_clipped = np.clip(rounded_ratings, 1, 5)

# Assemble the submission table.
# NOTE(review): the column named 'timestamp' is filled from test_df['id'] -
# confirm this is the header the submission format expects.
submission_columns = {
    'timestamp': test_df['id'],
    'rating': y_test_pred_clipped.astype(int)
}
submission = pd.DataFrame(submission_columns)

# Write the table without the index column, then report where it went
submission.to_csv('IITG_240107019_Ayush_kumar3.csv', index=False)
print(f"\n{'='*50}")
print("Predictions saved to 'IITG_240107019_Ayush_kumar3.csv'")
print(f"{'='*50}")