## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


## Load Final Dataset

In [2]:
# Read the train and test splits from their CSV files (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_final.csv", "../data/test_final.csv")
)

# Cell output: (rows, columns) of each split.
(df_train.shape, df_test.shape)



((9309, 6), (458, 6))

## Text Cleaning Function

In [3]:
def clean_text(text):
    """Normalize raw text into lowercase, space-separated ASCII alphanumeric tokens.

    The input is lowercased, every character that is not ``a-z``, ``0-9`` or
    whitespace is replaced by a space, runs of whitespace (newlines included)
    are collapsed to a single space, and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (the marker pandas uses for missing
        values) are treated as empty text instead of raising. Any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text; ``""`` for empty or missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    # \s+ also matches "\n", so newlines are collapsed here without a separate pass.
    text = re.sub(r"\s+", " ", text)
    return text.strip()



## Creating Combined Text

In [4]:
TEXT_COLUMNS = ["title", "description", "input_format", "output_format"]

# Both splits get exactly the same preparation, so handle them in one loop.
for frame in (df_train, df_test):
    # Replace missing fields with empty strings (written back to the frame).
    for col in TEXT_COLUMNS:
        frame[col] = frame[col].fillna("")

    # Concatenate the fields left to right, separated by a single space.
    joined = frame[TEXT_COLUMNS[0]]
    for col in TEXT_COLUMNS[1:]:
        joined = joined + " " + frame[col]
    frame["combined_text"] = joined


## Cleaning the Text

In [5]:
# Run clean_text over the combined text of each split and store the result
# in a new "clean_text" column.
for frame in (df_train, df_test):
    frame["clean_text"] = frame["combined_text"].apply(clean_text)


## Loading TF-IDF Vectorizer & Transform Text

In [6]:
# Load the persisted TF-IDF vectorizer. Only transform() is called on it here,
# so it is used exactly as it was saved and is never refit in this notebook.
vectorizer = joblib.load("../models/tfidf_vectorizer.pkl")

# Vectorize the cleaned text of each split (train first, then test).
X_train, X_test = (
    vectorizer.transform(frame["clean_text"])
    for frame in (df_train, df_test)
)


## Defining Regression Target

In [7]:
# Regression target: the "rating" column of each split.
y_train, y_test = df_train["rating"], df_test["rating"]


## REGRESSION MODEL COMPARISON

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Accumulators filled in by the model cells:
#   reg_results -> one (model name, MAE, RMSE) tuple per evaluated model
#   reg_models  -> fitted estimator, keyed by model name
reg_results, reg_models = [], {}


## Train & Compare Regression Models
### LINEAR REGRESSION

In [9]:
# Fit ordinary least squares on the TF-IDF features and score it on the test split.
lin = LinearRegression()
lin.fit(X_train, y_train)
lin_pred = lin.predict(X_test)

# Remove any row left by an earlier run of this cell so that re-running it
# does not add a duplicate "Linear Regression" entry to the comparison table.
reg_results[:] = [row for row in reg_results if row[0] != "Linear Regression"]
reg_results.append((
    "Linear Regression",
    mean_absolute_error(y_test, lin_pred),
    # RMSE = square root of the mean squared error
    np.sqrt(mean_squared_error(y_test, lin_pred)),
))

reg_models["Linear Regression"] = lin


### RANDOM FOREST REGRESSOR

In [10]:
# Random forest of 200 trees; fixed seed for repeatable results, all cores used.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Remove any row left by an earlier run of this cell so that re-running it
# does not add a duplicate "Random Forest" entry to the comparison table.
reg_results[:] = [row for row in reg_results if row[0] != "Random Forest"]
reg_results.append((
    "Random Forest",
    mean_absolute_error(y_test, rf_pred),
    # RMSE = square root of the mean squared error
    np.sqrt(mean_squared_error(y_test, rf_pred)),
))

reg_models["Random Forest"] = rf


### GRADIENT BOOSTING REGRESSOR

In [11]:
# Gradient boosting with library-default hyperparameters and a fixed seed.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# Remove any row left by an earlier run of this cell so that re-running it
# does not add a duplicate "Gradient Boosting" entry to the comparison table.
reg_results[:] = [row for row in reg_results if row[0] != "Gradient Boosting"]
reg_results.append((
    "Gradient Boosting",
    mean_absolute_error(y_test, gb_pred),
    # RMSE = square root of the mean squared error
    np.sqrt(mean_squared_error(y_test, gb_pred)),
))

reg_models["Gradient Boosting"] = gb


### Regression Comparison Table

In [12]:
# Turn the collected (model name, MAE, RMSE) tuples into a table, one row per model.
reg_results_df = pd.DataFrame.from_records(reg_results, columns=["Model", "MAE", "RMSE"])

# Cell output: the comparison table.
reg_results_df



Unnamed: 0,Model,MAE,RMSE
0,Linear Regression,694.322846,869.368927
1,Random Forest,508.162516,664.192209
2,Gradient Boosting,525.045027,664.597273


For difficulty score prediction, three regression models were evaluated on the test set: Linear Regression, Random Forest Regressor, and Gradient Boosting Regressor. Linear Regression performed worst (MAE ≈ 694, RMSE ≈ 869), likely because an unregularized linear model fits high-dimensional TF-IDF features poorly and cannot capture non-linear relationships. Random Forest Regressor achieved the lowest MAE (≈ 508 vs. ≈ 525) and a marginally lower RMSE (≈ 664.2 vs. ≈ 664.6) than Gradient Boosting. Consequently, Random Forest Regressor was selected as the final model for numerical difficulty prediction.

## FINAL SAVE CELL

In [13]:
# Random Forest is the chosen final model (lowest MAE & RMSE in the comparison);
# fetch the fitted estimator from the registry and write it to disk.
final_regressor = reg_models["Random Forest"]
joblib.dump(final_regressor, "../models/rating_regressor.pkl")


['../models/rating_regressor.pkl']