In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.sparse import hstack

In [2]:
from load_data.data_loader import load_df
df = load_df()
df.head()

Unnamed: 0,problem_class,problem_score,score_norm,combined_text,constraint,math,algoword,length
0,hard,2500.0,0.62963,Title: Digits Description: John gave Jack a ve...,5.30103,12,0,888
1,medium,2000.0,0.444444,Title: Neural Network country Description: Due...,2.025306,13,2,995
2,medium,2100.0,0.481481,Title: Property Description: Bill is a famous ...,4.69897,70,0,1292
3,medium,2100.0,0.481481,Title: Exploration plan Description: The compe...,4.30103,3,0,1253
4,medium,2100.0,0.481481,Title: Casinos and travel Description: John ha...,5.0,4,2,1337


In [3]:
feature_cols = ["constraint", "math", "algoword", "length"]

X_text = df["combined_text"]
X_num  = df[feature_cols].fillna(0)
y = df["score_norm"]

(
    X_text_train, X_text_test,
    X_num_train,  X_num_test,
    y_train,  y_test,
) = train_test_split(
    X_text, X_num, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=3, max_df=0.9)

X_text_train_vec = tfidf.fit_transform(X_text_train)
X_text_test_vec  = tfidf.transform(X_text_test)

scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled  = scaler.fit_transform(X_num_test)

X_train_final = hstack([X_text_train_vec, X_num_train_scaled])
X_test_final  = hstack([X_text_test_vec,  X_num_test_scaled])


In [4]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_final, y_train)
y_pred_rfr = rf.predict(X_test_final)

print("RFR")
print("MAE:", mean_absolute_error(y_test, y_pred_rfr))
print("R² :", r2_score(y_test, y_pred_rfr))


RFR
MAE: 0.177664752728568
R² : 0.3215507355716495
