In [1]:
import joblib
data=joblib.load("training_data.pkl")

In [2]:
import pandas as pd
x=data["user_input"]
y_class=data["problem_class"]
y_score=data["problem_score"]

In [3]:
from sklearn.model_selection import train_test_split

# splitting that dataset to two parts - first for training (~80%), second for testing (~20%) (about 800 problems)
# random state keeps the randomness of the split fixed
# stratify keeps the proportion of problems (easy,med,hard) equal in train and test set to ensure that training and test set dont lack any kind of class

x_train, x_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
    x,
    y_class,
    y_score,
    test_size=0.20,
    random_state=42,
    stratify=y_class
)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF is used to mark words acc to their frequency
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    stop_words="english"
)

x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
joblib.dump(tfidf , "tfidf.pkl")

['tfidf.pkl']

In [5]:
keywords = ["greedy","dp","dynamic programming","tree","graph","dfs","bfs","two pointers","binary search","bitmasks","dfs","combinatorics"]
def extra_features(text):
    return [
        len(text),
        len(text.split()),
        sum(text.count(k) for k in keywords),
        sum(1 for c in text if c in "+-*/%")
    ]

In [6]:
from scipy.sparse import hstack
import numpy as np

x_train_extra = np.array(x_train.apply(extra_features).tolist())
x_test_extra = np.array(x_test.apply(extra_features).tolist())

x_train_final = hstack([x_train_tfidf, x_train_extra])
x_test_final = hstack([x_test_tfidf, x_test_extra])

In [8]:
# Regression through Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

reg_lr = LinearRegression()
reg_lr.fit(x_train_final, y_score_train)

reg_lr_test = reg_lr.predict(x_test_final)

mae = mean_absolute_error(y_score_test, reg_lr_test)
rmse = np.sqrt(mean_squared_error(y_score_test, reg_lr_test))

print("Linear Regression MAE:", mae)
print("Linear Regression RMSE:", rmse)

Linear Regression MAE: 2.358895946004565
Linear Regression RMSE: 2.9345392852358567


In [9]:
# Regression through Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

reg_ridge = Ridge(alpha=1.0)
reg_ridge.fit(x_train_final, y_score_train)

reg_ridge_test = reg_ridge.predict(x_test_final)

mae = mean_absolute_error(y_score_test, reg_ridge_test)
rmse = np.sqrt(mean_squared_error(y_score_test, reg_ridge_test))

print("Ridge MAE:", mae)
print("Ridge RMSE:", rmse)

Ridge MAE: 1.7368451684479564
Ridge RMSE: 2.075326291673535


In [10]:
# Regression through Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

reg_rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

reg_rf.fit(x_train_final, y_score_train)
reg_rf_test = reg_rf.predict(x_test_final)

mae = mean_absolute_error(y_score_test, reg_rf_test)
rmse = np.sqrt(mean_squared_error(y_score_test, reg_rf_test))

print("Random Forest MAE:", mae)
print("Random Forest RMSE:", rmse)

Random Forest MAE: 1.672363304981774
Random Forest RMSE: 2.0218846541477107


In [12]:
# Regression through Gradient Boost Regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

reg_gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

reg_gbr.fit(x_train_final, y_score_train)
reg_gbr_test = reg_gbr.predict(x_test_final)

mae = mean_absolute_error(y_score_test, reg_gbr_test)
rmse = np.sqrt(mean_squared_error(y_score_test, reg_gbr_test))

print("Gradient Boosting MAE:", mae)
print("Gradient Boosting RMSE:", rmse)

Gradient Boosting MAE: 1.6867568626762863
Gradient Boosting RMSE: 2.015229391906969
