In [1]:
import pandas as pd


# Load the problem set; the file is read in JSON Lines mode (lines=True).
source_path = "../data/problems_data.jsonl"
df = pd.read_json(source_path, lines=True)

# Structural overview: frame dimensions, then the column index.
print("Shape:", df.shape)
print("\nColumns:", df.columns, sep="\n")

# Transposed preview of the first two records so wide text fields stay readable.
df.iloc[:2].transpose()


Shape: (4112, 8)

Columns:
Index(['title', 'description', 'input_description', 'output_description',
       'sample_io', 'problem_class', 'problem_score', 'url'],
      dtype='object')


Unnamed: 0,0,1
title,Uuu,House Building
description,Unununium (Uuu) was the name of the chemical\n...,A number of eccentrics from central New York h...
input_description,The input consists of one line with two intege...,"The input consists of $10$ test cases, which a..."
output_description,The output consists of $M$ lines where the $i$...,Print $K$ lines with\n the positions of the...
sample_io,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...","[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu..."
problem_class,hard,hard
problem_score,9.7,9.7
url,https://open.kattis.com/problems/uuu,https://open.kattis.com/problems/husbygge


In [2]:
import re

def clean_text(text):
    """Normalise a piece of text for vectorisation.

    The value is converted to ``str`` and lowercased, every character that is
    not an ASCII lowercase letter, a digit or whitespace is replaced by a
    space, runs of whitespace are collapsed to a single space, and leading and
    trailing whitespace is stripped.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing or empty input.
    """
    # Without this guard str(None) / str(nan) would leak the literal tokens
    # "none" / "nan" into the cleaned text.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)   # replace special chars with a space
    text = re.sub(r"\s+", " ", text)           # collapse runs of whitespace
    return text.strip()

# Free-text fields that feed the model, listed in concatenation order.
text_cols = ["title", "description", "input_description", "output_description"]

# Clean all text fields in one assignment; missing values become empty
# strings before clean_text is applied element-wise.
df[text_cols] = df[text_cols].fillna("").apply(
    lambda column: column.map(clean_text)
)

# Join the cleaned fields with single spaces into one document per problem.
df["combined_text"] = df[text_cols[0]].str.cat(
    [df[name] for name in text_cols[1:]],
    sep=" ",
)

# Preview the model input next to both targets.
df[["combined_text", "problem_class", "problem_score"]].head(3)


Unnamed: 0,combined_text,problem_class,problem_score
0,uuu unununium uuu was the name of the chemical...,hard,9.7
1,house building a number of eccentrics from cen...,hard,9.7
2,mario or luigi mario and luigi are playing a g...,hard,9.6


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: vocabulary capped at 5000 terms, English stop words
# removed, unigrams and bigrams as features.
tfidf_settings = {
    "max_features": 5000,
    "stop_words": "english",
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# NOTE(review): the vectoriser is fitted on the whole corpus here, before the
# train/test splits made from X in later cells, so vocabulary and IDF weights
# also see the held-out documents. Consider fitting on the training rows only.
X = vectorizer.fit_transform(df["combined_text"])

print("TF-IDF matrix shape:", X.shape)


TF-IDF matrix shape: (4112, 5000)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Classification target: the categorical difficulty label.
y_class = df["problem_class"]

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Baseline classifier: logistic regression with a raised iteration cap.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute the evaluation artefacts first, then report them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


Accuracy: 0.4738760631834751

Classification Report:
               precision    recall  f1-score   support

        easy       0.51      0.19      0.28       153
        hard       0.52      0.76      0.62       389
      medium       0.34      0.23      0.27       281

    accuracy                           0.47       823
   macro avg       0.45      0.39      0.39       823
weighted avg       0.45      0.47      0.44       823


Confusion Matrix:
 [[ 29  78  46]
 [ 11 296  82]
 [ 17 199  65]]


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Regression target: the numeric difficulty score.
y_reg = df["problem_score"]

# 80/20 hold-out split with a fixed seed (no stratification for a continuous
# target). Note that this rebinds X_train / X_test / y_train / y_test, which
# the classification cell above also assigns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_reg,
    test_size=0.2,
    random_state=42
)

# Random forest with 200 trees, seeded, using all available cores.
reg = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6 (TypeError on current releases). Taking the square root of the MSE
# gives the same RMSE on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("MAE:", mae)
print("RMSE:", rmse)


MAE: 1.6922964763061967
RMSE: 2.039205876542602




In [7]:
from sklearn.model_selection import train_test_split

# Classification target: the categorical difficulty label.
y_class = df["problem_class"]

# Stratified 80/20 split kept under dedicated *_cls names so the regression
# split cannot overwrite it. Same arguments and seed as the split used for the
# logistic-regression baseline.
cls_split = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = cls_split


In [8]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Linear SVM on the TF-IDF features. The estimator is seeded (same seed as the
# splits and the random forest) because LinearSVC's solver can shuffle the
# data, and this is the classifier that gets persisted; without a seed,
# re-running the notebook may not reproduce the saved model.
svm_clf = LinearSVC(random_state=42)
svm_clf.fit(X_train_cls, y_train_cls)

y_pred = svm_clf.predict(X_test_cls)

print("SVM Accuracy:", accuracy_score(y_test_cls, y_pred))
print("\nClassification Report:\n", classification_report(y_test_cls, y_pred))


SVM Accuracy: 0.456865127582017

Classification Report:
               precision    recall  f1-score   support

        easy       0.37      0.29      0.33       153
        hard       0.53      0.61      0.57       389
      medium       0.38      0.33      0.35       281

    accuracy                           0.46       823
   macro avg       0.42      0.41      0.41       823
weighted avg       0.44      0.46      0.45       823



In [9]:
import joblib
import os

# Make sure the output directory exists (no error if it is already there).
os.makedirs("../models", exist_ok=True)

# Destination path -> fitted object, written in this order.
# NOTE(review): clean_text is not persisted with the vectoriser; whatever
# loads these files presumably has to apply the same cleaning before calling
# vectorizer.transform. Confirm against the consumer.
artifacts = {
    "../models/classifier.pkl": svm_clf,
    "../models/regressor.pkl": reg,
    "../models/vectorizer.pkl": vectorizer,
}
for target_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, target_path)

print("Models saved successfully!")


Models saved successfully!
