Mounting drive

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written
# (google.colab is only available inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Setting the path to the main project folder

In [None]:
# Project root on the mounted Drive; the dataset is read from here and the fitted
# models are written to its "models" subfolder.
path = '/content/drive/MyDrive/acm_project'

Importing important libraries

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

Data Preprocessing and cleaning

In [None]:
# Load the problems dataset from a JSON Lines file (lines=True: one JSON record per line).
df = pd.read_json(f"{path}/problems_data.jsonl", lines=True)

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3 3 C 2 C 1 C', 'output': 'GHOST...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0 10 0 10 10', 'output': '14.14'...",hard,9.6,https://open.kattis.com/problems/barktree


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(4112, 8)

Handling missing values

In [None]:
# Free-text columns that are concatenated in the next step. Missing entries are
# replaced by "" so that string concatenation does not turn the whole row into NaN.
text_cols = ['title', 'description', 'input_description', 'output_description']
df[text_cols] = df[text_cols].fillna("")

Creating a full text column and further cleaning the data

In [None]:

# Join title, statement and input/output descriptions into one space-separated string per problem.
df['full_text'] = df['title'] + " " + df['description'] + " " + df['input_description'] + " " + df['output_description']
#data cleaning
def clean_text(text):
    """Lower-case `text` and blank out every character outside a small whitelist.

    Kept characters: the letters a-z, digits, whitespace and the math symbols
    ``+ - * / = < >``. Any other character (punctuation, LaTeX markers,
    non-ASCII letters, ...) is replaced one-for-one by a single space.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lower-cased, filtered text.
    """
    text = text.lower()
    # "/" is part of the whitelist: the symbol-count feature computed later counts "/",
    # which could never match if the division sign were stripped here.
    text = re.sub(r"[^a-z0-9\s+\-*/=<>]", " ", text)
    return text
# Overwrite the concatenated text with its cleaned version; all later features use this column.
df['full_text'] = df['full_text'].apply(clean_text)


In [None]:
# Inspect the cleaned text column (pandas abbreviates long values and the middle rows).
df['full_text']

Unnamed: 0,full_text
0,uuu unununium uuu was the name of the chemic...
1,house building a number of eccentrics from cen...
2,mario or luigi mario and luigi are playing a g...
3,the wire ghost ofka is bending a copper wire ...
4,barking up the wrong tree your dog spot is let...
...,...
4107,t lvunarfr ingar telja computer scientists co...
4108,velkomin welcome to forritunarkeppni framhald...
4109,til hamingju there is no input in this proble...
4110,hipp hipp there is no input in this problem ...


FEATURE ENGINEERING

In [None]:
# TF-IDF over unigrams and bigrams of the cleaned text, capped at 5000 features.
# NOTE(review): the vectorizer is fitted on every row, before the train/test split
# performed further down, so vocabulary/IDF statistics of the test rows leak into the
# features — consider fitting on the training rows only.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_text = tfidf.fit_transform(df['full_text'])

In [None]:
# Engineered feature: number of characters in the cleaned text
df['text_length'] = df['full_text'].str.len()

In [None]:
math_symbols = ["+", "-", "*", "/", "=", "<", ">"]


def count_symbols(text):
    """Return how many times the symbols listed in `math_symbols` occur in `text`, summed over all symbols."""
    total = 0
    for symbol in math_symbols:
        total += text.count(symbol)
    return total
df['math_symbols'] = df['full_text'].apply(count_symbols)  # per-problem count of math symbols in the cleaned text

In [None]:
keywords = ["graph", "dp", "recursion", "math", "string", "array", "greedy", "binarysearch"]
# Regex overrides for keywords whose literal spelling is not how the term is written
# in running text: "binary search" is normally written with a space (or a hyphen), so
# the literal "binarysearch" would essentially never match. The feature/column name
# stays "binarysearch_count".
keyword_patterns = {"binarysearch": r"binary[\s\-]*search"}
for kw in keywords:
    # Series.str.count treats its argument as a regular expression, so plain keywords
    # are escaped. full_text was already lower-cased by clean_text, so no extra
    # .str.lower() pass is needed.
    # NOTE(review): these are substring matches (e.g. "math" also counts "mathematical").
    pattern = keyword_patterns.get(kw, re.escape(kw))
    df[f'{kw}_count'] = df["full_text"].str.count(pattern)  # frequency of each keyword

TruncatedSVD is applied to reduce the high dimensionality of the text-based features
(e.g., TF-IDF or bag-of-words representations), which often contain a very large
number of sparse features. Reducing the feature space improves computational
efficiency, lowers memory usage, and helps mitigate the curse of dimensionality.

In [None]:
from sklearn.decomposition import TruncatedSVD
# Project the sparse TF-IDF matrix onto 25 components; the seed is fixed for reproducibility.
# NOTE(review): like the TF-IDF step, this is fitted on all rows, before the train/test split.
svd = TruncatedSVD(n_components=25, random_state=42)
X_reduced = svd.fit_transform(X_text)

In [None]:
# Sanity check: one row per problem, one column per SVD component.
X_reduced.shape

(4112, 25)

In [None]:
# Wrap the SVD output in a DataFrame (default integer column labels and default index)
# so that named feature columns can be appended to it.
X_reduced = pd.DataFrame(X_reduced)

In [None]:
# Copy the hand-crafted features from df next to the SVD components,
# in the order: text length, symbol count, then one count column per keyword.
extra_feature_cols = ["text_length", "math_symbols"] + [f"{kw}_count" for kw in keywords]
for col in extra_feature_cols:
    X_reduced[col] = df[col]
print("Adding additional features to text features extracted")

Adding additional features to text features extracted


In [None]:
# Inspect the combined feature frame (SVD components followed by the engineered columns).
X_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,text_length,math_symbols,graph_count,dp_count,recursion_count,math_count,string_count,array_count,greedy_count,binarysearch_count
0,0.412872,0.012789,0.082151,0.011495,0.009994,0.042823,-0.023077,-0.036706,-0.115823,0.001482,...,1672,3,5,0,0,0,0,0,0,0
1,0.409548,-0.065984,0.020378,-0.022589,-0.042640,-0.033104,-0.059465,0.042504,-0.034282,-0.007279,...,1422,3,0,0,0,0,0,0,0,0
2,0.314675,-0.102371,0.078870,0.082283,-0.020164,0.069482,0.111993,-0.037518,0.011360,0.008771,...,1334,1,0,0,0,0,0,0,0,0
3,0.350846,-0.021838,-0.088157,-0.049159,-0.008071,-0.003203,-0.061901,-0.016298,0.042576,0.097519,...,1390,0,0,0,0,0,2,0,0,0
4,0.317803,-0.059108,0.047336,-0.051538,0.044427,-0.127037,0.054395,0.047712,-0.065885,-0.034637,...,2261,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,0.204727,-0.006771,0.008983,-0.000707,0.026528,-0.000732,0.023023,0.000651,0.023842,0.039317,...,442,2,0,0,0,0,0,0,0,0
4108,0.135621,0.069497,0.064112,0.073662,0.023070,0.019383,0.018488,0.048613,-0.075475,-0.000218,...,138,0,0,0,0,0,0,0,0,0
4109,0.116465,0.094176,0.048763,0.065761,0.019482,0.031685,0.016592,0.046447,-0.070616,-0.001686,...,149,0,0,0,0,0,0,0,0,0
4110,0.108891,0.081432,0.030026,0.048604,0.023663,0.020884,0.011879,0.027001,-0.084547,0.018001,...,103,0,0,0,0,0,0,0,0,0


In [None]:
# The frame mixes integer column labels (SVD components) with string labels (engineered
# features); cast them all to str so every feature name has the same type, which
# scikit-learn expects when it is given a DataFrame.
X_reduced.columns = X_reduced.columns.astype(str)
print("Ensuring all features name have same type")

Ensuring all features name have same type


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on the raw lengths kept in df, not on X_reduced's own column. The cell then
# produces the same result however often it is executed. Fitting on
# X_reduced['text_length'] would, on a re-run, standardize values that are already
# standardized and leave `scaler` (which is saved for later use) with a mean of about 0
# and a scale of about 1 instead of the real statistics of the text lengths.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
X_reduced['text_length'] = scaler.fit_transform(df[['text_length']]).ravel()
print('scaling the text length')

scaling the text length


In [None]:
# Inspect the feature frame again: text_length is now standardized.
X_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,text_length,math_symbols,graph_count,dp_count,recursion_count,math_count,string_count,array_count,greedy_count,binarysearch_count
0,0.412872,0.012789,0.082151,0.011495,0.009994,0.042823,-0.023077,-0.036706,-0.115823,0.001482,...,0.061975,3,5,0,0,0,0,0,0,0
1,0.409548,-0.065984,0.020378,-0.022589,-0.042640,-0.033104,-0.059465,0.042504,-0.034282,-0.007279,...,-0.268436,3,0,0,0,0,0,0,0,0
2,0.314675,-0.102371,0.078870,0.082283,-0.020164,0.069482,0.111993,-0.037518,0.011360,0.008771,...,-0.384741,1,0,0,0,0,0,0,0,0
3,0.350846,-0.021838,-0.088157,-0.049159,-0.008071,-0.003203,-0.061901,-0.016298,0.042576,0.097519,...,-0.310729,0,0,0,0,0,2,0,0,0
4,0.317803,-0.059108,0.047336,-0.051538,0.044427,-0.127037,0.054395,0.047712,-0.065885,-0.034637,...,0.840425,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4107,0.204727,-0.006771,0.008983,-0.000707,0.026528,-0.000732,0.023023,0.000651,0.023842,0.039317,...,-1.563649,2,0,0,0,0,0,0,0,0
4108,0.135621,0.069497,0.064112,0.073662,0.023070,0.019383,0.018488,0.048613,-0.075475,-0.000218,...,-1.965429,0,0,0,0,0,0,0,0,0
4109,0.116465,0.094176,0.048763,0.065761,0.019482,0.031685,0.016592,0.046447,-0.070616,-0.001686,...,-1.950891,0,0,0,0,0,0,0,0,0
4110,0.108891,0.081432,0.030026,0.048604,0.023663,0.020884,0.011879,0.027001,-0.084547,0.018001,...,-2.011687,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split
# Two targets are learned from the same features.
y_class = df['problem_class']  # classification
y_score = df['problem_score']  # regression

# Train-test split
# A single call splits the features and both targets together, so the rows stay
# aligned across all six outputs; 20% of the rows are held out, with a fixed seed.
X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
    X_reduced, y_class, y_score, test_size=0.2, random_state=42)

Classification model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Three candidate classifiers for problem_class, each with a fixed seed.
logreg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(kernel='linear', probability=True, random_state=42)

classifiers = {
    "Logistic Regression": logreg,
    "Random Forest": rf,
    "SVM": svm,
}

# Fit every model on the training split and predict the held-out split.
model_preds = {
    label: clf.fit(X_train, y_class_train).predict(X_test)
    for label, clf in classifiers.items()
}
y_pred_logreg = model_preds["Logistic Regression"]
y_pred_rf = model_preds["Random Forest"]
y_pred_svm = model_preds["SVM"]

# Report accuracy and the confusion matrix of each model on the test split.
for name, y_pred in model_preds.items():
    acc = accuracy_score(y_class_test, y_pred)
    cm = confusion_matrix(y_class_test, y_pred)
    print(f"--- {name} ---\nAccuracy: {acc:.4f}\nConfusion Matrix:")
    print(cm)
    print()


--- Logistic Regression ---
Accuracy: 0.5480
Confusion Matrix:
[[ 49  64  23]
 [ 19 356  50]
 [ 21 195  46]]

--- Random Forest ---
Accuracy: 0.5334
Confusion Matrix:
[[ 30  68  38]
 [ 12 332  81]
 [ 12 173  77]]

--- SVM ---
Accuracy: 0.5480
Confusion Matrix:
[[ 45  91   0]
 [ 18 406   1]
 [ 21 241   0]]



Regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Three candidate regressors for problem_score.
linreg = LinearRegression()
rf_r = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)

regressors = {
    "Linear Regression": linreg,
    "Random Forest": rf_r,
    "Gradient Boosting": gb,
}

# Fit every model on the training split and predict the held-out split.
# NOTE: `model_preds` and `y_pred_rf` reuse names from the classification cell,
# so the classifier values stored under those names are replaced here.
model_preds = {
    label: reg.fit(X_train, y_score_train).predict(X_test)
    for label, reg in regressors.items()
}
y_pred_linreg = model_preds["Linear Regression"]
y_pred_rf = model_preds["Random Forest"]
y_pred_gb = model_preds["Gradient Boosting"]


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

# Collect MAE, RMSE and R2 on the test split for every regressor.
results = {}
for name, y_pred in model_preds.items():
    results[name] = {
        "MAE": mean_absolute_error(y_score_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_score_test, y_pred)),
        "R2": r2_score(y_score_test, y_pred),
    }

# One report block per model, followed by a blank line.
for name, metrics in results.items():
    report_lines = [f"--- {name} ---"]
    report_lines.append(f"MAE: {metrics['MAE']:.4f}")
    report_lines.append(f"RMSE: {metrics['RMSE']:.4f}")
    report_lines.append(f"R2: {metrics['R2']:.4f}")
    print("\n".join(report_lines))
    print()


--- Linear Regression ---
MAE: 1.7119
RMSE: 2.0604
R2: 0.1156

--- Random Forest ---
MAE: 1.7478
RMSE: 2.0958
R2: 0.0850

--- Gradient Boosting ---
MAE: 1.7126
RMSE: 2.0588
R2: 0.1170



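Best classifier: SVM (test accuracy 0.5480, tied with Logistic Regression). Note that, according to its confusion matrix, the SVM almost never predicts the third class.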

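Best regressor: Gradient Boosting (lowest RMSE, 2.0588, and highest R2, 0.1170; its MAE of 1.7126 is practically tied with Linear Regression's 1.7119).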

In [None]:
import joblib

# joblib.dump does not create missing folders, so make sure the target exists first;
# exist_ok=True keeps the cell safe to re-run.
models_dir = f"{path}/models"
os.makedirs(models_dir, exist_ok=True)

# Everything needed to reproduce the feature pipeline and the two selected models.
artifacts = {
    "classifier.pkl": svm,
    "tfidf.pkl": tfidf,
    "regressor.pkl": gb,
    "svd.pkl": svd,
    "scaler.pkl": scaler,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, f"{models_dir}/{filename}")

print("All models are saved")


All models are saved
