In [1]:
import pandas as pd
import numpy as np
import math
import re
import joblib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler

from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
import os

In [2]:
# Data Exploring
# Load the JSON-lines dump and print a few sanity numbers about blank fields.

df = pd.read_json("problems_data.jsonl", lines=True)

# The next two expressions are evaluated but not rendered (they are neither
# printed nor the last expression of the cell); the trailing remarks record
# what the author saw when inspecting them interactively.
df.isna().sum()             # No null values found
df['problem_class'].value_counts()      # Easy class is imbalanced
print(df.columns)
# df.dtypes

# Placeholder sentence that some rows carry instead of an empty input section.
pseudo_null = 'There is no input in this problem.'

# Number of rows whose text field is exactly the empty string.
empty_input_mask = df['input_description'] == ''
empty_output_mask = df['output_description'] == ''
empty_description_mask = df['description'] == ''
missing_input = len(df[empty_input_mask])
missing_output = len(df[empty_output_mask])
missing = len(df[empty_description_mask])
print(missing_input, missing_output, missing, sep='\n')

# Rows that use the placeholder sentence as their input description.
pseudo_null_mask = df['input_description'] == pseudo_null
print(len(df[pseudo_null_mask]))

# Ad-hoc inspection snippets, intentionally left disabled:
# for line in list(df['input_description'].unique()):
    # print(line)

# df.loc[df['description'].str.len()<1,['title', 'description', 'input_description', 'output_description',
    #    'sample_io']]

# Rendered as the cell output: per-class summary statistics.
df.groupby("problem_class").describe()



Index(['title', 'description', 'input_description', 'output_description',
       'sample_io', 'problem_class', 'problem_score', 'url'],
      dtype='object')
120
131
81
3


Unnamed: 0_level_0,problem_score,problem_score,problem_score,problem_score,problem_score,problem_score,problem_score,problem_score
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
problem_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
easy,766.0,1.970888,0.433289,1.1,1.6,2.0,2.3,2.8
hard,1941.0,7.071149,1.049729,5.5,6.2,7.0,7.9,9.7
medium,1405.0,4.125836,0.774216,2.8,3.5,4.1,4.8,5.5


In [3]:
# Save a bar chart of the class balance to figures/class_distribution.png.
os.makedirs("figures", exist_ok=True)

fig, ax = plt.subplots()
class_counts = df['problem_class'].value_counts()
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Problem Class')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig('figures/class_distribution.png', dpi=300)
# Close the figure so it is written to disk only and not rendered inline.
plt.close(fig)

In [4]:
# Handling Missing Values

# If input/output descriptions are blank, fall back to the main description.
# "Blank" means NaN/None, the empty string, or a whitespace-only string, so
# that both columns are treated the same way. The replacement is done
# column-wise with a boolean mask instead of a row-by-row apply.
for _desc_col in ('input_description', 'output_description'):
    _is_blank = (
        df[_desc_col].isna()
        | df[_desc_col].astype(str).str.strip().eq('')
    )
    # Series.mask replaces the entries where the condition is True.
    df[_desc_col] = df[_desc_col].mask(_is_blank, df['description'])

def fill_input_desc(row):
    """Return the row's input description, or its main description if blank.

    Args:
        row: mapping-like object (e.g. a DataFrame row) exposing the keys
            'input_description' and 'description'.

    Returns:
        row['input_description'] when it is neither missing (per pd.isna)
        nor empty after stripping whitespace; otherwise row['description'].
    """
    current = row['input_description']
    # Missing values are checked first so str() is only applied to real values.
    if not pd.isna(current) and str(current).strip() != "":
        return current
    return row['description']

# Row-wise pass: replace missing or whitespace-only input descriptions with
# the row's main description (see fill_input_desc).
df['input_description'] = df.apply(fill_input_desc, axis=1)

# Text Cleaning Function

# LaTeX macro name -> plain-text replacement.
_LATEX_MACROS = {
    'leq': '<=', 'le': '<=', 'geq': '>=', 'ge': '>=',
    'lt': '<', 'gt': '>', 'neq': '!=', 'ne': '!=',
    'times': '*', 'dots': '...', 'ldots': '...', 'cdots': '...',
}

# A backslash, one of the macro names, and then NOT another letter. The
# lookahead stops short names from matching a prefix of a longer macro
# (e.g. `\le` inside `\left`, or `\ge` inside `\gets`).
_LATEX_MACRO_RE = re.compile(
    r'\\(' + '|'.join(sorted(_LATEX_MACROS, key=len, reverse=True)) + r')(?![A-Za-z])'
)

# A comma (with at most one whitespace character on each side) between digits.
_DIGIT_COMMA_RE = re.compile(r'(\d)\s?,\s?(\d)')


def clean_text(text):
    """Normalise a problem-statement string into plain, single-spaced text.

    Steps, in order:
      1. `$` math delimiters become spaces.
      2. Known LaTeX macros (\\le, \\leq, \\ge, \\geq, \\lt, \\gt, \\ne, \\neq,
         \\times, \\dots, \\ldots, \\cdots) become plain-text operators.
      3. Every remaining backslash and every newline becomes a space.
      4. Commas between digits are removed ("1,000,000" -> "1000000").
      5. Whitespace runs collapse to one space and the result is stripped.

    Args:
        text: value to clean; anything that is not a `str` is treated as
            missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.replace('$', ' ')

    # Translate whole macros only; unknown macros fall through to the
    # backslash removal below and keep their name as a plain word.
    text = _LATEX_MACRO_RE.sub(lambda m: _LATEX_MACROS[m.group(1)], text)
    text = text.replace('\\', ' ').replace('\n', ' ')

    # Removing commas inside numbers. Repeated until stable because each match
    # consumes the digit on both sides, so "1,000,000" needs two passes.
    # NOTE(review): this also merges short comma-separated lists such as
    # "1, 2, 3" into "123"; kept as-is to preserve existing features.
    while True:
        text, replaced = _DIGIT_COMMA_RE.subn(r'\1\2', text)
        if not replaced:
            break

    return re.sub(r'\s+', ' ', text).strip()

# Run each free-text column through clean_text. The first three get a new
# clean_* column; sample_io is overwritten in place.
# NOTE(review): clean_text returns "" for any non-string value, so if
# sample_io holds lists/dicts this blanks the column -- confirm its dtype.
for source_col, target_col in (
    ('input_description', 'clean_input'),
    ('description', 'clean_description'),
    ('output_description', 'clean_output'),
    ('sample_io', 'sample_io'),
):
    df[target_col] = df[source_col].apply(clean_text)

# Text fed to the model: raw title, cleaned input section, cleaned statement.
# Rest of the columns are too noisy so Omitted
title_and_input = df['title'] + " " + df['clean_input']
df['combined_text'] = title_and_input + " " + df['clean_description']

print("Dataset cleaned")


Dataset cleaned


In [5]:
# Feature Extraction

def extract_log_constraint(text):
    """Return the natural log of the largest "constraint-like" number in text.

    Candidates are every `base^exp` / `base^{exp}` power written with integer
    digits (evaluated as a float) and every standalone integer token. Only
    candidates strictly between 10 and 1e19 are considered.

    Args:
        text: string to scan; non-strings and empty strings yield 0.

    Returns:
        math.log of the largest in-range candidate, or 0 when there is none.
    """
    if not isinstance(text, str) or not text:
        return 0

    values = []

    for base, exp in re.findall(r'(\d+)\s*\^\s*\{?(\d+)\}?', text):
        try:
            val = float(base) ** float(exp)
        except OverflowError:
            # e.g. 10^400 does not fit in a float; it would be filtered out
            # by the range check below anyway, so just skip it.
            continue
        if val < 1e20:
            values.append(val)

    # float() of a digit-only token cannot fail (oversized ones become inf
    # and are dropped by the range filter), so no error handling is needed.
    values.extend(float(token) for token in re.findall(r'\b\d+\b', text))

    relevant_values = [v for v in values if 10 < v < 1e19]
    if not relevant_values:
        return 0

    return math.log(max(relevant_values))

# Numeric side features: log of the largest constraint found in the cleaned
# input section, and the character length of the combined text.
df['log_max_constraint'] = df['clean_input'].map(extract_log_constraint)

df['text_len'] = df['combined_text'].map(len)

# Keyword Features
KEYWORD_FEATURES = {
    'easy_signals': [
        'swap', 'reverse', 'palindrome', 'even', 'odd', 'sort', 'min', 'max'
    ],
    'medium_signals': [
        'dynamic programming', 'dp', 'dijkstra', 'bfs', 'dfs',
        'greedy', 'binary search', 'modulo', 'prime', 'xor'
    ],
    'hard_signals': [
        'segment tree', 'bitmask', 'flow', 'matching',
        'centroid', 'heavy light', 'convex hull', 'fft'
    ]
}

# Fixed category order: it defines the column order of the feature vector.
_KEYWORD_CATEGORY_ORDER = ('easy_signals', 'medium_signals', 'hard_signals')


def extract_keyword_features(text):
    """Return one 0/1 flag per keyword in KEYWORD_FEATURES.

    Flags are ordered easy, then medium, then hard signals, each in the order
    listed in KEYWORD_FEATURES. A flag is 1 when the keyword occurs as a
    case-insensitive substring of `text`.

    Args:
        text: string to scan. Non-string values (None, NaN, ...) are treated
            as containing no keywords instead of raising.

    Returns:
        A list of ints (0 or 1) with one entry per keyword.
    """
    keywords = [
        word
        for category in _KEYWORD_CATEGORY_ORDER
        for word in KEYWORD_FEATURES[category]
    ]
    if not isinstance(text, str):
        return [0] * len(keywords)

    lowered = text.lower()
    # NOTE(review): plain substring test, so 'dp' also fires on "endpoint"
    # and 'min' on "determine"; kept as-is to preserve existing features.
    return [1 if word in lowered else 0 for word in keywords]

# One row of 0/1 keyword flags per problem.
keyword_data = np.array(
    [extract_keyword_features(doc) for doc in df['combined_text']]
)

# TF-IDF Vectorization: unigrams and bigrams, English stop words removed,
# vocabulary capped at 2000 terms.
# NOTE(review): the vectorizer and the scaler below are fit on every row; if a
# held-out split is taken from X_final later, its statistics leak into
# these transforms -- confirm that is acceptable.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=2000,
)
X_text = tfidf.fit_transform(df['combined_text'])

# Standardise the two numeric side features.
numeric_columns = ['log_max_constraint', 'text_len']
scaler = StandardScaler()
X_numeric = scaler.fit_transform(df[numeric_columns])

# Combining: column order is TF-IDF terms, scaled numerics, keyword flags.
feature_parts = [X_text, X_numeric, csr_matrix(keyword_data)]
X_final = hstack(feature_parts)
y = df['problem_class']


In [6]:

# Write the feature matrix and both target columns to disk.
os.makedirs("preprocessed", exist_ok=True)
save_npz("preprocessed/X_final.npz", X_final)
df['problem_class'].to_csv("preprocessed/y.csv", index=False)
df['problem_score'].to_csv("preprocessed/y_score.csv", index=False)

# Create exactly the directory the dumps below write into. The directory name
# is defined once so the makedirs call and the dump paths cannot disagree in
# case (a mismatch breaks on case-sensitive filesystems).
PICKLE_DIR = "pickle"
os.makedirs(PICKLE_DIR, exist_ok=True)

# Fitted transformers, saved so the same preprocessing can be re-applied.
joblib.dump(tfidf, f"{PICKLE_DIR}/tfidf_vectorizer.pkl")
joblib.dump(scaler, f"{PICKLE_DIR}/numeric_scaler.pkl")


['pickle/numeric_scaler.pkl']