In [3]:
import re

import joblib
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Path syntax for data retrieval (/, \, ../) varies by operating system and version; make sure you use the correct one.

In [5]:
# Load the preprocessed dataset and show the record at position 2 as a sanity check.
preprocessed_csv = 'data/preprocessed_data.csv'
df = pd.read_csv(preprocessed_csv)
df.iloc[2]

Unnamed: 0,2
title,Mario or Luigi
description,Mario and Luigi are playing a game where they ...
input_description,
output_description,
sample_io,"[{'input': '', 'output': ''}]"
problem_class,hard
problem_score,9.6
full_text,Mario or Luigi Mario and Luigi are playing a g...
clean_text,mario or luigi mario and luigi are playing a g...


In [6]:
# Size feature: number of characters in the cleaned text.
df["text_length"] = df["clean_text"].map(len)


In [7]:
# Characters treated as math symbols when scanning the raw problem text.
math_symbols = "+-*/=%<>"


def count_math_symbols(text):
    """Return how many characters of `text` are one of the math symbols."""
    return len([ch for ch in text if ch in math_symbols])


df["math_symbol_count"] = df["full_text"].apply(count_math_symbols)


In [8]:
# Keywords whose occurrence counts become `kw_*` feature columns.
keywords = [
    "dp",
    "dynamic programming",
    "graph",
    "tree",
    "recursion",
    "greedy",
    "binary",
    "dfs",
    "bfs",
]

# One column per keyword (spaces in the keyword become underscores in the column
# name). `str.count` counts non-overlapping substring occurrences, so a keyword is
# also counted when it appears inside a longer word.
for keyword in keywords:
    feature_name = "kw_" + keyword.replace(" ", "_")
    df[feature_name] = df["clean_text"].map(
        lambda text, needle=keyword: text.count(needle)
    )


In [9]:

def count_digits(text):
    """Return the number of digit characters in `text`."""
    return len([ch for ch in text if ch.isdigit()])


# Lines in the raw text: newline characters plus one.
df["line_count"] = df["full_text"].map(lambda text: 1 + text.count("\n"))

# Digit characters in the raw text.
df["digit_count"] = df["full_text"].map(count_digits)

# Whitespace-separated tokens in the cleaned text.
df["word_count"] = df["clean_text"].map(lambda text: len(text.split()))


In [10]:
import re

# Matches powers of ten written with a caret, e.g. "10^5".
large_number_re = re.compile(r"\b10\^\d+\b")

# Number of such power-of-ten expressions in the raw text.
df["large_number_count"] = df["full_text"].map(
    lambda text: sum(1 for _ in large_number_re.finditer(text))
)


In [13]:
# Matches a capital "O", optional whitespace, then an opening parenthesis, e.g. "O(".
big_o_re = re.compile(r"O\s*\(")

# Number of such matches in the raw text.
df["big_o_count"] = df["full_text"].map(
    lambda text: sum(1 for _ in big_o_re.finditer(text))
)


In [11]:
# Digits per word; the +1 in the denominator avoids dividing by zero for empty texts.
df["constraint_density"] = df["digit_count"].div(df["word_count"].add(1))


In [12]:
# Operator characters considered when measuring variety (includes "^").
operators = "+-*/=%<>^"
operator_set = frozenset(operators)

# Number of distinct operator characters that occur at least once in the raw text.
df["operator_diversity"] = df["full_text"].map(
    lambda text: len(operator_set.intersection(text))
)


In [14]:
# Algorithm families and the keywords that signal each of them.
algo_keywords = {
    "dp": ["dp", "dynamic programming"],
    "graph": ["graph", "dfs", "bfs"],
    "tree": ["tree", "binary tree"],
    "greedy": ["greedy"],
    "math": ["mod", "gcd", "lcm", "prime"],
    "bit": ["bit", "xor", "and", "or"],
    "string": ["string", "substring", "palindrome"]
}


def _keyword_pattern(words):
    """Compile a regex that matches any of `words` as a whole word.

    A plain substring test (`w in text`) fires on unrelated words: "or" inside
    "for", "mod" inside "model", "bit" inside "arbitrary". Word boundaries
    prevent that; the optional trailing "s" keeps simple plurals such as
    "trees" or "graphs" matching.
    """
    alternatives = "|".join(re.escape(word) for word in words)
    return re.compile(r"\b(?:" + alternatives + r")s?\b")


# One binary `algo_<family>` column per family: 1 if any keyword occurs, else 0.
# NOTE(review): "and" / "or" are also ordinary English words, so `algo_bit` will
# still be 1 for most texts even with whole-word matching - consider replacing
# them with more specific terms.
# NOTE(review): if these features are recomputed elsewhere (e.g. at inference
# time), the same matching rule should be used there - confirm.
for algo, words in algo_keywords.items():
    pattern = _keyword_pattern(words)
    df[f"algo_{algo}"] = df["clean_text"].apply(
        lambda text, pattern=pattern: int(pattern.search(text) is not None)
    )


In [15]:
# An input description with more than this many space characters is flagged.
space_threshold = 50

# 1 when the input description (coerced to str) exceeds the threshold, else 0.
df["multi_input"] = df["input_description"].map(
    lambda desc: 1 if str(desc).count(" ") > space_threshold else 0
)


In [16]:
# Substrings whose presence marks a text as array/matrix related.
array_markers = ("array", "matrix", "[")

# 1 if the cleaned text contains any marker, else 0.
df["array_matrix_flag"] = df["clean_text"].map(
    lambda text: int(any(marker in text for marker in array_markers))
)


In [17]:
# Whole-word matcher for the control-flow words "if", "else" and "while".
# Plain `str.count("if")` also counts "if" inside words such as "different" or
# "specific", which inflates the feature; word boundaries restrict it to real
# occurrences of the words.
conditional_word_re = re.compile(r"\b(?:if|else|while)\b")

# Total number of whole-word "if" / "else" / "while" occurrences in the cleaned text.
# NOTE(review): if this feature is recomputed elsewhere (e.g. at inference time),
# the same matching rule should be used there - confirm.
df["conditional_count"] = df["clean_text"].apply(
    lambda text: len(conditional_word_re.findall(text))
)


In [18]:
def unique_word_ratio(text):
    """Return distinct tokens divided by (token count + 1) for `text`.

    Tokens are whitespace-separated; the +1 keeps the ratio defined for empty text.
    """
    tokens = text.split()
    return len(set(tokens)) / (len(tokens) + 1)


df["unique_word_ratio"] = df["clean_text"].apply(unique_word_ratio)


In [20]:
# Display the record at position 2 again, now that the engineered feature columns
# have been added to `df`.
df.iloc[2]


Unnamed: 0,2
title,Mario or Luigi
description,Mario and Luigi are playing a game where they ...
input_description,
output_description,
sample_io,"[{'input': '', 'output': ''}]"
problem_class,hard
problem_score,9.6
full_text,Mario or Luigi Mario and Luigi are playing a g...
clean_text,mario or luigi mario and luigi are playing a g...
text_length,1139


In [21]:
# Prediction targets: the class label column and the numeric score column.
y_class, y_score = df["problem_class"], df["problem_score"]


In [22]:
# TF-IDF over unigrams and bigrams with English stop words removed; terms must
# appear in at least 2 documents and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000,
)


In [23]:
# Fit the vectorizer on the cleaned text and vectorize the same text in one pass.
clean_corpus = df["clean_text"]
X_tfidf = tfidf.fit_transform(clean_corpus)
X_tfidf.shape


(4112, 5000)

In [25]:
# Hand-crafted numeric features, listed in the order they enter the feature matrix.
base_feature_cols = [
    "text_length",
    "word_count",
    "digit_count",
    "line_count",
    "math_symbol_count",
    "constraint_density",
    "operator_diversity",
    "big_o_count",
    "large_number_count",
    "unique_word_ratio",
    "conditional_count",
    "multi_input",
    "array_matrix_flag",
]

# Keyword-count and algorithm-flag columns are picked up by prefix, in frame order.
kw_feature_cols = [name for name in df.columns if name.startswith("kw_")]
algo_feature_cols = [name for name in df.columns if name.startswith("algo_")]

feature_cols = base_feature_cols + kw_feature_cols + algo_feature_cols

X_extra = df.loc[:, feature_cols].values
X_extra.shape


(4112, 29)

In [27]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [29]:
# Fit a standard scaler on the hand-crafted features, then apply it to the same
# matrix to obtain the standardized version.
scaler = StandardScaler()
scaler.fit(X_extra)
X_extra_scaled = scaler.transform(X_extra)


In [30]:
# Combine the sparse TF-IDF matrix with the *scaled* hand-crafted features.
# The unscaled `X_extra` was stacked here before, which left `X_extra_scaled`
# unused and made the saved matrix inconsistent with the fitted `scaler` that is
# persisted alongside it. Unscaled columns such as `text_length` would also dwarf
# the TF-IDF values, which lie in [0, 1].
X = hstack([X_tfidf, X_extra_scaled])
X.shape


(4112, 5029)

In [31]:
# Persist the fitted TF-IDF vectorizer so the same vocabulary can be reused later.
joblib.dump(value=tfidf, filename="models/tfidf_vectorizer.pkl")

['models/tfidf_vectorizer.pkl']

In [33]:
# Persist the fitted scaler for the hand-crafted numeric features.
joblib.dump(value=scaler, filename="models/numeric_scaler.pkl")

['models/numeric_scaler.pkl']

In [34]:
# Persist the combined feature matrix `X`.
joblib.dump(value=X, filename="data/inputs.pkl")


['data/inputs.pkl']

In [35]:
# Persist both target series next to the feature matrix.
joblib.dump(value=y_class, filename="data/y_class.pkl")
joblib.dump(value=y_score, filename="data/y_score.pkl")

['data/y_score.pkl']