In [1]:
import pandas as pd

# Locations of the pre-split datasets, relative to the notebook's directory.
TRAIN_CSV = "../data/train_final.csv"
TEST_CSV = "../data/test_final.csv"

# Read each split into its own DataFrame.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Display (rows, columns) for both splits as a quick sanity check.
(df_train.shape, df_test.shape)


((9309, 6), (458, 6))

In [2]:
import re

def clean_text(text):
    """Normalize raw text into lowercase ASCII alphanumeric tokens.

    Lowercases the input, replaces every character other than ``a-z``,
    ``0-9`` and whitespace with a space, collapses each run of whitespace
    (newlines included) into a single space, and trims both ends.

    Args:
        text: The text to clean. ``None`` and float NaN (the value pandas
            uses for missing cells) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing values would otherwise fail on ``.lower()``. NaN is the only
    # float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Anything that is not a lowercase ASCII letter, digit or whitespace
    # becomes a space (this also drops accented / non-Latin characters).
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    # ``\s`` matches newlines too, so one pass flattens line breaks and
    # squeezes repeated spaces; no separate "\n" substitution is needed.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [3]:
# Fields that together make up one problem statement, in concatenation order
TEXT_COLUMNS = ["title", "description", "input_format", "output_format"]

# Apply the same preparation to both splits instead of spelling it out twice.
for frame in (df_train, df_test):
    # Missing values become empty strings so concatenation never yields NaN
    for col in TEXT_COLUMNS:
        frame[col] = frame[col].fillna("")

    # Join the fields left to right with a single space between each pair
    combined = frame[TEXT_COLUMNS[0]]
    for col in TEXT_COLUMNS[1:]:
        combined = combined + " " + frame[col]
    frame["combined_text"] = combined


In [4]:
# Run the same normalization over the combined text of each split and store
# the result next to it in a new "clean_text" column.
for frame in (df_train, df_test):
    frame["clean_text"] = frame["combined_text"].map(clean_text)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings gathered in one place:
#   max_features - upper bound on the vocabulary size
#   ngram_range  - use unigrams and bigrams
#   stop_words   - drop scikit-learn's built-in English stop-word list
tfidf_params = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}

vectorizer = TfidfVectorizer(**tfidf_params)


In [6]:
train_corpus = df_train["clean_text"]
test_corpus = df_test["clean_text"]

# The vocabulary and IDF weights are fitted on the training text only; the
# test text is merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(train_corpus)
X_test_tfidf = vectorizer.transform(test_corpus)


In [7]:
# (rows, features) of each TF-IDF matrix; the feature counts should match.
tuple(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf))


((9309, 5000), (458, 5000))

In [8]:
import joblib
import os

# Where the fitted vectorizer is written so it can be reloaded later.
vectorizer_path = "../models/tfidf_vectorizer.pkl"

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)
joblib.dump(vectorizer, vectorizer_path)


['../models/tfidf_vectorizer.pkl']

## Summary

Textual problem statements were converted into numerical representations using TF-IDF vectorization. Unigrams and bigrams were used to capture both individual keywords and short algorithmic phrases. English stop-word removal and a 5,000-term cap on the vocabulary were applied to reduce noise and dimensionality while preserving discriminative information.