In [1]:
from datasets import load_dataset

# Load the "default" configuration of the open-r1/codeforces dataset.
ds = load_dataset(path="open-r1/codeforces", name="default")


In [2]:
# Display the loaded dataset object; the recorded output lists a "train" and a
# "test" split together with their feature names and row counts.
ds


DatasetDict({
    train: Dataset({
        features: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'rating', 'tags', 'testset_size', 'official_tests', 'official_tests_complete', 'input_mode', 'generated_checker', 'executable', 'generated_tests'],
        num_rows: 9556
    })
    test: Dataset({
        features: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'rating', 'tags', 'testset_size', 'official_tests', 'official_tests_complete', 'input_mode', 'generated_checker', 'executable', 'generated_tests'],
        num_rows: 468
    })
})

In [3]:
import pandas as pd

# Convert each split to a pandas DataFrame (train first, then test).
df_train, df_test = (ds[split].to_pandas() for split in ("train", "test"))


In [4]:
# (rows, columns) of both frames, train first.
tuple(frame.shape for frame in (df_train, df_test))


((9556, 27), (468, 27))

In [5]:
# Count the missing values per column in each split.
missing_train, missing_test = (
    frame.isna().sum() for frame in (df_train, df_test)
)

# Show the complete per-column summary for the train split.
missing_train


id                            0
aliases                    7966
contest_id                    0
contest_name                  0
contest_type                  0
contest_start                 0
contest_start_year            0
index                         0
time_limit                   30
memory_limit                 30
title                         0
description                  43
input_format                158
output_format               267
interaction_format         9364
note                       2781
examples                    128
editorial                  3719
rating                      247
tags                          0
testset_size                  0
official_tests                0
official_tests_complete       0
input_mode                    0
generated_checker          7829
executable                    0
generated_tests               0
dtype: int64

In [6]:
# Keep only the train columns that actually contain missing values.
missing_train.loc[missing_train.gt(0)]


aliases               7966
time_limit              30
memory_limit            30
description             43
input_format           158
output_format          267
interaction_format    9364
note                  2781
examples               128
editorial             3719
rating                 247
generated_checker     7829
dtype: int64

In [7]:
# Keep only the test columns that actually contain missing values.
missing_test.loc[missing_test.gt(0)]


aliases               447
input_format            4
output_format          15
interaction_format    453
note                   73
examples                1
editorial             208
rating                 10
generated_checker     415
dtype: int64

In [8]:
# Discard the rows of both splits that have no rating value.
df_train, df_test = (
    frame.dropna(subset=["rating"]) for frame in (df_train, df_test)
)


In [9]:
# Text fields used downstream; a missing value in any of them becomes "".
text_columns = ["title", "description", "input_format", "output_format"]

# Fill all text columns of a frame in one vectorised assignment.
df_train[text_columns] = df_train[text_columns].fillna("")
df_test[text_columns] = df_test[text_columns].fillna("")


#### Missing value analysis showed that several metadata fields contained a large number of null values.
#### Since the project focuses only on textual difficulty estimation, these fields were ignored.
#### Rows with missing difficulty scores (rating) were removed, as they cannot be used for supervised learning.
#### Missing text fields were filled with empty strings to preserve maximum usable data while ensuring compatibility with NLP feature extraction methods.

In [10]:
# Remaining missing values in the text columns and the rating (train split).
df_train[[*text_columns, "rating"]].isna().sum()


title            0
description      0
input_format     0
output_format    0
rating           0
dtype: int64

In [11]:
# Remaining missing values in the text columns and the rating (test split).
df_test[[*text_columns, "rating"]].isna().sum()

title            0
description      0
input_format     0
output_format    0
rating           0
dtype: int64

In [12]:


# Join title, description, input format and output format (in that order)
# into one string per row, separated by single spaces.
df_train["combined_text"] = df_train["title"].str.cat(
    df_train[["description", "input_format", "output_format"]],
    sep=" ",
)


In [13]:
# Join title, description, input format and output format (in that order)
# into one string per row, separated by single spaces.
df_test["combined_text"] = df_test["title"].str.cat(
    df_test[["description", "input_format", "output_format"]],
    sep=" ",
)


In [14]:
# Preview the first two combined texts of the train split.
df_train["combined_text"].iloc[:2]


0    Digits John gave Jack a very hard problem. He ...
1    Neural Network country Due to the recent popul...
Name: combined_text, dtype: object

In [15]:
# Number of missing combined texts in the train split.
df_train["combined_text"].isna().sum()


np.int64(0)

In [16]:
def rating_to_difficulty(rating, easy_max=1200, medium_max=1800):
    """Map a numeric rating to a coarse difficulty label.

    Parameters
    ----------
    rating : int or float
        Rating to classify. ``None`` and NaN are treated as "rating unknown".
    easy_max : int or float, default 1200
        Exclusive upper bound of the "Easy" bucket.
    medium_max : int or float, default 1800
        Exclusive upper bound of the "Medium" bucket.

    Returns
    -------
    str or None
        "Easy" when ``rating < easy_max``, "Medium" when
        ``easy_max <= rating < medium_max``, "Hard" otherwise, and ``None``
        when the rating is missing.
    """
    # NaN is the only value that differs from itself. Without this guard both
    # comparisons below evaluate to False for NaN, so a missing rating would
    # fall through to the final branch and be labelled "Hard".
    if rating is None or rating != rating:
        return None
    if rating < easy_max:
        return "Easy"
    if rating < medium_max:
        return "Medium"
    return "Hard"


In [17]:
# Derive the categorical difficulty label from the rating, row by row.
df_train["difficulty"] = df_train["rating"].map(rating_to_difficulty)
df_test["difficulty"] = df_test["rating"].map(rating_to_difficulty)


In [18]:
# Class balance of the derived labels in the train split. In the recorded
# output "Hard" covers 4915 of 9309 rows, so the classes are imbalanced.
df_train["difficulty"].value_counts()


difficulty
Hard      4915
Medium    2525
Easy      1869
Name: count, dtype: int64

In [19]:
# Class balance of the derived labels in the test split. In the recorded
# output "Hard" covers 237 of 458 rows, roughly the same share as in train.
df_test["difficulty"].value_counts()

difficulty
Hard      237
Easy      116
Medium    105
Name: count, dtype: int64

In [20]:
import os

# Columns written to disk: the four text fields, the numeric rating and the
# derived difficulty label.
FINAL_COLUMNS = [
    "title", "description", "input_format", "output_format",
    "rating", "difficulty",
]

# Restrict each split to the exported columns.
df_train_final = df_train.loc[:, FINAL_COLUMNS]
df_test_final = df_test.loc[:, FINAL_COLUMNS]

# Create the output directory if needed; an existing one is not an error.
os.makedirs("../data", exist_ok=True)

# Write both final datasets; index=False keeps the row index out of the files.
df_train_final.to_csv("../data/train_final.csv", index=False)
df_test_final.to_csv("../data/test_final.csv", index=False)



In [21]:
import pandas as pd
# Read the exported train CSV back from disk and preview its first rows to
# confirm the file contains the expected columns.
pd.read_csv("../data/train_final.csv").head()


Unnamed: 0,title,description,input_format,output_format,rating,difficulty
0,Digits,John gave Jack a very hard problem. He wrote a...,First line contains a positive integer N (1 ≤ ...,"Output exactly three lines, the steps Jack nee...",2500.0,Hard
1,Neural Network country,Due to the recent popularity of the Deep learn...,The first line of input contains N (1 ≤ N ≤ 10...,"Output a single integer, the number of paths D...",2000.0,Hard
2,Property,Bill is a famous mathematician in BubbleLand. ...,The first line contains one integer number n (...,Output contains n distinct integers separated ...,2100.0,Hard
3,Exploration plan,The competitors of Bubble Cup X gathered after...,"The first line contains four integers: V, E, N...",Output a single integer that represents the mi...,2100.0,Hard
4,Casinos and travel,John has just bought a new car and is planning...,"In the first line, a positive integer N (1 ≤ N...","Output one number, the answer to the problem m...",2100.0,Hard
