In [None]:
# Install the third-party packages used below for HTTP requests and HTML parsing.
# NOTE(review): lxml is installed, but the visible code parses with 'html.parser' — confirm it is still needed.
!pip install requests beautifulsoup4 lxml



In [None]:
#import dependencies

import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

In [None]:
#import drive to mount files

from google.colab import drive
from google.colab import files

# Mount Google Drive under /content/drive (Colab only).
# NOTE(review): nothing in the visible cells reads from or writes to this mount — confirm it is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#setting CodeForces function - Official API

def fetch_codeforces_problems(session=None, timeout=30):
    """Download the Codeforces problem set and return it as a DataFrame.

    Calls the official ``problemset.problems`` API method and joins every
    problem with its solved count taken from the accompanying statistics list.

    Args:
        session: Optional object exposing a requests-compatible
            ``get(url, timeout=...)`` (for example a ``requests.Session``).
            When omitted, the ``requests`` module itself is used.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        pandas.DataFrame with one row per problem and the columns
        id, platform, problem_id, name, url, tags, rating, contest_id,
        solved_count. Entries lacking ``contestId`` or ``index`` are skipped,
        because neither the id nor the URL can be built for them.

    Raises:
        RuntimeError: If the API payload reports a status other than ``"OK"``.
        Exception: Whatever ``raise_for_status()`` raises on an HTTP error.
    """
    columns = [
        "id", "platform", "problem_id", "name", "url",
        "tags", "rating", "contest_id", "solved_count",
    ]

    http = requests if session is None else session
    url = "https://codeforces.com/api/problemset.problems"
    response = http.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # The API signals failures inside the JSON body, not only via the HTTP status.
    if data.get("status") != "OK":
        raise RuntimeError(
            f"Codeforces API request failed: {data.get('comment', 'no comment returned')}"
        )

    problems = data['result']['problems']
    stats = data['result']['problemStatistics']

    # Join statistics (solved count) with problems on (contestId, index).
    solved_by_key = {
        (s.get('contestId'), s.get('index')): s.get('solvedCount')
        for s in stats
    }

    formatted = []
    for p in problems:
        contest_id = p.get('contestId')
        index = p.get('index')
        if contest_id is None or index is None:
            # Without both parts there is no stable id or problem URL.
            continue

        pid = f"{contest_id}_{index}"
        formatted.append({
            "id": f"codeforces_{pid}",
            "platform": "Codeforces",
            "problem_id": index,
            "name": p.get("name"),
            "url": f"https://codeforces.com/problemset/problem/{contest_id}/{index}",
            "tags": p.get("tags"),
            "rating": p.get("rating"),
            "contest_id": contest_id,
            "solved_count": solved_by_key.get((contest_id, index)),
        })

    # Passing the column list keeps the schema stable even when nothing was returned.
    return pd.DataFrame(formatted, columns=columns)

# Fetch the Codeforces problem set (performs a network request) and preview the first rows.
df_codeforces = fetch_codeforces_problems()
df_codeforces.head()


Unnamed: 0,id,platform,problem_id,name,url,tags,rating,contest_id,solved_count
0,codeforces_2116_B,Codeforces,B,Gellyfish and Baby's Breath,https://codeforces.com/problemset/problem/2116/B,"[greedy, math, sortings]",,2116,11924
1,codeforces_2116_A,Codeforces,A,Gellyfish and Tricolor Pansy,https://codeforces.com/problemset/problem/2116/A,"[games, greedy]",,2116,19944
2,codeforces_2115_F2,Codeforces,F2,Gellyfish and Lycoris Radiata (Hard Version),https://codeforces.com/problemset/problem/2115/F2,[data structures],,2115,22
3,codeforces_2115_F1,Codeforces,F1,Gellyfish and Lycoris Radiata (Easy Version),https://codeforces.com/problemset/problem/2115/F1,[data structures],,2115,16
4,codeforces_2115_E,Codeforces,E,Gellyfish and Mayflower,https://codeforces.com/problemset/problem/2115/E,"[dp, graphs]",,2115,73


In [None]:
#setting AtCoder function - Community API

def fetch_atcoder_merged_problems(session=None, timeout=30):
    """Download AtCoder problem metadata plus difficulty estimates as a DataFrame.

    Two JSON resources from kenkoooo.com are combined: ``merged-problems.json``
    (basic metadata) and ``problem-models.json`` (difficulty estimates keyed by
    problem id).

    Args:
        session: Optional object exposing a requests-compatible
            ``get(url, timeout=...)`` (for example a ``requests.Session``).
            When omitted, the ``requests`` module itself is used.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        pandas.DataFrame with one row per problem and the columns
        id, platform, problem_id, name, url, tags, rating, contest_id,
        solved_count, point. ``rating`` is the difficulty estimate, or None
        when it is missing, negative, or above 5000. ``solved_count`` is 0
        when the source has no solver count.

    Raises:
        Exception: Whatever ``raise_for_status()`` raises on an HTTP error.
    """
    columns = [
        "id", "platform", "problem_id", "name", "url",
        "tags", "rating", "contest_id", "solved_count", "point",
    ]
    max_valid_difficulty = 5000

    http = requests if session is None else session

    def get_json(url):
        # Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
        response = http.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()

    # Load merged-problems.json (basic metadata)
    problems = get_json("https://kenkoooo.com/atcoder/resources/merged-problems.json")

    # Load problem-models.json (difficulty estimates)
    difficulty_data = get_json("https://kenkoooo.com/atcoder/resources/problem-models.json")

    formatted = []
    for p in problems:
        problem_id = p['id']
        difficulty = difficulty_data.get(problem_id, {}).get("difficulty")

        # Filter invalid difficulty values: negative or too high
        if difficulty is not None and not (0 <= difficulty <= max_valid_difficulty):
            difficulty = None

        solver_count = p.get("solver_count")

        formatted.append({
            "id": f"atcoder_{problem_id}",
            "platform": "AtCoder",
            "problem_id": problem_id,
            "name": p['title'],
            "url": f"https://atcoder.jp/contests/{p['contest_id']}/tasks/{problem_id}",
            "tags": [],
            "rating": difficulty,
            "contest_id": p.get("contest_id"),
            # NOTE(review): a missing solver count is stored as 0, which is
            # indistinguishable from "nobody solved it" — confirm this is intended.
            "solved_count": int(solver_count) if solver_count is not None else 0,
            "point": p.get("point"),  # Official contest-assigned point value
        })

    # Passing the column list keeps the schema stable even when nothing was returned.
    return pd.DataFrame(formatted, columns=columns)

# Fetch the AtCoder problem list (performs network requests) and preview the first rows.
df_atcoder = fetch_atcoder_merged_problems()
df_atcoder.head()

Unnamed: 0,id,platform,problem_id,name,url,tags,rating,contest_id,solved_count,point
0,atcoder_1202Contest_a,AtCoder,1202Contest_a,A. DEGwer's Doctoral Dissertation,https://atcoder.jp/contests/DEGwer2023/tasks/1...,[],,DEGwer2023,604,
1,atcoder_1202Contest_b,AtCoder,1202Contest_b,B. vs. DEGwer,https://atcoder.jp/contests/DEGwer2023/tasks/1...,[],,DEGwer2023,16,
2,atcoder_1202Contest_c,AtCoder,1202Contest_c,C. binarydigit,https://atcoder.jp/contests/DEGwer2023/tasks/1...,[],,DEGwer2023,52,
3,atcoder_1202Contest_d,AtCoder,1202Contest_d,D. Coincidence,https://atcoder.jp/contests/DEGwer2023/tasks/1...,[],,DEGwer2023,38,
4,atcoder_1202Contest_e,AtCoder,1202Contest_e,E. Half Palindromes,https://atcoder.jp/contests/DEGwer2023/tasks/1...,[],,DEGwer2023,47,


In [None]:
#setting Kattis function - using a scrapper

def _parse_kattis_difficulty(text):
    """Return the first whitespace-separated token of ``text`` as a float, or None if absent or non-numeric."""
    tokens = text.split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        # A non-numeric cell should not abort the whole scrape.
        return None


def fetch_kattis_problems(session=None, timeout=30):
    """Scrape the Kattis problem table and return it as a DataFrame.

    The page at ``https://russelldash332.github.io/kattis/`` is fetched and its
    first ``<table>`` is read row by row. Each row with at least three cells is
    interpreted as (name, problem id, difficulty).

    Args:
        session: Optional object exposing a requests-compatible
            ``get(url, timeout=...)`` (for example a ``requests.Session``).
            When omitted, the ``requests`` module itself is used.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        pandas.DataFrame with the columns id, platform, problem_id, name, url,
        difficulty, tags, contest_id, solved_count. ``difficulty`` is a float
        or None; ``tags`` is always an empty list, ``contest_id`` is always
        None and ``solved_count`` is always 0 because the code does not read
        those values from the page.

    Raises:
        ValueError: If the page contains no ``<table>`` element.
        Exception: Whatever ``raise_for_status()`` raises on an HTTP error.
    """
    columns = [
        "id", "platform", "problem_id", "name", "url",
        "difficulty", "tags", "contest_id", "solved_count",
    ]

    http = requests if session is None else session
    url = "https://russelldash332.github.io/kattis/"
    response = http.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}; the page layout may have changed.")

    # Skip the first row — presumably the header; verify against the page.
    rows = table.find_all('tr')[1:]

    problems = []
    for row in rows:
        cols = row.find_all('td')
        if len(cols) < 3:
            continue

        name = cols[0].text.strip()
        problem_id = cols[1].text.strip()

        problems.append({
            "id": f"kattis_{problem_id}",
            "platform": "Kattis",
            "problem_id": problem_id,
            "name": name,
            "url": f"https://open.kattis.com/problems/{problem_id}",
            "difficulty": _parse_kattis_difficulty(cols[2].text),
            "tags": [],
            "contest_id": None,
            # NOTE(review): 0 here means "not collected", not "nobody solved it".
            "solved_count": 0
        })

    # Passing the column list keeps the schema stable even when no rows were parsed.
    return pd.DataFrame(problems, columns=columns)

# Scrape the Kattis problem table (performs a network request) and preview the first rows.
df_kattis = fetch_kattis_problems()
df_kattis.head()


Unnamed: 0,id,platform,problem_id,name,url,difficulty,tags,contest_id,solved_count
0,kattis_10kindsofpeople,Kattis,10kindsofpeople,10 Kinds of People,https://open.kattis.com/problems/10kindsofpeople,6.3,[],,0
1,kattis_1dfroggereasy,Kattis,1dfroggereasy,1-D Frogger (Easy),https://open.kattis.com/problems/1dfroggereasy,2.9,[],,0
2,kattis_1dfroggerhard,Kattis,1dfroggerhard,1-D Frogger (Hard),https://open.kattis.com/problems/1dfroggerhard,8.3,[],,0
3,kattis_1sforall,Kattis,1sforall,1's For All,https://open.kattis.com/problems/1sforall,6.3,[],,0
4,kattis_2048,Kattis,2048,2048,https://open.kattis.com/problems/2048,3.0,[],,0


In [None]:
# Merge the three per-platform frames into one, renumbering rows from 0.
# Columns that only some platforms provide (e.g. Kattis 'difficulty', AtCoder 'point')
# are left missing for rows coming from the other platforms.

df_all = pd.concat([df_codeforces, df_atcoder, df_kattis], ignore_index=True)

df_all.head()


Unnamed: 0,id,platform,problem_id,name,url,tags,rating,contest_id,solved_count,difficulty
0,codeforces_2116_B,Codeforces,B,Gellyfish and Baby's Breath,https://codeforces.com/problemset/problem/2116/B,"[greedy, math, sortings]",,2116,11924,
1,codeforces_2116_A,Codeforces,A,Gellyfish and Tricolor Pansy,https://codeforces.com/problemset/problem/2116/A,"[games, greedy]",,2116,19944,
2,codeforces_2115_F2,Codeforces,F2,Gellyfish and Lycoris Radiata (Hard Version),https://codeforces.com/problemset/problem/2115/F2,[data structures],,2115,22,
3,codeforces_2115_F1,Codeforces,F1,Gellyfish and Lycoris Radiata (Easy Version),https://codeforces.com/problemset/problem/2115/F1,[data structures],,2115,16,
4,codeforces_2115_E,Codeforces,E,Gellyfish and Mayflower,https://codeforces.com/problemset/problem/2115/E,"[dp, graphs]",,2115,73,


In [None]:
# Write the merged (not yet normalized) dataset to a CSV file without the row index,
# then hand the file to the Colab download helper.
# NOTE(review): the list-valued 'tags' column is presumably written as its Python string
# representation — confirm downstream consumers can parse it.

df_all.to_csv("merged_problems_dataset.csv", index=False)
files.download("merged_problems_dataset.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#normalizing difficulty (kattis) and rating (codeforces, atcoder)

cf_min, cf_max = 800, 3500  # Target difficulty range

# Raw rating ranges for other platforms
atcoder_min, atcoder_max = 0, 4000
kattis_min, kattis_max = 1.0, 10.0  # Kattis difficulty is already parsed as float


def _rescale_to_cf(value, src_min, src_max):
    """Linearly map ``value`` from [src_min, src_max] onto [cf_min, cf_max], clamping out-of-range input."""
    position = (value - src_min) / (src_max - src_min)
    # Values outside the assumed source range (e.g. an AtCoder difficulty above
    # atcoder_max) would otherwise land outside the target range.
    position = min(max(position, 0.0), 1.0)
    return int(round(position * (cf_max - cf_min) + cf_min))


# Function to scale all platforms to Codeforces scale
def scale_to_codeforces(row):
    """Return a row's difficulty expressed on the Codeforces rating scale.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'platform'
            and, depending on the platform, 'rating' or 'difficulty'.

    Returns:
        int: For Codeforces the rounded 'rating' unchanged; for AtCoder the
        'rating' and for Kattis the 'difficulty', linearly rescaled into
        [cf_min, cf_max] and clamped to that range.
        None: If the platform is not one of the three or the value is missing.
    """
    platform = row['platform']

    if platform == 'Codeforces':
        rating = row['rating']
        if pd.notna(rating):
            return int(round(rating))  # Codeforces: use rating as-is
    elif platform == 'AtCoder':
        rating = row['rating']
        if pd.notna(rating):
            return _rescale_to_cf(rating, atcoder_min, atcoder_max)
    elif platform == 'Kattis':
        difficulty = row['difficulty']
        if pd.notna(difficulty):
            return _rescale_to_cf(difficulty, kattis_min, kattis_max)

    return None

# Apply the function to create a unified difficulty column.
# Rows without a usable value return None, which makes apply() produce floats
# (1700.0); casting to the nullable Int64 dtype keeps the ratings as integers
# while still allowing missing entries.
df_all['cf_scaled_difficulty'] = df_all.apply(scale_to_codeforces, axis=1).astype('Int64')

# Optional: check sample rows
df_all[['platform', 'rating', 'difficulty', 'cf_scaled_difficulty']].sample(10)


Unnamed: 0,platform,rating,difficulty,cf_scaled_difficulty
5958,Codeforces,1700.0,,1700.0
12033,AtCoder,3192.0,,2955.0
18303,Kattis,,3.8,1640.0
8120,Codeforces,1700.0,,1700.0
6901,Codeforces,2600.0,,2600.0
8736,Codeforces,,,
10783,AtCoder,,,
14942,AtCoder,,,
11947,AtCoder,,,
18184,Kattis,,5.4,2120.0


In [None]:
# Write the dataset, now including the 'cf_scaled_difficulty' column, to a second CSV file
# without the row index, then hand the file to the Colab download helper.

df_all.to_csv("merged_problems_dataset_norm.csv", index=False)
files.download("merged_problems_dataset_norm.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>