In [2]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset


## Source Code
There are some datasets that provide me source code of the solution, but statement aren't provide.

#### Codeforces 1
- Read JSONL file, extract file with C++ compiler.
- Format in .csv with "writer" "problem_id" "submission_id "site_url" "source" label.


In [None]:
CLEAN_RECORD = "./Humans/Cleaned/codeforces1.csv"
RAW_JSONL = "./Humans/Raw/Codeforces1.jsonl"

df = df = pd.read_json(RAW_JSONL, lines=True)
df

Unnamed: 0,submission_id,language,source
0,12746065,GNU C++11,/*\n******************************************...
1,12746876,GNU C++,#include<iostream>\n\nusing namespace std;\n\n...
2,12747297,MS C++,#include <iostream>\n#include <sstream>\n#incl...
3,12747301,GNU C++11,#include<stdio.h>\n#include<algorithm>\n#inclu...
4,12747302,GNU C++11,#include <bits/stdc++.h>\nusing namespace std;...
...,...,...,...
1262905,37179780,MS C++,#include <iostream>\n#include<vector>\n#includ...
1262906,37179856,MS C++,#include<iostream>\n#include<vector>\nusing na...
1262907,37182624,MS C++,"#include""iostream""\n#include""algorithm""\n#incl..."
1262908,37182648,MS C++,"#include""iostream""\n#include""algorithm""\n#incl..."


In [16]:
df['website'] = 'https://codeforces.com'
df

Unnamed: 0,submission_id,language,source,website
0,12746065,GNU C++11,/*\n******************************************...,https://codeforces.com
1,12746876,GNU C++,#include<iostream>\n\nusing namespace std;\n\n...,https://codeforces.com
2,12747297,MS C++,#include <iostream>\n#include <sstream>\n#incl...,https://codeforces.com
3,12747301,GNU C++11,#include<stdio.h>\n#include<algorithm>\n#inclu...,https://codeforces.com
4,12747302,GNU C++11,#include <bits/stdc++.h>\nusing namespace std;...,https://codeforces.com
...,...,...,...,...
1262905,37179780,MS C++,#include <iostream>\n#include<vector>\n#includ...,https://codeforces.com
1262906,37179856,MS C++,#include<iostream>\n#include<vector>\nusing na...,https://codeforces.com
1262907,37182624,MS C++,"#include""iostream""\n#include""algorithm""\n#incl...",https://codeforces.com
1262908,37182648,MS C++,"#include""iostream""\n#include""algorithm""\n#incl...",https://codeforces.com


In [20]:
num_elements = df['language'].value_counts()
num_elements

language
GNU C++14                451022
GNU C++11                329281
GNU C++                  242294
Java 8                    63483
MS C++                    34856
Python 3                  30827
GNU C++17                 27384
GNU C                     26880
Python 2                   9346
FPC                        8156
GNU C11                    5925
Java 7                     5564
GNU C++17 Diagnostics      4420
MS C#                      4195
PyPy 3                     2947
PyPy 2                     2298
Mono C#                    2081
PascalABC.NET              2075
JavaScript                 1670
Delphi                     1177
Go                         1124
Haskell                     983
Scala                       975
Ruby                        972
PHP                         798
Kotlin                      577
Rust                        461
D                           429
Perl                        348
Clang++17 Diagnostics       208
Ocaml                       154

In [None]:
# I will extract GNU C++14, GNU C++11, GNU C++, MS C++ and GNU C++17
ndf = df[df['language'].str.contains('C++', case=False, na=False)]
ndf['language'].value_counts()

In [None]:
match_compilers = ["GNU C++14", "GNU C++11", "GNU C++", "MS C++", "GNU C++17"]
filtered_df = df[df['language'].isin(match_compilers)]
filtered_df['language'].value_counts(), filtered_df['language'].count()

# 1,084,837 source code left

(language
 GNU C++14    451022
 GNU C++11    329281
 GNU C++      242294
 MS C++        34856
 GNU C++17     27384
 Name: count, dtype: int64,
 np.int64(1084837))

In [32]:
filtered_df['writer'] = 'human'
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['writer'] = 'human'


Unnamed: 0,submission_id,language,source,website,writer
0,12746065,GNU C++11,/*\n******************************************...,https://codeforces.com,human
1,12746876,GNU C++,#include<iostream>\n\nusing namespace std;\n\n...,https://codeforces.com,human
2,12747297,MS C++,#include <iostream>\n#include <sstream>\n#incl...,https://codeforces.com,human
3,12747301,GNU C++11,#include<stdio.h>\n#include<algorithm>\n#inclu...,https://codeforces.com,human
4,12747302,GNU C++11,#include <bits/stdc++.h>\nusing namespace std;...,https://codeforces.com,human
...,...,...,...,...,...
1262905,37179780,MS C++,#include <iostream>\n#include<vector>\n#includ...,https://codeforces.com,human
1262906,37179856,MS C++,#include<iostream>\n#include<vector>\nusing na...,https://codeforces.com,human
1262907,37182624,MS C++,"#include""iostream""\n#include""algorithm""\n#incl...",https://codeforces.com,human
1262908,37182648,MS C++,"#include""iostream""\n#include""algorithm""\n#incl...",https://codeforces.com,human


In [87]:
# Language is obviously C++

filtered_df['problem_id']=np.nan
filtered_df = filtered_df[['writer','problem_id','submission_id','website','source']]
filtered_df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,human,,12746065,https://codeforces.com,/*\n******************************************...
1,human,,12746876,https://codeforces.com,#include<iostream>\n\nusing namespace std;\n\n...
2,human,,12747297,https://codeforces.com,#include <iostream>\n#include <sstream>\n#incl...
3,human,,12747301,https://codeforces.com,#include<stdio.h>\n#include<algorithm>\n#inclu...
4,human,,12747302,https://codeforces.com,#include <bits/stdc++.h>\nusing namespace std;...
...,...,...,...,...,...
1262905,human,,37179780,https://codeforces.com,#include <iostream>\n#include<vector>\n#includ...
1262906,human,,37179856,https://codeforces.com,#include<iostream>\n#include<vector>\nusing na...
1262907,human,,37182624,https://codeforces.com,"#include""iostream""\n#include""algorithm""\n#incl..."
1262908,human,,37182648,https://codeforces.com,"#include""iostream""\n#include""algorithm""\n#incl..."


In [88]:
CLEAN_RECORD = "./Humans/Cleaned/codeforces1.csv"

filtered_df.to_csv(CLEAN_RECORD,index=False)

In [89]:
read_df = pd.read_csv(CLEAN_RECORD)
read_df


Unnamed: 0,writer,problem_id,submission_id,website,source
0,human,,12746065,https://codeforces.com,/*\n******************************************...
1,human,,12746876,https://codeforces.com,#include<iostream>\n\nusing namespace std;\n\n...
2,human,,12747297,https://codeforces.com,#include <iostream>\n#include <sstream>\n#incl...
3,human,,12747301,https://codeforces.com,#include<stdio.h>\n#include<algorithm>\n#inclu...
4,human,,12747302,https://codeforces.com,#include <bits/stdc++.h>\nusing namespace std;...
...,...,...,...,...,...
1084832,human,,37179780,https://codeforces.com,#include <iostream>\n#include<vector>\n#includ...
1084833,human,,37179856,https://codeforces.com,#include<iostream>\n#include<vector>\nusing na...
1084834,human,,37182624,https://codeforces.com,"#include""iostream""\n#include""algorithm""\n#incl..."
1084835,human,,37182648,https://codeforces.com,"#include""iostream""\n#include""algorithm""\n#incl..."


In [57]:
read_df['source'].iloc[0]

'/*\n***************************************************************************************************************\n\n                            Author : Yash Sadhwani\n\n**************************************************************************************************************\n*/\n#include<stdio.h>\n#include<iostream>\n#include<vector>\n#include<string.h>\n#include<algorithm>\n#include<deque>\n#include<map>\n#include<set>\n#include<stdlib.h>\n#include<math.h>\n#include<queue>\n#include<stack>\n#include<functional>\nusing namespace std;\n#define ll long long\n#define si(x) scanf("%d",&x)\n#define sl(x) scanf("%lld",&x)\n#define sd(x) scanf("%lf",&x)\n#define sc(x) scanf("%c",&x)\n#define ss(x) scanf("%s",x)\n#define vl vector<ll>\n#define vi vector<int>\n#define vvl vector< vl >\n#define vvi vector< vi >\n#define pb push_back\n#define mod 1000000007\n#define mem(x,y) memset(x,y,sizeof(x))\n#define f(i,a,b) for(int i=(a);i<(b);i++)\n#define max_int_value 2147483647\n#define max_l

#### Codeforces 2
- Need to remove all non-.cpp file.
- Transform .cpp to jsonl format
- Format in .csv with "writer" "problem_id" "submission_id "site_url" "source" label.

In [84]:
CLEAN_RECORD  = "./Humans/Cleaned/codeforces2.csv"
RAW_DIRECTORY = "./Humans/Raw/Codeforces2/"

paths = []
for top_dir in os.listdir(RAW_DIRECTORY):
    top_path = RAW_DIRECTORY+top_dir+"/"
    for sub_dir in os.listdir(top_path):
        sub_path = top_path+sub_dir+"/"
        for file in os.listdir(sub_path):
            file_path = sub_path+file
            if file.endswith('.cpp'):
                # Extract the problem_id from the directory structure
                problem_id = f"{top_dir}{sub_dir}"
                submission_id = file.split('.')[0]
                paths.append((file_path,problem_id,submission_id))
            else:
                os.remove(file_path) # Remove non-cpp
paths

[('./Humans/Raw/Codeforces2/1141/A/193911650.cpp', '1141A', '193911650'),
 ('./Humans/Raw/Codeforces2/1141/A/194030944.cpp', '1141A', '194030944'),
 ('./Humans/Raw/Codeforces2/1141/A/194111406.cpp', '1141A', '194111406'),
 ('./Humans/Raw/Codeforces2/1141/A/194130677.cpp', '1141A', '194130677'),
 ('./Humans/Raw/Codeforces2/1141/A/194132874.cpp', '1141A', '194132874'),
 ('./Humans/Raw/Codeforces2/1141/A/194148897.cpp', '1141A', '194148897'),
 ('./Humans/Raw/Codeforces2/1141/A/194175296.cpp', '1141A', '194175296'),
 ('./Humans/Raw/Codeforces2/1141/A/194185157.cpp', '1141A', '194185157'),
 ('./Humans/Raw/Codeforces2/1141/A/194222048.cpp', '1141A', '194222048'),
 ('./Humans/Raw/Codeforces2/1141/A/194242967.cpp', '1141A', '194242967'),
 ('./Humans/Raw/Codeforces2/1141/A/194243539.cpp', '1141A', '194243539'),
 ('./Humans/Raw/Codeforces2/1141/A/194265318.cpp', '1141A', '194265318'),
 ('./Humans/Raw/Codeforces2/1141/A/194274237.cpp', '1141A', '194274237'),
 ('./Humans/Raw/Codeforces2/1141/A/194

In [86]:
df = pd.DataFrame()

for file_path, problem_id, submission_id in paths:
    with open(file_path,'r',encoding='utf-8') as file:
        source_code = file.read()
        json_line = json.dumps({"problem_id": problem_id, "submission_id": submission_id, "source": source_code})
        row = json.loads(json_line)
        df = pd.concat([df,pd.DataFrame([row])],ignore_index=True)
        
df

Unnamed: 0,problem_id,submission_id,source
0,1141A,193911650,#include <bits/stdc++.h>\n\nusing namespace st...
1,1141A,194030944,#include <bits/stdc++.h>\ntypedef long long ll...
2,1141A,194111406,#include<bits/stdc++.h>\n\n\n\n//My codes-----...
3,1141A,194130677,#include <bits/stdc++.h>\n\nusing namespace st...
4,1141A,194132874,#include<bits/stdc++.h>\n\nusing namespace std...
...,...,...,...
886,1293D,189523135,#include <queue>\n\n#include <cmath>\n\n#inclu...
887,1293D,189598223,#include <bits/stdc++.h>\n\n\n\nusing namespac...
888,1293D,192461798,#include <stdio.h>\n#define LL long long\nLL x...
889,1293D,192792340,#include<bits/stdc++.h>\n\nusing namespace std...


In [93]:
df['writer'] = 'human'
df['website'] = 'https://codeforces.com'
df = df[['writer','problem_id','submission_id','website','source']]
df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,human,1141A,193911650,https://codeforces.com,#include <bits/stdc++.h>\n\nusing namespace st...
1,human,1141A,194030944,https://codeforces.com,#include <bits/stdc++.h>\ntypedef long long ll...
2,human,1141A,194111406,https://codeforces.com,#include<bits/stdc++.h>\n\n\n\n//My codes-----...
3,human,1141A,194130677,https://codeforces.com,#include <bits/stdc++.h>\n\nusing namespace st...
4,human,1141A,194132874,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
...,...,...,...,...,...
886,human,1293D,189523135,https://codeforces.com,#include <queue>\n\n#include <cmath>\n\n#inclu...
887,human,1293D,189598223,https://codeforces.com,#include <bits/stdc++.h>\n\n\n\nusing namespac...
888,human,1293D,192461798,https://codeforces.com,#include <stdio.h>\n#define LL long long\nLL x...
889,human,1293D,192792340,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...


In [None]:
CLEAN_RECORD  = "./Humans/Cleaned/codeforces2.csv"
df.to_csv(CLEAN_RECORD,index=False)

In [95]:
read_df = pd.read_csv(CLEAN_RECORD)
read_df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,human,1141A,193911650,https://codeforces.com,#include <bits/stdc++.h>\n\nusing namespace st...
1,human,1141A,194030944,https://codeforces.com,#include <bits/stdc++.h>\ntypedef long long ll...
2,human,1141A,194111406,https://codeforces.com,#include<bits/stdc++.h>\n\n\n\n//My codes-----...
3,human,1141A,194130677,https://codeforces.com,#include <bits/stdc++.h>\n\nusing namespace st...
4,human,1141A,194132874,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
...,...,...,...,...,...
886,human,1293D,189523135,https://codeforces.com,#include <queue>\n\n#include <cmath>\n\n#inclu...
887,human,1293D,189598223,https://codeforces.com,#include <bits/stdc++.h>\n\n\n\nusing namespac...
888,human,1293D,192461798,https://codeforces.com,#include <stdio.h>\n#define LL long long\nLL x...
889,human,1293D,192792340,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...


### IOI Official Solutions
- Official Solutions from IOI

This dataset contains official solutions prepared by the organizers of each IOI contest. See the IOI dataset for the full list of problems, subtasks and statements; and the IOI Test Cases dataset for the full list of test cases.

In [17]:
CLEAN_RECORD = "./Humans/Cleaned/IOI-1.csv"

from datasets import load_dataset
ds = load_dataset("open-r1/ioi-sample-solutions")

README.md:   0%|          | 0.00/31.9k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/806k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5225 [00:00<?, ? examples/s]

In [18]:
df = ds["train"].to_pandas()
df

Unnamed: 0,year,day,problem_name,problem_id,subtask,label,code,status,subtask_score,subtask_points,subtask_max_points,test_case_scores,test_case_status,test_case_feedbacks
0,2020,day1,Carnival Tickets,tickets,00-samples,wy-subtask1.cpp,"#include ""tickets.h""\n#include <bits/stdc++.h>...",RE,0.0,0.0,0,"[0.0, 0.0]","[RE, SKIPPED]",[Wrong Answer\nThere is multiple tickets of co...
1,2020,day1,Carnival Tickets,tickets,01-calc-median,wy-subtask1.cpp,"#include ""tickets.h""\n#include <bits/stdc++.h>...",AC,1.0,11.0,11,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[AC, AC, AC, AC, AC, AC]","[Correct\n\n, Correct\n\n, Correct\n\n, Correc..."
2,2020,day1,Carnival Tickets,tickets,02-one-day-sort,wy-subtask1.cpp,"#include ""tickets.h""\n#include <bits/stdc++.h>...",RE,0.0,0.0,16,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[RE, SKIPPED, SKIPPED, SKIPPED, SKIPPED, SKIPPED]",[Wrong Answer\nThere is multiple tickets of co...
3,2020,day1,Carnival Tickets,tickets,03-two-locations,wy-subtask1.cpp,"#include ""tickets.h""\n#include <bits/stdc++.h>...",RE,0.0,0.0,14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[RE, SKIPPED, SKIPPED, SKIPPED, SKIPPED, SKIPP...",[Wrong Answer\nThere is multiple tickets of co...
4,2020,day1,Carnival Tickets,tickets,04-all-sent,wy-subtask1.cpp,"#include ""tickets.h""\n#include <bits/stdc++.h>...",RE,0.0,0.0,14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[RE, SKIPPED, SKIPPED, SKIPPED, SKIPPED, SKIPP...",[Wrong Answer\nThere is multiple tickets of co...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5220,2024,day2,Hieroglyphs,hieroglyphs,02-cnt3,correct/yiping-full.cpp,"#include<bits/stdc++.h>\n#include""hieroglyphs....",AC,1.0,15.0,15,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, A...","[Output is correct\n\n, Output is correct\n\n,..."
5221,2024,day2,Hieroglyphs,hieroglyphs,03-bin,correct/yiping-full.cpp,"#include<bits/stdc++.h>\n#include""hieroglyphs....",AC,1.0,10.0,10,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, A...","[Output is correct\n\n, Output is correct\n\n,..."
5222,2024,day2,Hieroglyphs,hieroglyphs,04-hasucs,correct/yiping-full.cpp,"#include<bits/stdc++.h>\n#include""hieroglyphs....",AC,1.0,16.0,16,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, A...","[Output is correct\n\n, Output is correct\n\n,..."
5223,2024,day2,Hieroglyphs,hieroglyphs,05-n2,correct/yiping-full.cpp,"#include<bits/stdc++.h>\n#include""hieroglyphs....",AC,1.0,14.0,14,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, AC, A...","[Output is correct\n\n, Output is correct\n\n,..."


In [21]:
filtered_df = pd.DataFrame()

# Assign new values
filtered_df['problem_id'] = df['year'].astype(str)+"_"+ df['subtask']
filtered_df['submission_id'] = np.nan
filtered_df['website'] = 'https://ioinformatics.org/'
filtered_df['source'] = df['code']
filtered_df['writer'] = 'Human'
filtered_df = filtered_df[['writer', 'problem_id', 'submission_id', 'website', 'source']]
filtered_df.dropna(subset=['source'])
filtered_df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,Human,2020_00-samples,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <bits/stdc++.h>..."
1,Human,2020_01-calc-median,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <bits/stdc++.h>..."
2,Human,2020_02-one-day-sort,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <bits/stdc++.h>..."
3,Human,2020_03-two-locations,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <bits/stdc++.h>..."
4,Human,2020_04-all-sent,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <bits/stdc++.h>..."
...,...,...,...,...,...
5220,Human,2024_02-cnt3,,https://ioinformatics.org/,"#include<bits/stdc++.h>\n#include""hieroglyphs...."
5221,Human,2024_03-bin,,https://ioinformatics.org/,"#include<bits/stdc++.h>\n#include""hieroglyphs...."
5222,Human,2024_04-hasucs,,https://ioinformatics.org/,"#include<bits/stdc++.h>\n#include""hieroglyphs...."
5223,Human,2024_05-n2,,https://ioinformatics.org/,"#include<bits/stdc++.h>\n#include""hieroglyphs...."


In [27]:
# Drop Duplicate and None
filtered_df = filtered_df.dropna(subset=['source']).drop_duplicates(subset=['source'])
filtered_df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,Human,2020_00-samples,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <bits/stdc++.h>..."
8,Human,2020_00-samples,,https://ioinformatics.org/,"#include ""biscuits.h""\n#include <vector>\nusin..."
14,Human,2020_00-samples,,https://ioinformatics.org/,"#include ""stations.h""\n#define MAXN 1000\n\nst..."
20,Human,2020_00-samples,,https://ioinformatics.org/,"#include ""biscuits.h""\n#include <vector>\n#inc..."
26,Human,2020_00-samples,,https://ioinformatics.org/,"#include ""tickets.h""\n#include <vector>\n\nlon..."
...,...,...,...,...,...
5190,Human,2024_00-samples,,https://ioinformatics.org/,"#include ""hieroglyphs.h""\n#include<bits/stdc++..."
5197,Human,2024_00-samples,,https://ioinformatics.org/,"#include ""hieroglyphs.h""\n#include<bits/stdc++..."
5204,Human,2024_00-samples,,https://ioinformatics.org/,"#include<bits/stdc++.h>\n#include ""hieroglyphs..."
5211,Human,2024_00-samples,,https://ioinformatics.org/,"#include<bits/stdc++.h>\n#include ""hieroglyphs..."


In [30]:
CLEAN_RECORD = "./Humans/Cleaned/IOI-1.csv"

filtered_df.to_csv(CLEAN_RECORD,index=False)

## Statement
There are some datasets that provide me only a statement of problem (some of them provide a source code.) .\
These datasets will be used to generating source code with AI.

#### Codeforces Statement 1,2,3,4
All of them came in csv extension, all I need is to clean data.\
CodeforcesStatement3 has provided me a solution, which will be cleaned and format like source code datasets.

All of them will be labeled in the "problem_id" , "site_url" and "statement" format.

### Codeforces Statement 1

In [None]:
PATH = "./Statements/Raw/Codeforces/CodeforcesStatement1.csv"
CLEAN_RECORD = "./Statements/Cleaned/CodeforcesStatement1.csv"

df = pd.read_csv(PATH)
df

Unnamed: 0,problem_statement,input,output,time_limit,memory_limit,tags
0,F. Nene and the Passing Gametime limit per tes...,521 11 121 12 231 31 31 351 12 21 52 21 161 25...,2 2 2 1 3,4 seconds,256 megabytes,"['constructive algorithms', 'data structures',..."
1,E2. Nene vs. Monsters (Hard Version)time limit...,532 5 320 041 5 7 244 2 1 2131 1 4 5 1 4 1 9 1...,1 1 0 1 1 2 1 3 6 1 3 6 8 10 12,2 seconds,256 megabytes,"['brute force', 'greedy', 'implementation', 'm..."
2,E1. Nene vs. Monsters (Easy Version)time limit...,532 5 320 041 5 7 244 2 1 2131 1 4 5 1 4 1 9 1...,1 1 0 1 1 2 1 3 6 1 3 6 8 10 12,2 seconds,256 megabytes,"['brute force', 'implementation', 'math', '*25..."
3,D. Nene and the Mex Operatortime limit per tes...,20 1,4 1 1 2,2 seconds,256 megabytes,"['bitmasks', 'brute force', 'constructive algo..."
4,C. Nene's Magical Matrixtime limit per test2 s...,212,1 1 1 1 1 7 3 1 1 1 2 1 2 1 2 2 1 1 2,2 seconds,256 megabytes,"['constructive algorithms', 'greedy', 'math', ..."
...,...,...,...,...,...,...
9233,B. The least round waytime limit per test2 sec...,Input31 2 34 5 67 8 9,Output0DDRR,2 seconds,64 megabytes,"['dp', 'math', '*2000']"
9234,A. Winnertime limit per test1 secondmemory lim...,Input3mike 3andrew 5mike 2,Outputandrew,1 second,64 megabytes,"['hashing', 'implementation', '*1500']"
9235,C. Ancient Berland Circustime limit per test2 ...,Input0.000000 0.0000001.000000 1.0000000.00000...,Output1.00000000,2 seconds,64 megabytes,"['geometry', 'math', '*2100']"
9236,B. Spreadsheetstime limit per test10 secondsme...,Input2R23C55BC23,OutputBC23R23C55,10 seconds,64 megabytes,"['implementation', 'math', '*1600']"


In [190]:
# Get prefix of statement

cleaned_df = pd.DataFrame()
cleaned_df['problem_id'] = "(?????)"+df['problem_statement'].str.extract(r'^([a-zA-Z])')
cleaned_df.value_counts()

problem_id
(?????)A      1686
(?????)B      1677
(?????)C      1423
(?????)D      1416
(?????)E      1400
(?????)F       860
(?????)G       396
(?????)H       172
(?????)I        80
(?????)J        45
(?????)K        26
(?????)L        24
(?????)M        18
(?????)N         8
(?????)O         3
(?????)P         1
(?????)Q         1
(?????)R         1
Name: count, dtype: int64

In [191]:
# Add Problem Statement
cleaned_df['problem_statement'] = df['problem_statement']
cleaned_df['website'] = 'https://codeforces.com'
cleaned_df = cleaned_df[['problem_id','website','problem_statement']]
cleaned_df

Unnamed: 0,problem_id,website,problem_statement
0,(?????)F,https://codeforces.com,F. Nene and the Passing Gametime limit per tes...
1,(?????)E,https://codeforces.com,E2. Nene vs. Monsters (Hard Version)time limit...
2,(?????)E,https://codeforces.com,E1. Nene vs. Monsters (Easy Version)time limit...
3,(?????)D,https://codeforces.com,D. Nene and the Mex Operatortime limit per tes...
4,(?????)C,https://codeforces.com,C. Nene's Magical Matrixtime limit per test2 s...
...,...,...,...
9233,(?????)B,https://codeforces.com,B. The least round waytime limit per test2 sec...
9234,(?????)A,https://codeforces.com,A. Winnertime limit per test1 secondmemory lim...
9235,(?????)C,https://codeforces.com,C. Ancient Berland Circustime limit per test2 ...
9236,(?????)B,https://codeforces.com,B. Spreadsheetstime limit per test10 secondsme...


In [192]:
CLEAN_RECORD = "./Statements/Cleaned/CodeforecesStatement1.csv"
cleaned_df.to_csv(CLEAN_RECORD,index=False)

### Codeforces Statement 2
- Change the label name, remove problem_tags, all done!
- The statement doesn't have Example Input/Output. 

In [None]:
PATH = "./Statements/Raw/Codeforces/CodeforcesStatement2.csv"
CLEAN_RECORD = "./Statements/Cleaned/CodeforcesStatement2.csv"

df = pd.read_csv(PATH)
df

Unnamed: 0,contest,problem_name,problem_statement,problem_tags
0,325,A,You are given n rectangles. The corners of rec...,"implementation,*1500"
1,325,B,Daniel is organizing a football tournament. He...,"binarysearch,math,*1800"
2,325,C,Piegirl has found a monster and a book about m...,"dfsandsimilar,graphs,shortestpaths,*2600"
3,325,D,"In a far away land, there exists a planet shap...","dsu,*2900"
4,325,E,Piegirl found the red button. You have one las...,"combinatorics,dfsandsimilar,dsu,graphs,greedy,..."
...,...,...,...,...
8338,1271,B,There are $$$n$$$ blocks arranged in a row and...,"greedy,math,*1300"
8339,1271,C,The map of the capital of Berland can be viewe...,"bruteforce,geometry,greedy,implementation,*1300"
8340,1271,D,"You play a strategic video game (yeah, we ran ...","datastructures,dp,greedy,implementation,sortin..."
8341,1271,E,"At first, let's define function $$$f(x)$$$ as ...","binarysearch,combinatorics,dp,math,*2100"


In [195]:
cleaned_df = pd.DataFrame()
cleaned_df['problem_id'] = df['contest'].astype(str) + df['problem_name']
cleaned_df['website'] = 'https://codeforces.com'
cleaned_df['problem_statement'] = df['problem_statement']
cleaned_df = cleaned_df[['problem_id', 'website', 'problem_statement']]
cleaned_df

Unnamed: 0,problem_id,website,problem_statement
0,325A,https://codeforces.com,You are given n rectangles. The corners of rec...
1,325B,https://codeforces.com,Daniel is organizing a football tournament. He...
2,325C,https://codeforces.com,Piegirl has found a monster and a book about m...
3,325D,https://codeforces.com,"In a far away land, there exists a planet shap..."
4,325E,https://codeforces.com,Piegirl found the red button. You have one las...
...,...,...,...
8338,1271B,https://codeforces.com,There are $$$n$$$ blocks arranged in a row and...
8339,1271C,https://codeforces.com,The map of the capital of Berland can be viewe...
8340,1271D,https://codeforces.com,"You play a strategic video game (yeah, we ran ..."
8341,1271E,https://codeforces.com,"At first, let's define function $$$f(x)$$$ as ..."


In [196]:
cleaned_df.to_csv(CLEAN_RECORD,index=False)

### Codeforces Statement 3

In [197]:
PATH = "./Statements/Raw/Codeforces/CodeforcesStatement3.csv"
CLEAN_RECORD = "./Statements/Cleaned/CodeforecesStatement3.csv"
CLEAN_RECORD2 = "./Human/Cleaned/codeforeces4.csv"

df = pd.read_csv(PATH)
df

Unnamed: 0.1,Unnamed: 0,problem_link,problem_name,problem_statement,input,output,editorial,coding_solution
0,0,https://codeforces.com//contest/1741/problem/A,A. Compare T-Shirt Sizes,Two T-shirt sizes are given: and . The T-shir...,The first line of the input contains a single ...,"For each test case, print on a separate line t...","Let , are the last characters of lines and ...",#include <bits/stdc++.h>\n \nusing namespace s...
1,1,https://codeforces.com//contest/1741/problem/B,B. Funny Permutation,A sequence of numbers is called if it contai...,The first line of input data contains a single...,"For each test case, print on a separate line: ...","We cannot make a funny permutation only when ,...","#include ""bits/stdc++.h""\nusing namespace std;..."
2,2,https://codeforces.com//contest/1741/problem/C,C. Minimize the Thickness,You are given a sequence consisting of inte...,The first line contains a single integer () —...,"For each test case, output one integer — the m...",Let's iterate over the length of the first seg...,#include <bits/stdc++.h>\n\nusing namespace st...
3,3,https://codeforces.com//contest/1741/problem/D,D. Masha and a Beautiful Tree,The girl named Masha was walking in the forest...,The first line contains single integer () — n...,"For each test case in a separate line, print t...",Let some vertex be responsible for a segment o...,#include <bits/stdc++.h>\n\nusing namespace st...
4,4,https://codeforces.com//contest/1741/problem/E,E. Sending a Sequence Over the Network,The sequence is sent over the network as foll...,The first line of input data contains a single...,For each test case print on a separate line: ...,Let's introduce the dynamics. if on the prefi...,#include <bits/stdc++.h>\n\nusing namespace st...
...,...,...,...,...,...,...,...,...
1545,1545,https://codeforces.com//contest/600/problem/A,A. Extract Numbers,You are given string . Let's call any largest...,The only line of input contains the string ()...,Print the string to the first line and string...,This is a technical problem. You should do exa...,"/*\nYeah\n\nHere she comes again, she's feelin..."
1546,1546,https://codeforces.com//contest/600/problem/B,B. Queries about less or equal elements,You are given two arrays of integers and . Fo...,The first line contains two integers () — the...,"Print integers, separated by spaces: the -th ...",Let's sort all numbers in a. Now let's iterate...,/*\nMass inclination will fuel my frustration\...
1547,1547,https://codeforces.com//contest/600/problem/C,C. Make Palindrome,A string is called palindrome if it reads the ...,The only line contains string () consisting o...,Print the lexicographically smallest palindrom...,Let's denote cntc — the number of occurences o...,/*\nYour insecurities makes me feel\nSo helple...
1548,1548,https://codeforces.com//contest/600/problem/D,D. Area of Two Circles' Intersection,You are given two circles. Find the area of th...,The first line contains three integers () — t...,Print the area of the intersection of the circ...,If the circles don't intersect than the answer...,/*\nShe's wearin' dresses on the border line\n...


In [198]:
# Get prefix of statement

cleaned_df = pd.DataFrame()
cleaned_df['problem_index'] = df['problem_name'].str.extract(r'^([a-zA-Z])')
cleaned_df.value_counts()

problem_index
E                241
D                239
F                238
C                237
B                235
A                230
G                120
H                  9
I                  1
Name: count, dtype: int64

In [None]:
import re
def extract_contest_id(url):
    match = re.search(r'contest/(\d+)', url)
    if match:
        return match.group(1)
    return np.nan

cleaned_df['problem_id'] = df['problem_link'].apply(extract_contest_id) + cleaned_df['problem_index']
cleaned_df['problem_id'].value_counts()

Unnamed: 0,problem_index,problem_id
357,D,1249D
358,D,1249D


In [201]:
def get_statement(prooblem_name,statement,ex_in,ex_out):
    return prooblem_name+"\n"+statement+"\nInput: \n"+ex_in+"\nOutput: \n"+ex_out

cleaned_df['problem_statement'] = df.apply(
    lambda row: get_statement(row['problem_name'], row['problem_statement'], row['input'], row['output']), axis=1
)
cleaned_df

Unnamed: 0,problem_index,problem_id,problem_statement
0,A,1741A,A. Compare T-Shirt Sizes\nTwo T-shirt sizes ar...
1,B,1741B,B. Funny Permutation\nA sequence of numbers i...
2,C,1741C,C. Minimize the Thickness\nYou are given a seq...
3,D,1741D,D. Masha and a Beautiful Tree\nThe girl named ...
4,E,1741E,E. Sending a Sequence Over the Network\nThe se...
...,...,...,...
1545,A,600A,A. Extract Numbers\nYou are given string . Let...
1546,B,600B,B. Queries about less or equal elements\nYou a...
1547,C,600C,C. Make Palindrome\nA string is called palindr...
1548,D,600D,D. Area of Two Circles' Intersection\nYou are ...


In [204]:
cleaned_df['website'] = 'https://codeforces.com'
cleaned_df = cleaned_df[['problem_id','website','problem_statement']]
cleaned_df

Unnamed: 0,problem_id,website,problem_statement
0,1741A,https://codeforces.com,A. Compare T-Shirt Sizes\nTwo T-shirt sizes ar...
1,1741B,https://codeforces.com,B. Funny Permutation\nA sequence of numbers i...
2,1741C,https://codeforces.com,C. Minimize the Thickness\nYou are given a seq...
3,1741D,https://codeforces.com,D. Masha and a Beautiful Tree\nThe girl named ...
4,1741E,https://codeforces.com,E. Sending a Sequence Over the Network\nThe se...
...,...,...,...
1545,600A,https://codeforces.com,A. Extract Numbers\nYou are given string . Let...
1546,600B,https://codeforces.com,B. Queries about less or equal elements\nYou a...
1547,600C,https://codeforces.com,C. Make Palindrome\nA string is called palindr...
1548,600D,https://codeforces.com,D. Area of Two Circles' Intersection\nYou are ...


In [None]:
CLEAN_RECORD = "./Statements/Cleaned/CodeforcesStatement3.csv"

cleaned_df.to_csv(CLEAN_RECORD,index=False)

In [207]:
source_cleaned_df = cleaned_df.copy()
source_cleaned_df['source'] = df['coding_solution']
source_cleaned_df['writer'] = 'human'
source_cleaned_df['website'] = 'https://codeforces.com'
source_cleaned_df['submission_id'] = np.nan
source_cleaned_df = source_cleaned_df[['writer','problem_id','submission_id','website','source']]

In [None]:
source_cleaned_df = source_cleaned_df[['writer','problem_id','submission_id','website','source']]
source_cleaned_df

'/*\nNight time, see the castles burning\nSmoke in the skies and tears in their eyes\nAs the world keeps turning\n\nSleep now, hear a distant thunder\nIt\'s far away at least for today\nClose your eyes and wonder\n\nSpring turns so quickly to summer\nSummer so quickly to fall\nIt seemed far away or it was yesterday\nWhen time didn\'t matter at all\n\nAnd then you met your winter\nWhile dancing with her daughters\nTill tired and cold, were much wiser than bold\nYou wait for tomorrow to call\n\nAll of your life you have waited alone for a Savior\nHe\'s not coming\nA carousel horse who is constantly lost\nStanding still but always running\n\nAnd all of those things that you needed so bad\nYou have found they mean nothing\nAnd, oh Lord, I\'m coming home\n\nI\'m searching through the haze\nThat\'s drifting through my mind\nStare in my looking glass\nAnd wonder who I\'ll find\n\nNo one would listen\nTo a man upon the water\nUntil they were old and their mountains of gold\nCouldn\'t buy any m

In [None]:
CLEAN_RECORD2 = "./Humans/Cleaned/codeforces3.csv"
source_cleaned_df.to_csv(CLEAN_RECORD2,index=False)

### Codeforces 4
From Hugging Face.

In [210]:
# Read the validation and test datasets
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
validation_df = pd.read_parquet("hf://datasets/mrfire15/CodeforcesProblems/" + splits["validation"])
test_df = pd.read_parquet("hf://datasets/mrfire15/CodeforcesProblems/" + splits["test"])
df = pd.read_parquet("hf://datasets/mrfire15/CodeforcesProblems/" + splits["train"])

# Concatenate train, validation, and test datasets
df = pd.concat([df, validation_df, test_df], ignore_index=True)
df

Unnamed: 0,Problem ID,Problem Description,Rating,math,greedy,implementation,dp,data structures,constructive algorithms,brute force,binary search,sortings,graphs,__index_level_0__
0,130A,Problem - 130A - Codeforces =============== x...,900.0,False,False,True,False,False,False,False,False,False,False,9363
1,493C,Vasya follows a basketball game and marks the ...,1600.0,False,False,True,False,True,False,True,True,True,False,7861
2,1210F1,This is an easier version of the problem. In t...,3100.0,False,False,False,False,False,False,True,False,False,False,4631
3,721A,Recently Adaltik discovered japanese crossword...,800.0,False,False,True,False,False,False,False,False,False,False,6931
4,1761D,Problem - 1761D - Codeforces =============== ...,2100.0,True,False,False,False,False,False,False,False,False,False,1745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9071,1915F,There are $$$n$$$ people on the number line; t...,1500.0,False,False,False,False,True,False,False,False,True,False,805
9072,1741G,Kirill lives on a connected undirected graph o...,2200.0,False,False,False,True,False,False,True,False,False,True,1832
9073,2029F,You are given a cycle with $$$n$$$ vertices nu...,2500.0,False,True,False,False,False,True,False,False,False,True,92
9074,1771C,Hossam has $$$n$$$ trainees. He assigned a num...,1600.0,True,True,False,False,False,False,False,False,False,False,1667


In [212]:
df['website'] = 'https://codeforces.com'
df['problem_statement'] = df['Problem Description']
df['problem_id'] = df['Problem ID']
df = df[['problem_id', 'website', 'problem_statement']]
df

Unnamed: 0,problem_id,website,problem_statement
0,130A,https://codeforces.com,Problem - 130A - Codeforces =============== x...
1,493C,https://codeforces.com,Vasya follows a basketball game and marks the ...
2,1210F1,https://codeforces.com,This is an easier version of the problem. In t...
3,721A,https://codeforces.com,Recently Adaltik discovered japanese crossword...
4,1761D,https://codeforces.com,Problem - 1761D - Codeforces =============== ...
...,...,...,...
9071,1915F,https://codeforces.com,There are $$$n$$$ people on the number line; t...
9072,1741G,https://codeforces.com,Kirill lives on a connected undirected graph o...
9073,2029F,https://codeforces.com,You are given a cycle with $$$n$$$ vertices nu...
9074,1771C,https://codeforces.com,Hossam has $$$n$$$ trainees. He assigned a num...


In [None]:
CLEAN_RECORD = "./Statements/Cleaned/CodeforcesStatement4.csv"

df.to_csv(CLEAN_RECORD,index=False)

## AI-Generated
There are many AI-Genrated Code datasets from hugging face that I can use (Most of it is open-r1)

Link :
- https://huggingface.co/datasets/open-r1/codeforces-cots

### Codeforces R1 (1)

In [None]:
CLEAN_RECORD = "./AI-Generated/Cleaned/codeforces-r1-1.csv"

from datasets import load_dataset
ds = load_dataset("open-r1/codeforces-cots", "solutions")
df = ds["train"].to_pandas()
df

In [None]:
cpp_pattern = re.compile(r'```cpp\s+(.*?)```', re.DOTALL)

df['source'] = df['messages'].apply(lambda x: cpp_pattern.search(x[1]['content']).group(1) if cpp_pattern.search(x[1]['content']) else None)
df

In [None]:
# Reset filtered_df to ensure it starts fresh
filtered_df = pd.DataFrame()

# Assign new values
filtered_df['problem_id'] = df['contest_id'] + df['index']
filtered_df['submission_id'] = np.nan
filtered_df['website'] = 'https://codeforces.com'
filtered_df['source'] = df['source']
filtered_df['writer'] = 'AI'
filtered_df = filtered_df[['writer', 'problem_id', 'submission_id', 'website', 'source']]
filtered_df.dropna(subset=['source'])
filtered_df

In [264]:
CLEAN_RECORD = "./AI-Generated/Cleaned/codeforces-r1-1.csv"

filtered_df.to_csv(CLEAN_RECORD,index=False)

### Codeforces R1 (2)

In [16]:
CLEAN_RECORD = "./AI-Generated/Cleaned/codeforces-r1-2.csv"

from datasets import load_dataset
ds = load_dataset("mlfoundations-dev/openr1_codeforces")

In [None]:
df = ds["train"].to_pandas()

In [None]:
import re

cpp_pattern = re.compile(r'```cpp\s+(.*?)```', re.DOTALL)

df['source'] = df['original_reasoning_trace'].apply(lambda x: cpp_pattern.search(x).group(1) if cpp_pattern.search(x) else None)
df

In [None]:
# Reset filtered_df to ensure it starts fresh
filtered_df = pd.DataFrame()

# Assign new values
filtered_df['problem_id'] = np.nan
filtered_df['submission_id'] = np.nan
filtered_df['source'] = df['source']
filtered_df['website'] = 'https://codeforces.com'
filtered_df['writer'] = 'AI'
filtered_df = filtered_df[['writer', 'problem_id', 'submission_id', 'website', 'source']]
filtered_df.dropna(subset=['source'])
filtered_df

In [None]:
CLEAN_RECORD = "./AI-Generated/Cleaned/codeforces-r1-2.csv"

filtered_df.to_csv(CLEAN_RECORD,index=False)

### IOI - DeepSeekR1

In [1]:
CLEAN_RECORD = "./AI-Generated/Cleaned/IOI-r1-1.csv"

from datasets import load_dataset
ds = load_dataset("open-r1/ioi-cots")

README.md:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00004.parquet:   0%|          | 0.00/139M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00001-of-00004.parquet:   0%|          | 0.00/139M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00002-of-00004.parquet:   0%|          | 0.00/114M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00003-of-00004.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11450 [00:00<?, ? examples/s]

In [3]:
df = ds["train"].to_pandas()
df

Unnamed: 0,year,day,problem_name,problem_id,target_subtask,prompt,generation,uuid,metadata,finish_reason,code,code_compiles,target_subtask_score,target_subtask_status,all_subtasks_points,all_subtasks_results,messages
0,2023,day1,Beech Tree,beechtree,01-tinyN,You are an expert competitive programmer. You ...,"<think>\nOkay, I need to solve this problem wh...",958f646a-d5a7-476f-9960-6aa99c647f12,"{'completion_tokens': 8162, 'prompt_tokens': 2...",stop,"#include <bits/stdc++.h>\n#include ""beechtree....",True,1.0,AC,9.0,"[{'points': 0, 'problem': 'beechtree', 'score'...",[{'content': 'You are an expert competitive pr...
1,2023,day1,Beech Tree,beechtree,01-tinyN,You are an expert competitive programmer. You ...,"<think>\nOkay, let's try to figure out how to ...",b3f3eb69-8967-4b88-88e3-11a76bd845e6,"{'completion_tokens': 6844, 'prompt_tokens': 2...",stop,"#include <bits/stdc++.h>\n#include ""beechtree....",True,1.0,AC,9.0,"[{'points': 0, 'problem': 'beechtree', 'score'...",[{'content': 'You are an expert competitive pr...
2,2023,day1,Beech Tree,beechtree,01-tinyN,You are an expert competitive programmer. You ...,"<think>\nOkay, let's see. I need to solve this...",81241910-7511-45d5-bd97-66830515eb7f,"{'completion_tokens': 7092, 'prompt_tokens': 2...",stop,"#include <bits/stdc++.h>\n#include ""beechtree....",True,1.0,AC,9.0,"[{'points': 0, 'problem': 'beechtree', 'score'...",[{'content': 'You are an expert competitive pr...
3,2023,day1,Beech Tree,beechtree,01-tinyN,You are an expert competitive programmer. You ...,"<think>\nOkay, let's tackle this problem. Hmm,...",a0c52415-6215-4eac-82ee-86b8b71e7b8e,"{'completion_tokens': 6745, 'prompt_tokens': 2...",stop,"#include <bits/stdc++.h>\n#include ""beechtree....",True,1.0,AC,9.0,"[{'points': 0, 'problem': 'beechtree', 'score'...",[{'content': 'You are an expert competitive pr...
4,2023,day1,Beech Tree,beechtree,01-tinyN,You are an expert competitive programmer. You ...,"<think>\nOkay, I need to solve this problem wh...",e44d73b2-b5fa-4777-a65f-2aaf3ff12ecd,"{'completion_tokens': 8668, 'prompt_tokens': 2...",stop,"#include <bits/stdc++.h>\n#include ""beechtree....",True,1.0,AC,9.0,"[{'points': 0, 'problem': 'beechtree', 'score'...",[{'content': 'You are an expert competitive pr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11445,2020,day2,Stations,stations,02-binary-tree,You are an expert competitive programmer. You ...,"<think>\nOkay, I need to solve this problem wh...",414300de-5084-4582-bd16-68201da8e593,"{'completion_tokens': 19241, 'prompt_tokens': ...",stop,"#include <bits/stdc++.h>\n#include ""stations.h...",True,0.0,RE,0.0,"[{'points': 0, 'problem': 'stations', 'score':...",[{'content': 'You are an expert competitive pr...
11446,2020,day2,Stations,stations,02-binary-tree,You are an expert competitive programmer. You ...,"<think>\nOkay, I need to solve this problem wh...",8f59ac48-1bbc-4cf9-84ce-45e1553fb906,"{'completion_tokens': 19426, 'prompt_tokens': ...",stop,"#include <bits/stdc++.h>\n#include ""stations.h...",True,0.0,RE,0.0,"[{'points': 0, 'problem': 'stations', 'score':...",[{'content': 'You are an expert competitive pr...
11447,2020,day2,Stations,stations,02-binary-tree,You are an expert competitive programmer. You ...,"<think>\nOkay, let's see. I need to solve this...",8648921a-8cc6-4873-8276-f1b397304150,"{'completion_tokens': 11249, 'prompt_tokens': ...",stop,"#include <bits/stdc++.h>\n#include ""stations.h...",True,1.0,AC,8.0,"[{'points': 0, 'problem': 'stations', 'score':...",[{'content': 'You are an expert competitive pr...
11448,2020,day2,Stations,stations,02-binary-tree,You are an expert competitive programmer. You ...,"<think>\nOkay, let's see. I need to solve this...",6f52099d-4106-4f9f-a79e-aafac49db260,"{'completion_tokens': 11035, 'prompt_tokens': ...",stop,"#include <bits/stdc++.h>\n#include ""stations.h...",True,1.0,AC,8.0,"[{'points': 0, 'problem': 'stations', 'score':...",[{'content': 'You are an expert competitive pr...


In [11]:
filtered_df = pd.DataFrame()

# Assign new values
filtered_df['problem_id'] = df['year'].astype(str)+"_"+ df['problem_id']+"_"+ df['target_subtask']
filtered_df['submission_id'] = np.nan
filtered_df['website'] = 'https://ioinformatics.org/'
filtered_df['source'] = df['code']
filtered_df['writer'] = 'AI'
filtered_df = filtered_df[['writer', 'problem_id', 'submission_id', 'website', 'source']]
filtered_df.dropna(subset=['source'])
filtered_df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
1,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
2,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
3,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
4,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
...,...,...,...,...,...
11445,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."
11446,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."
11447,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."
11448,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."


In [14]:
# Drop Duplicate and None
filtered_df = filtered_df.dropna(subset=['source']).drop_duplicates(subset=['source'])
filtered_df

Unnamed: 0,writer,problem_id,submission_id,website,source
0,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
1,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
2,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
3,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
4,AI,2023_beechtree_01-tinyN,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""beechtree...."
...,...,...,...,...,...
11445,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."
11446,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."
11447,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."
11448,AI,2020_stations_02-binary-tree,,https://ioinformatics.org/,"#include <bits/stdc++.h>\n#include ""stations.h..."


In [16]:
CLEAN_RECORD = "./AI-Generated/Cleaned/IOI-r1-1.csv"

filtered_df.to_csv(CLEAN_RECORD,index=False)

### Codeforces ShareGPT

I can't install 3.8 GiB datasets with python, so I use colab to load and clean it.

: https://colab.research.google.com/drive/1frRHJNMug-SWB4z7cQ8YL1ouHPML9GQ_?usp=sharing

## @PakinDioxide (codeforces4, programmingin2, graderchan2)
Thanks for incredible solutions!!
Provided Codeforces, Programming.in.th and Graderchan solution (and from other source) 

In [54]:
import os
import pandas as pd
import numpy as np

def find_cpp_files(folder, website):
    df = pd.DataFrame(columns=['writer', 'problem_id', 'submission_id', 'website', 'source'])
    
    for dirpath, dirnames, filenames in os.walk(folder):
        for file in filenames:
            if file.endswith('.cpp'):
                if file == "grader.cpp":
                    continue  # Skip this file
                
                full_path = os.path.join(dirpath, file)
                writer = 'Human'
                problem_id = np.nan
                submission_id = np.nan
                
                with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                    source = f.read()
                
                new_row = pd.DataFrame([{
                    'writer': writer,
                    'problem_id': problem_id,
                    'submission_id': submission_id,
                    'website': website,
                    'source': source
                }])
                
                df = pd.concat([df, new_row], ignore_index=True)

    return df


In [55]:
cf_PATH = "./Humans/Raw/PAKIN_DIOXIDE/_CODEFORCES"
beta_PATH = "./Humans/Raw/PAKIN_DIOXIDE/_BETA"
gchan_PATH = "./Humans/Raw/PAKIN_DIOXIDE/_GCHAN"

In [56]:
cf_DF = find_cpp_files(cf_PATH,"https://codeforces.com")
beta_DF = find_cpp_files(beta_PATH,"https://programming.in.th")
gchan_DF = find_cpp_files(gchan_PATH,"https://firefly.gchan.moe")

In [57]:
cf_DF.to_csv("./Humans/Cleaned/codeforces4.csv",index=False)
beta_DF.to_csv("./Humans/Cleaned/programmingin2.csv",index=False)
gchan_DF.to_csv("./Humans/Cleaned/graderchan2.csv",index=False)

## @TTAMX (codeforces5, programmingin3, graderchan3)
Thanks for incredible solutions!!
Provided Codeforces, Programming.in.th and Graderchan solution (and from other source) 

In [59]:
import os
import pandas as pd
import numpy as np

def find_cpp_files(folder, website):
    df = pd.DataFrame(columns=['writer', 'problem_id', 'submission_id', 'website', 'source'])
    
    for dirpath, dirnames, filenames in os.walk(folder):
        for file in filenames:
            if file.endswith('.cpp'):
                if file == "grader.cpp":
                    continue  # Skip this file
                
                full_path = os.path.join(dirpath, file)
                writer = 'Human'
                problem_id = np.nan
                submission_id = np.nan
                
                with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                    source = f.read()
                
                new_row = pd.DataFrame([{
                    'writer': writer,
                    'problem_id': problem_id,
                    'submission_id': submission_id,
                    'website': website,
                    'source': source
                }])
                
                df = pd.concat([df, new_row], ignore_index=True)

    return df


In [60]:
cf_PATH = "./Humans/Raw/TTAMX/_CODEFORCES"
beta_PATH = "./Humans/Raw/TTAMX/_BETA"
gchan_PATH = "./Humans/Raw/TTAMX/_GCHAN"

In [61]:
cf_DF = find_cpp_files(cf_PATH,"https://codeforces.com")
beta_DF = find_cpp_files(beta_PATH,"https://programming.in.th")
gchan_DF = find_cpp_files(gchan_PATH,"https://firefly.gchan.moe")

In [62]:
cf_DF.to_csv("./Humans/Cleaned/codeforces5.csv",index=False)
beta_DF.to_csv("./Humans/Cleaned/programmingin3.csv",index=False)
gchan_DF.to_csv("./Humans/Cleaned/graderchan3.csv",index=False)

Unnamed: 0,writer,problem_id,submission_id,website,source
0,Human,,,https://codeforces.com,"#include ""template.hpp""\n#include ""flow/dinic...."
1,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
2,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
3,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
4,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
...,...,...,...,...,...
1813,Human,,,https://codeforces.com,"#include ""template.hpp""\n#include ""modular-ari..."
1814,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
1815,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...
1816,Human,,,https://codeforces.com,#include<bits/stdc++.h>\n\nusing namespace std...


## AI-Clean

In [37]:
source_code_path = "./AI-Generated/Raw/"
df = pd.DataFrame()

AI_DF = pd.DataFrame()

for root, dirs, files in os.walk(source_code_path):
    for file in files:
        if file.endswith(".csv"):
            file_path = os.path.join(root, file)
            AI_DF = pd.concat([AI_DF, pd.read_csv(file_path)], ignore_index=True)
    
AI_DF

Unnamed: 0,writer,problem_id,submission_id,website,source
0,AI,abacus,,https://firefly.gchan.moe,#include <iostream>\n#include <string>\n#inclu...
1,AI,adjacency_list,,https://firefly.gchan.moe,#include <iostream>\n#include <vector>\n#inclu...
2,AI,amogus,,https://firefly.gchan.moe,#include <iostream>\n#include <string>\n\nusin...
3,AI,bookshelf,,https://firefly.gchan.moe,#include <iostream>\n#include <vector>\n#inclu...
4,AI,c1_bkk60_1,,https://firefly.gchan.moe,#include <iostream>\n#include <vector>\n\nusin...
...,...,...,...,...,...
2078,AI,tumso20_penguin2,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...
2079,AI,tumso20_shibuyajihen2,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...
2080,AI,tumso20_simplepotatolanguage,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...
2081,AI,tumso20_space2,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...


In [40]:
GCHAN_DF = AI_DF[AI_DF['website'] == 'https://firefly.gchan.moe']
BETA_DF = AI_DF[AI_DF['website'] == 'https://programming.in.th']

In [46]:
BETA_DF = BETA_DF.drop_duplicates(subset='source', keep='last')

In [48]:
BETA_DF

Unnamed: 0,writer,problem_id,submission_id,website,source
723,AI,0011,,https://programming.in.th,#include <iostream>\n#include <vector>\n#inclu...
724,AI,0012,,https://programming.in.th,#include <iostream>\n#include <string>\n#inclu...
725,AI,0013,,https://programming.in.th,#include <iostream>\n#include <vector>\n#inclu...
726,AI,0014,,https://programming.in.th,#include <iostream>\n\nusing namespace std;\n\...
727,AI,0015,,https://programming.in.th,#include <iostream>\n#include <algorithm>\n\nu...
...,...,...,...,...,...
2078,AI,tumso20_penguin2,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...
2079,AI,tumso20_shibuyajihen2,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...
2080,AI,tumso20_simplepotatolanguage,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...
2081,AI,tumso20_space2,,https://programming.in.th,#include <bits/stdc++.h>\nusing namespace std;...


In [51]:
GCHAN_DF = GCHAN_DF.drop_duplicates(subset='source', keep='last')

In [52]:
GCHAN_DF

Unnamed: 0,writer,problem_id,submission_id,website,source
0,AI,abacus,,https://firefly.gchan.moe,#include <iostream>\n#include <string>\n#inclu...
1,AI,adjacency_list,,https://firefly.gchan.moe,#include <iostream>\n#include <vector>\n#inclu...
2,AI,amogus,,https://firefly.gchan.moe,#include <iostream>\n#include <string>\n\nusin...
3,AI,bookshelf,,https://firefly.gchan.moe,#include <iostream>\n#include <vector>\n#inclu...
4,AI,c1_bkk60_1,,https://firefly.gchan.moe,#include <iostream>\n#include <vector>\n\nusin...
...,...,...,...,...,...
718,AI,snacks_hard,,https://firefly.gchan.moe,#include<bits/stdc++.h>\nusing namespace std;\...
719,AI,stock,,https://firefly.gchan.moe,#include <bits/stdc++.h>\nusing namespace std;...
720,AI,traveller,,https://firefly.gchan.moe,#include <bits/stdc++.h>\nusing namespace std;...
721,AI,watch_vtuber,,https://firefly.gchan.moe,#include <bits/stdc++.h>\nusing namespace std;...


In [53]:
GCHAN_DF.to_csv('./AI-Generated/Cleaned/gchan-gemini.csv')
BETA_DF.to_csv('./AI-Generated/Cleaned/programmingin-gemini.csv')