### Load Normal Datasets

In [69]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [70]:
# Read the three combined splits in one pass; each CSV becomes its own DataFrame.
train_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./../../Splits/combined/train.csv",
        "./../../Splits/combined/validation.csv",
        "./../../Splits/combined/test.csv",
    )
)
# The "unused" Human split lives in a separate folder.
unused_human_df = pd.read_csv("./../../Splits/unused/Human.csv")

### Pre-Processing Functions

In [71]:
## Add random blank lines

import random
first_line = 7  # Number of leading lines that random_lines treats as the "head" region


def random_lines(s, min_add=3, max_add=6, min_add2=5, max_add2=35):
    """Insert extra blank lines after randomly chosen lines of ``s``.

    The text is split into a head region (the first ``first_line`` lines, read
    from the module-level setting at call time) and a tail region (everything
    after it). A random number of lines is picked in each region and every
    picked line is followed by one (weight 70) or two (weight 30) extra
    newlines.

    :param s: Text to augment; it is split with ``str.splitlines``.
    :param min_add: Minimum number of head lines to pick.
    :param max_add: Maximum number of head lines to pick.
    :param min_add2: Minimum number of tail lines to pick.
    :param max_add2: Maximum number of tail lines to pick.
    :return: The lines re-joined with ``'\\n'``, with the extra newlines added.
        A trailing newline of ``s`` is not preserved.
    :raises ValueError: If a minimum exceeds its maximum (from ``random.randint``).
    """
    # Draw order (tail count, head count, head sample, tail sample) is kept
    # fixed so that seeded runs consume the RNG in a stable sequence.
    tail_count = random.randint(min_add2, max_add2)
    head_count = random.randint(min_add, max_add)
    lines = s.splitlines()
    total_lines = len(lines)

    # Clamp the head region to the lines that actually exist. Sampling from
    # range(first_line) on a shorter text would pick indices past the end,
    # silently inserting fewer blank lines than requested.
    head_end = min(first_line, total_lines)
    head_candidates = list(range(0, head_end))
    head_selected = set(random.sample(head_candidates, min(head_count, len(head_candidates))))
    tail_candidates = list(range(head_end, total_lines))
    tail_selected = set(random.sample(tail_candidates, min(tail_count, len(tail_candidates))))

    modified = []
    for i, line in enumerate(lines):
        if i in tail_selected or i in head_selected:
            extra = random.choices([1, 2], weights=[70, 30], k=1)[0]
            modified.append(line + '\n' * extra)
        else:
            modified.append(line)

    return '\n'.join(modified)

In [72]:
# Normalize preprocessor lines
def replace_preprocessor_lines(code: str) -> str:
    """Replace each line whose first non-whitespace character is '#' with '<PREPROCESSOR>'.

    All other lines are kept as they are. Lines are split with
    ``str.splitlines`` and re-joined with ``'\\n'``.
    """
    normalized = []
    for raw_line in code.splitlines():
        if raw_line.lstrip().startswith('#'):
            normalized.append('<PREPROCESSOR>')
        else:
            normalized.append(raw_line)
    return '\n'.join(normalized)

In [73]:
# Add Random Define Template

# Pool of C++ prologue lines (macros, includes, constants) that can be prepended
# to a source file. Entries are unique so one prologue never repeats a line.
define_arr = [
    "#define int long long",
    "#include <ext/pb_ds/assoc_container.hpp>",
    "#include <ext/pb_ds/tree_policy.hpp>",
    "#define ordered_set <int, null_type, less<int>, rb_tree_tag, tree_order_statistics_node_update>",
    "#define ordered_multiset <int, null_type, less_equal <int>, rb_tree_tag, tree_order_statistics_node_update>",
    "#define fix iota(par, par + N, 0ll)",
    "#define double long double",
    "#define pii pair <int, int>",
    "#define tiii tuple <int, int, int>",
    "#define tiiii tuple <int, int, int, int>",
    "#define emb emplace_back",
    "#define all(a) a.begin(), a.end()",
    "#define rall(a) a.rbegin(), a.rend()",
    "const int mod = 1e9 + 7;",
    "const int inf = 1e18;",
    "const int N = 2e6 + 5;",
]


def add_random_defines(code: str, max_defines: int = 10) -> str:
    """Prepend a random selection of lines from ``define_arr`` to ``code``.

    :param code: Source text to augment.
    :param max_defines: Upper bound on how many lines are prepended. The count
        is drawn uniformly from ``0..max_defines``; the bound is clamped to the
        pool size so ``random.sample`` cannot be asked for more than exists.
    :return: The selected lines (in sampled order, one per line) followed by
        ``code``. When no line is selected, ``code`` is returned unchanged
        rather than with a stray leading blank line.
    """
    upper = max(0, min(max_defines, len(define_arr)))
    num_to_add = random.randint(0, upper)
    if num_to_add == 0:
        return code
    selected = random.sample(define_arr, num_to_add)
    return '\n'.join(selected) + '\n' + code


In [74]:
import re
import random

# Add Space to random brackets.

def add_random_space_to_braces(code: str) -> str:
    """Put 0, 1 or 2 randomly chosen spaces in front of every '{' and '}' in ``code``."""

    def _pad(match):
        # A fresh width is drawn for each individual brace.
        return ' ' * random.randint(0, 2) + match.group(0)

    # All opening braces are handled first, then all closing braces.
    for brace_pattern in (r'\{', r'\}'):
        code = re.sub(brace_pattern, _pad, code)
    return code

In [75]:
# Add random comments
import string
import random

def generate_random_comment(length=10):
    """Build a '//' comment made of random letters, digits and spaces.

    ``length`` characters are drawn, then surrounding whitespace is stripped,
    so the visible text can be shorter than ``length``.
    """
    # The five spaces in the alphabet make blanks appear in the random text.
    alphabet = string.ascii_letters + string.digits + "     "
    gap = ' ' * random.randint(0, 2)
    text = ''.join(random.choices(alphabet, k=length))
    return "//" + gap + text.strip()

def add_random_comments(code: str, comment_probability=0.2, mx_comment_length=45):
    """
    Adds randomly generated ``//`` comments to some lines of C++ code.

    :param code: C++ source code (as a string).
    :param comment_probability: Probability that a non-blank line will receive
        a trailing comment. Blank lines never receive one.
    :param mx_comment_length: Maximum number of random characters per comment.
        The length is drawn uniformly from ``4..mx_comment_length``; when the
        maximum is below 4 the lower bound is lowered to match, so the call
        does not raise.
    :return: Modified C++ code with random comments. A commented line may also
        be preceded by one extra blank line.
    """
    shortest = min(4, mx_comment_length)
    modified_lines = []

    for line in code.splitlines():
        if line.strip() and random.random() < comment_probability:
            comment = generate_random_comment(random.randint(shortest, mx_comment_length))
            # Built outside the f-string: a backslash inside an f-string
            # replacement field is a SyntaxError before Python 3.12.
            leading_blank = '\n' * random.randint(0, 1)
            modified_lines.append(f"{leading_blank}{line} {comment}")
        else:
            modified_lines.append(line)

    return "\n".join(modified_lines)

In [83]:
# Take the fourth AI-written sample (by position) as a demo input.
source = test_df.loc[test_df['writer'] == 'AI', 'source'].iloc[3]

# After Augment: run the four augmentation steps in order and show the result.
for _augment in (
    add_random_comments,
    add_random_defines,
    random_lines,
    add_random_space_to_braces,
):
    source = _augment(source)
print(source)


#define pii pair <int, int>

#define tiiii tuple <int, int, int, int>

const int N = 2e6 + 5;

const int mod = 1e9 + 7;

#define fix iota(par, par + N, 0ll)
#define int long long

#include <ext/pb_ds/tree_policy.hpp>


#define emb emplace_back

#include <bits/stdc++.h>
#include "beechtree.h" //K131wG8wJ

#include <vector> //  e6H90pRvsJtNUWm6dsWHIUHP7V0YaxsyAq3hrH

#include <unordered_set>
using namespace std;

vector<int> beechtree(int N, int M, vector<int> P, vector<int> C)  {


    vector<vector<int>> children(N);


    for (int i = 1; i < N; ++i)   {
        children[P[i]].push_back(i);


    }


    vector<bool> has_dup(N, false);

    for (int x = 0; x < N; ++x)   {
        unordered_set<int> colors;




        for (int y : children[x])   { // EVFymy w5QD4PxFXFjN8YwQ9n0PfV tfasS
            int c = C[y];

            if (colors.count(c)) { //7SL07BQURLzro 2 wsknW240vSgr



                has_dup[x] = true; // X6SjOxvngS2SVTHWasU3c Avw
                break;
             }

    

### Augmented

In [61]:
def _augment_source(source: str) -> str:
    """Apply the four augmentation steps, in pipeline order, to one source string."""
    source = add_random_comments(source)
    source = add_random_defines(source)
    source = random_lines(source)
    source = add_random_space_to_braces(source)
    return source


# Seed the global RNG so the augmented test set can be regenerated exactly;
# 42 matches the random_state used when the sub-test is sampled.
random.seed(42)

# NOTE: test_df is overwritten in place. Re-running this cell without first
# reloading test_df augments the already-augmented sources a second time.
ai_mask = test_df['writer'] == 'AI'
test_df.loc[ai_mask, 'source'] = test_df.loc[ai_mask, 'source'].apply(_augment_source)

In [62]:
# Label-based lookup: prints the row labelled 1, which must be one of the AI rows.
print(test_df.loc[test_df['writer'] == 'AI', 'source'].loc[1])



#include <iostream>



#include <algorithm> //Xwyuea5Ow4IJZY32pbWvQ mrLE V9Ix uV74P1yFA




using namespace std;



int main() {

    int V1, V2, V3, Vm;

    cin >> V1 >> V2 >> V3 >> Vm; //a i8LK mi9F6lHdOkiRWhSo yzUMpAVc 3mojSIB

    




    int s_min = max(V3, Vm); // 2A9 cPBMp

    int s_max = min(2*V3, 2*Vm);

    if (s_min > s_max)  {

        cout << -1 << endl;

        return 0;

     }

    

    for (int s = s_min; s <= s_max; ++s)   {


        int m_lower = max({V2, 2*Vm + 1, s + 1  });

        if (m_lower > 2*V2) continue;

        int f_lower = max(V1, m_lower + 1);

        if (f_lower > 2*V1) continue;


        cout << f_lower << '\n' << m_lower << '\n' << s << endl;


        return 0;

    }


    

    cout << -1 << endl; //  fIyyK pfuQP0

    return 0;

}



In [64]:
# Write the augmented test split to disk, leaving the DataFrame index out of the CSV.
test_df.to_csv(path_or_buf='./Harder_TestSet/test.csv', index=False)

In [None]:
## Make a sub-test

In [65]:
# Show how many rows each writer class has; the result is displayed as the cell output.
test_df['writer'].value_counts()

writer
Human    6813
AI       6812
Name: count, dtype: int64

In [66]:
# Draw 1200 rows per writer class with a fixed seed.
ai_df = test_df.loc[test_df['writer'] == 'AI'].sample(n=1200, random_state=42)
human_df = test_df.loc[test_df['writer'] == 'Human'].sample(n=1200, random_state=42)

# Stack both classes, shuffle all rows, and renumber the index from 0.
sub_test_df = (
    pd.concat([ai_df, human_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [68]:
# index=False: the index was just reset to a plain 0..n-1 range, so writing it
# would only add a spurious unnamed column. This also matches how the full
# augmented test set is saved.
sub_test_df.to_csv('../../Splits/harder_test/sub_test.csv', index=False)