In [16]:
import random
import re
import utils
import json


def check_tasks(tasks):
  """Print a numbered, human-readable preview of the given tasks.

  Each task is a dict with an "instruction" key. Its input/output are read
  either from the first entry of task_dict["instances"] (when that key is
  present) or from the flat "input"/"output" keys of the task itself.

  Args:
    tasks: iterable of task dicts as described above.

  Returns:
    None. The preview is written to stdout so that calling this as the last
    expression of a notebook cell does not produce an extra Out[] value.

  Raises:
    KeyError: if a task lacks "instruction", "input" or "output".
  """
  sections = []
  for number, task_dict in enumerate(tasks, start=1):
    # Pick the dict that actually carries input/output for this task.
    record = task_dict["instances"][0] if 'instances' in task_dict else task_dict
    # Collapse all whitespace runs (incl. newlines) and drop a trailing colon.
    instruction = re.sub(r"\s+", " ", task_dict["instruction"]).strip().rstrip(":")
    # An empty or missing (None) input is shown with an explicit placeholder.
    task_input = record["input"] or "<noinput>"
    task_output = record["output"]
    sections.append(
      "###\n"
      f"{number}. Instruction: {instruction}\n"
      f"{number}. Input:\n{task_input}\n"
      f"{number}. Output:\n{task_output}\n"
    )
  print("".join(sections))


def get_tasks_outputs(tasks):
  """Collect the output text of every task, preserving task order.

  For a task that has an "instances" key, the output of its first instance
  is used; otherwise the task's own "output" value is used.
  """
  outputs = []
  for task_dict in tasks:
    if 'instances' in task_dict:
      first_instance = task_dict["instances"][0]
      outputs.append(first_instance["output"])
    else:
      outputs.append(task_dict["output"])
  return outputs

# Seed Tasks

In [17]:
# Load the seed tasks (utils.jload presumably parses the JSON file - confirm in utils).
seed_tasks = utils.jload('seed_tasks.json')
# Preview three tasks. A dedicated, seeded generator keeps the preview identical
# across "Restart & Run All" without touching the global `random` state.
check_tasks(random.Random(42).sample(seed_tasks, 3))

###
1. Instruction: Perform a code review on the given HTML/CSS code file and provide feedback on the following factors: readability, uniformity, understandability, correctness, performance, and security.
1. Input:
```
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            font-size: 14px;
            color: #333;
        }
        
        #header {
            background-color: #ccc;
            padding: 20px;
        }
        
        #content {
            margin-top: 50px;
            padding: 20px;
        }
        
        .box {
            background-color: #f0f0f0;
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 20px;
        }
        
        .box h2 {
            font-size: 18px;
            margin-bottom: 10px;
        }
        
        .box p {
            line-height: 1.5;
        }
        
        .box .button {
            background-c

# Generated Tasks

In [18]:
# Load the generated tasks and report how many there are.
regen_tasks = utils.jload('regen.json')
num_generated = len(regen_tasks)
print(f'Generated Tasks Length: {num_generated}')

Generated Tasks Length: 100


## Check Tasks Output

In [19]:
def format_ouput(output):
    """Group a bulleted review text into {top-level bullet: [sub-bullet lines]}.

    A line whose stripped form starts with '-' opens a new group; the stripped
    line is the key. The bullet lines that immediately follow it and are
    indented deeper than the key line are collected, unmodified, as that key's
    values. Any indentation deeper than the key counts (tabs, four spaces,
    nested bullets), so such lines are no longer mistaken for new top-level
    keys. Non-bullet lines are skipped and end the current run of sub-bullets.

    The function name (including its spelling) is kept for existing callers.

    Args:
        output: the review text, lines separated by a newline character.

    Returns:
        dict mapping each top-level bullet line to the list of its raw
        sub-bullet lines, in order of appearance. A repeated key keeps only
        the sub-bullets of its last occurrence.
    """
    lines = output.split('\n')
    result = {}

    i = 0
    while i < len(lines):
        key = lines[i].strip()
        if not key.startswith('-'):
            i += 1
            continue

        key_indent = len(lines[i]) - len(lines[i].lstrip())
        children = result[key] = []
        i += 1
        # Consume the directly following bullets that sit deeper than the key.
        while i < len(lines):
            body = lines[i].lstrip()
            indent = len(lines[i]) - len(body)
            if not (body.startswith('-') and indent > key_indent):
                break
            children.append(lines[i])
            i += 1

    return result
    
tasks_outputs = get_tasks_outputs(regen_tasks)

# For every generated task, print its similarity score and the factor names found in its output.
for position, (task, output) in enumerate(zip(regen_tasks, tasks_outputs), start=1):
    print(f'Output #{position}')
    # avg_similarity_score is used here as a makeshift unique ID for the task
    print('avg_similarity_score\t>> ', task["avg_similarity_score"])
    # Keep only the text before the first '(' and trim surrounding '-' and spaces.
    factors = [key.split('(')[0].strip('- ') for key in format_ouput(output)]
    print('factors\t\t\t>> ', factors)
    print()

Output #1
avg_similarity_score	>>  0.6498224159640215
factors			>>  ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

Output #2
avg_similarity_score	>>  0.6337972300592833
factors			>>  ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

Output #3
avg_similarity_score	>>  0.5606345009918464
factors			>>  ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

Output #4
avg_similarity_score	>>  0.5926186629462296
factors			>>  ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

Output #5
avg_similarity_score	>>  0.6784472824318495
factors			>>  ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

Output #6
avg_similarity_score	>>  0.5444074776153178
factors			>>  ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

Output #7
avg_similarity_sco

## Preprocess Tasks

In [20]:
# Drop the similarity bookkeeping keys from every task, then preview the first three as JSON.
keys_to_drop = {'most_similar_instructions', 'avg_similarity_score'}
regen_tasks_ = [
    {key: value for key, value in task.items() if key not in keys_to_drop}
    for task in regen_tasks
]
print(json.dumps(regen_tasks_[:3], indent=2))

[
  {
    "instruction": "Perform a code review on the provided Python script and provide feedback on the readability, uniformity, understandability, correctness, performance, and security aspects of the code.",
    "input": "```\n# Python script to check if a number is prime or not\n\ndef is_prime(number):\n    if number < 2:\n        return False\n    for i in range(2, number):\n        if number % i == 0:\n            return False\n    return True\n\nnum = 17\n\nif is_prime(num):\n    print(f\"{num} is a prime number.\")\nelse:\n    print(f\"{num} is not a prime number.\")\n```",
    "output": "- Readability (Status: POSITIVE)\n  - The code is well-structured and easy to read.\n  - The use of comments helps in understanding the purpose of the different sections of code.\n- Uniformity (Status: POSITIVE)\n  - The code follows consistent indentation and spacing conventions, improving uniformity.\n- Understandability (Status: POSITIVE)\n  - The code is simple and straightforward to unde

In [24]:
import pandas as pd
# Tabular view of the preprocessed tasks: regen_tasks_ is a list of dicts, so each
# dict becomes one row and the dict keys become the columns.
task_df = pd.DataFrame(regen_tasks_)
# Show only the first three rows as a quick sanity check.
task_df.head(3)

Unnamed: 0,instruction,input,output
0,Perform a code review on the provided Python s...,```\n# Python script to check if a number is p...,- Readability (Status: POSITIVE)\n - The code...
1,Conduct a code review on the provided JavaScri...,```javascript\n// JavaScript function to calcu...,- Readability (Status: POSITIVE)\n - The code...
2,Review the given C++ code and provide feedback...,```cpp\n// C++ program to reverse an array\n#i...,- Readability (Status: POSITIVE)\n - The code...


In [49]:
import math

# Count whitespace-separated words in every cell. A column-wise Series.map is used
# instead of DataFrame.applymap, which pandas deprecated in 2.1 (renamed to
# DataFrame.map); this spelling gives the same counts on older and newer pandas.
word_count_df = task_df.apply(lambda column: column.map(lambda cell: len(str(cell).split())))
total_words = word_count_df.sum().sum()
ideal_total_words = 100_000
print('Total words:', total_words)
print('Ideal total words:', ideal_total_words)

# How many datasets of the current size are needed to reach the target word count.
n = math.ceil(ideal_total_words/total_words)
print(f'How many dataset we need? {n} * current dataset = {n*len(task_df)} data')

Total words: 21128
Ideal total words: 100000
How many dataset we need? 5 * current dataset = 500 data


In [21]:
# Export step (disabled): uncomment to write the preprocessed tasks to coderev_data.json.
# with open('coderev_data.json', 'w') as f:
#     json.dump(regen_tasks_, f)