# Libraries and Utils

In [163]:
import random
import re
import utils
import json


def check_tasks(tasks):
  prompt = ''
  for idx, task_dict in enumerate(tasks):
    if 'instances' in task_dict:
      (instruction, input, output) = task_dict["instruction"], task_dict["instances"][0]["input"], task_dict["instances"][0]["output"]
    else:
      (instruction, input, output) = task_dict["instruction"], task_dict["input"], task_dict["output"]
    instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
    
    input = "<noinput>" if input.lower() == "" else input
    prompt += f"###\n"
    prompt += f"{idx + 1}. Instruction: {instruction}\n"
    prompt += f"{idx + 1}. Input:\n{input}\n"
    prompt += f"{idx + 1}. Output:\n{output}\n"
  print(prompt)


def get_tasks_outputs(tasks):
  return [
    task_dict["instances"][0]["output"] if 'instances' in task_dict else task_dict["output"] 
    for task_dict in tasks
  ]

def format_ouput(output):
    lines = output.split('\n')
    result = {}

    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.startswith('-'):
            key = line
            result[key] = []
            i += 1
            while i < len(lines) and lines[i].startswith('  -'):
                result[key].append(lines[i])
                i += 1
        else:
            i += 1

    formatted_result = {}
    for key, values in result.items():
        formatted_result[key] = values

    return formatted_result
    
def check_tasks_outputs(tasks):
    tasks_outputs = get_tasks_outputs(tasks)

    for idx, output in enumerate(tasks_outputs):
        formatted_output = format_ouput(output)
        factors = list(formatted_output.keys())
        factors = [s.split('(')[0].strip('- ') for s in factors]
        
        if factors != ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']:
            print(f'Output #{idx+1}')
            print('avg_similarity_score\t>> ', tasks[idx]["avg_similarity_score"]) # I use this for unique ID
            print('factors\t\t\t>> ', factors)
            print()
            
def filter_tasks(task):
  output = task['output']
  formatted_output = format_ouput(output)
  factors = list(formatted_output.keys())
  factors = [s.split('(')[0].strip('- ') for s in factors]
  
  return factors == ['Readability', 'Uniformity', 'Understandability', 'Correctness', 'Performance', 'Security']

# Seed Tasks

In [164]:
seed_tasks = utils.jload('seed_tasks.json')
check_tasks(random.sample(seed_tasks, 3))

###
1. Instruction: Perform a code review on the given Python code file and provide feedback on the following factors: readability, uniformity, understandability, correctness, performance, and security.
1. Input:
```
def calculate_area(radius):
    import math
    # Check if radius is a positive number
    if radius <= 0:
        return "Invalid radius. Please provide a positive number."
    # Calculate the area
    area = math.pi * radius ** 2
    return area
def calculate_circumference(radius):
    import math
    # Check if radius is a positive number
    if radius <= 0:
        return "Invalid radius. Please provide a positive number."
    # Calculate the circumference
    circumference = 2 * math.pi * radius
    return circumference
radius = float(input("Enter the radius: "))
area = calculate_area(radius)
print(f"The area of the circle is: {area}")
circumference = calculate_circumference(radius)
print(f"The circumference of the circle is: {circumference}")
```
1. Output:
- Readabi

# Generated Tasks

In [191]:
regen_tasks_path = 'regen.json'
regen_tasks = utils.jload(regen_tasks_path)
print(f'Generated Tasks Length: {len(regen_tasks)}')

Generated Tasks Length: 496


## Check Tasks Output

In [192]:
print('len(regen_tasks) >>', len(regen_tasks))
check_tasks_outputs(regen_tasks)

len(regen_tasks) >> 496


## Filter Tasks

In [193]:
filtered_regen_tasks = list(filter(filter_tasks, regen_tasks))

print('len(filtered_regen_tasks) >>', len(filtered_regen_tasks))
check_tasks_outputs(filtered_regen_tasks)

len(filtered_regen_tasks) >> 496


In [189]:
# with open(regen_tasks_path, 'w') as file:
#     json.dump(filtered_regen_tasks, file)

# print(f'{regen_tasks_path} filtered sucessfully.')

regen.json filtered sucessfully.


## Preprocess Tasks for Final Dataset

In [167]:
regen_tasks_ = [{k: v for k, v in d.items() if k not in {'most_similar_instructions', 'avg_similarity_score'}} for d in regen_tasks]
print(json.dumps(regen_tasks_[:3], indent=2))

[
  {
    "instruction": "Perform a code review on the provided Python script and provide feedback on the readability, uniformity, understandability, correctness, performance, and security aspects of the code.",
    "input": "```\n# Python script to check if a number is prime or not\n\ndef is_prime(number):\n    if number < 2:\n        return False\n    for i in range(2, number):\n        if number % i == 0:\n            return False\n    return True\n\nnum = 17\n\nif is_prime(num):\n    print(f\"{num} is a prime number.\")\nelse:\n    print(f\"{num} is not a prime number.\")\n```",
    "output": "- Readability (Status: POSITIVE)\n  - The code is well-structured and easy to read.\n  - The use of comments helps in understanding the purpose of the different sections of code.\n- Uniformity (Status: POSITIVE)\n  - The code follows consistent indentation and spacing conventions, improving uniformity.\n- Understandability (Status: POSITIVE)\n  - The code is simple and straightforward to unde

## Determine Min. Dataset Size

In [168]:
import pandas as pd
task_df = pd.DataFrame(regen_tasks_)
task_df.head(3)

Unnamed: 0,instruction,input,output
0,Perform a code review on the provided Python s...,```\n# Python script to check if a number is p...,- Readability (Status: POSITIVE)\n - The code...
1,Conduct a code review on the provided JavaScri...,```javascript\n// JavaScript function to calcu...,- Readability (Status: POSITIVE)\n - The code...
2,Review the given C++ code and provide feedback...,```cpp\n// C++ program to reverse an array\n#i...,- Readability (Status: POSITIVE)\n - The code...


In [169]:
import math

word_count_df = task_df.applymap(lambda x: len(str(x).split()))
total_words = word_count_df.sum().sum()
print('Total words:', total_words)
print('Total data:', len(task_df))
words_per_data = math.floor(total_words / len(task_df))
print('Words/data:', words_per_data)

print()
min_total_words = 100_000
print('Min. total words:', min_total_words)
print()

n_needed_data = math.floor((min_total_words-total_words) / words_per_data)
n_needed_data
print(f'Total data needed: {n_needed_data}')

Total words: 106501
Total data: 503
Words/data: 211

Min. total words: 100000

Total data needed: -31


## Export Dataset

In [170]:
with open('coderev_data.json', 'w') as f:
    json.dump(regen_tasks_, f)