In [1]:
from scripts_coder import *
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List
import instructor
import json

class Prompts(BaseModel):
    prompt: str 

client = instructor.from_openai(
    OpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama",  
    ),
    mode=instructor.Mode.JSON,
)


def get_difficulty(i):
    if i == 1:
        return ["pandas"] 
    elif i == 2:
        return ["pandas", "numpy"] 
    elif i == 3:
        return ["pandas", "pyjanitor"]  
    elif i == 4:
        return ["pandas", "dask"]
    elif i == 5:
        return ["pandas", "dask", "sqlalchemy"]  
    else:
        return None


def create_first_prompt(propriedade):
    prompts = []
    resp = client.chat.completions.create(
        model="gemma2:2b",
        messages=[  
            {
                "role": "user",
                "content": f"Imagine you are a human asking a language model to return Python code for cleaning a \
                    dirty CSV file. Create a prompt to request this code with a poor {propriedade} \
                    (rated 1 on a scale of 5). Do not include examples or fields to complete; focus on \
                    delivering a prompt that reflects a poor {propriedade} as described."
            }
        ],
        response_model=Prompts,
    )

    prompt = resp.prompt
    prompts_formated = {"propriedade": propriedade, "valor": 1, "prompt": prompt}

    return prompts_formated

def create_following_prompts(previous_prompt, message):
    
    resp = client.chat.completions.create(
        model="gemma2:2b",
        messages=[
            {
                "role": "user",
                "content": f"{message}: {previous_prompt["prompt"]}",  
            }
        ],
        response_model=Prompts,
    )
    prompt = resp.prompt
    prompts_formated = {"propriedade": previous_prompt["propriedade"], "valor": previous_prompt["valor"] + 1, "prompt": prompt}

    return prompts_formated

def create_complexity_prompts(i):
    resp = client.chat.completions.create(
        model="gemma2:2b",
        messages=[  
            {
                "role": "user",
                "content": f"Imagine you are a human asking a language model to return Python code for cleaning a \
                    dirty CSV file. Create a prompt to request this code and also request that it must contain\
                    the following libraries {get_difficulty(i)}."
            }
        ],
        response_model=Prompts,
    )

    prompt = resp.prompt
    prompts_formated = {"propriedade": "complecity", "valor": i, "prompt": prompt}

    return prompts_formated

    
def add_prompt_to_json(propriedade, prompts):
    json_filename = f'prompts_{propriedade}_coder.json'
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(prompts, json_file, ensure_ascii=False, indent=2)


### Clarity for Coder

In [10]:
prompt_1_clarity = create_first_prompt("clarity")
prompt_1_clarity

{'propriedade': 'clarity',
 'valor': 1,
 'prompt': 'I gotta clean up this messed-up CSV file...  Python, please! 🥺'}

In [16]:
message = "I found this prompt not very clear to understand, plase make it a BIT more clearer"
prompt_2_clarity = create_following_prompts(prompt_1_clarity, message)
prompt_2_clarity

{'propriedade': 'clarity',
 'valor': 2,
 'prompt': 'I need help cleaning up this messy CSV file using Python. Can you write a script that can handle the cleaning and data manipulation tasks efficiently?'}

In [17]:
prompt_3_clarity = create_following_prompts(prompt_2_clarity, message)
prompt_3_clarity

{'propriedade': 'clarity',
 'valor': 3,
 'prompt': 'I need help cleaning up this messy CSV file using Python. You need to create a Python script that can clean and manipulate it efficiently. The file is currently unorganized and needs to be properly formatted.'}

In [18]:
prompt_4_clarity = create_following_prompts(prompt_3_clarity, message)
prompt_4_clarity

{'propriedade': 'clarity',
 'valor': 4,
 'prompt': 'I need your help cleaning up a messy CSV file in Python. I want a Python script that can: \n\n1. **Clean the data:** Remove empty rows and fix inconsistencies.\n2. **Format the data:** Make it properly organized with correct header labels and column formatting. '}

In [20]:
prompt_5_clarity = create_following_prompts(prompt_4_clarity, message)
prompt_5_clarity

{'propriedade': 'clarity',
 'valor': 5,
 'prompt': "I need help creating a Python script to clean up an inconsistent CSV file and format the results neatly. Here's what I envision my script doing:\n\n1. **Data cleaning:** Identify and remove any empty rows or cells from the data.\n2. **Format improvement:** Put the data into a well-organized structure. Use correct header labels, define meaningful column names, and format the content to be easy to understand."}

In [23]:
prompts_clarity = [prompt_1_clarity, prompt_2_clarity, prompt_3_clarity, prompt_4_clarity, prompt_5_clarity]
add_prompt_to_json("clarity", prompts_clarity)

### Complexity for Coder

In [43]:
prompt_1_complexity = create_complexity_prompts(1)
prompt_1_complexity

{'propriedade': 'complecity',
 'valor': 1,
 'prompt': 'I have a dirty CSV file that I need to clean up in Python using the pandas library.  Please provide me with code that handles tasks such as removing empty rows, filling missing values with appropriate data, and handling inconsistent data types. Can you include an example for handling missing values specifically? This should be a simple program aimed at demonstrating my intent.'}

In [46]:
prompt_2_complexity = create_complexity_prompts(2)
prompt_2_complexity

{'propriedade': 'complecity',
 'valor': 2,
 'prompt': "Can you provide me with Python code that cleans up a dirty CSV file? The code should use the 'pandas' and 'numpy' libraries. It would be helpful if it included some basic cleaning steps like handling missing values, converting formats to correct types, and checking for inconsistencies in the data. Please focus on creating a concise and well-structured script that showcases best practices."}

In [53]:
prompt_3_complexity = create_complexity_prompts(3)
prompt_3_complexity

{'propriedade': 'complecity',
 'valor': 3,
 'prompt': "I need Python code to clean up this messy CSV file. Please create a function or script using pandas and pyjanitor to do the following: \n- Read in my CSV data\n- Address inconsistencies such as missing values, incorrect formats, and any other issues that might come up\n- Prepare the cleaned data for further processing!\n\nMake sure this code uses both 'pandas' and 'pyjanitor' libraries -  Thanks!"}

In [54]:
prompt_4_complexity = create_complexity_prompts(4)
prompt_4_complexity

{'propriedade': 'complecity',
 'valor': 4,
 'prompt': 'Can you provide me with some Python code to clean up a dirty CSV file using pandas and dask?  The goal is to handle common issues like missing values, inconsistent formatting, and outliers. The code should perform the cleaning step-by-step, detailing any crucial steps each library provides for data analysis. For example, how would you: \n* Handle missing values?\n* Convert date formats?\n* Deal with duplicates or outliers?\nPlease include comments to help understand your implementation.'}

In [55]:
prompt_5_complexity = create_complexity_prompts(5)
prompt_5_complexity

{'propriedade': 'complecity',
 'valor': 5,
 'prompt': 'Please provide a Python script for cleaning a dirty CSV file using pandas, dask and sqlalchemy. The script should be able to: \n1. Read the CSV.\n2. Identify and handle missing values where appropriate.\n3. Perform data type conversion and normalization as needed.\n4. Optionally, use SQLAlchemy to query an external database for additional data or verification if applicable.\n\nSpecifically, look into potential pandas, dask functionalities, and sqlalchemy integrations for maximum efficiency.'}

In [56]:
prompts_complexity = [prompt_1_complexity, prompt_2_complexity, prompt_3_complexity, prompt_4_complexity, prompt_5_complexity]
add_prompt_to_json("complexity", prompts_complexity)

### Size for Coder

In [68]:
prompt_1_size = create_first_prompt("size")
prompt_1_size

{'propriedade': 'size',
 'valor': 1,
 'prompt': 'Please provide Python code to clean a dirty CSV file.'}

In [100]:
message = "add about 30 words to this prompt to make it bigger"
prompt_2_size = create_following_prompts(prompt_1_size, message)
prompt_2_size

{'propriedade': 'size',
 'valor': 2,
 'prompt': 'Provide me with Python code that cleans a messy CSV file. This code should include steps for handling missing data, identifying and removing duplicate rows, converting columns to the correct type (e.g., Date format).  Remember any necessary libraries (Pandas or others) should be imported'}

In [101]:
prompt_3_size = create_following_prompts(prompt_2_size, message)
prompt_3_size

{'propriedade': 'size',
 'valor': 3,
 'prompt': 'I need a Python script that cleans my messy CSV file. It needs to handle missing data by using pandas techniques like imputation. The code should identify and remove duplicate rows, and convert columns like dates to the correct format.  Please make sure to mention any necessary libraries used like pandas, for example.  '}

In [102]:
prompt_4_size = create_following_prompts(prompt_3_size, message)
prompt_4_size

{'propriedade': 'size',
 'valor': 4,
 'prompt': 'I need a Python script that cleans my messy CSV file (.csv) thoroughly.  It needs to account for missing data (e.g., NaN values) by using pandas features like imputation. The script should also identify and remove duplicate rows, and convert columns like dates into the proper dates format.  Additionally, it would be helpful to list the libraries used in the process such as pandas, numpy, etc.  \n'}

In [104]:
prompt_5_size = create_following_prompts(prompt_4_size, message)
prompt_5_size

{'propriedade': 'size',
 'valor': 5,
 'prompt': "Please provide a Python script that cleans a messy CSV file (e.g., .csv), ensuring data is handled correctly: **\n** - It should handle missing values accurately using pandas' imputation features.\n - Identify and remove duplicate rows.\n - Convert columns like dates into proper date format.\n - List the libraries you used in the process (like pandas, numpy, etc.).   It's crucial to understand if there are any specific challenges during data cleaning or preprocessing that might need special consideration.  The script should demonstrate a thorough cleaning process"}

In [105]:
prompts_size = [prompt_1_size, prompt_2_size, prompt_3_size, prompt_4_size, prompt_5_size]
add_prompt_to_json("size", prompts_size)

### Coder Specificity

In [3]:
prompt_1_specificity = create_first_prompt("specificity")
prompt_1_specificity

{'propriedade': 'specificity',
 'valor': 1,
 'prompt': 'Write some python code to clean up a CSV file that has extra data, missing values etc.'}

In [11]:
message = "Can you make this prompt a little more specific??"
prompt_2_specificity = create_following_prompts(prompt_1_specificity, message)
prompt_2_specificity

{'propriedade': 'specificity',
 'valor': 2,
 'prompt': "Write some Python code to clean up a CSV file (e.g., 'data.csv') containing extra data such as unnecessary rows or columns, replace missing values with specific ones, and format the output in a way that is easy to analyze."}

In [13]:
prompt_3_specificity = create_following_prompts(prompt_2_specificity, message)
prompt_3_specificity

{'propriedade': 'specificity',
 'valor': 3,
 'prompt': "Write some Python code to clean up a CSV file ('data.csv'), removing unnecessary rows or columns (e.g., header information) and replacing missing data with specific values like 0 or NaN, then format the output for easier analysis.\n\nHere are some considerations:\n - The target output file should be named 'cleaned_data.csv'.\n - Use pandas to process the CSV. \n  - Format the cleaned output as concise and easily analyzable data."}

In [15]:
prompt_4_specificity = create_following_prompts(prompt_3_specificity, message)
prompt_4_specificity

{'propriedade': 'specificity',
 'valor': 4,
 'prompt': "Write Python code to clean a CSV ('data.csv') for easier analysis, preparing the final file named 'cleaned_data.csv' .  Specifically include these steps:\n\n1. Remove unnecessary rows/columns including header information.\n2. Replace missing data values with specified ones like 0 or NaN\n3. Format the output as a concise and analyzable CSV format.\n\n**Tools:** Use pandas for this operation. \n**Considerations:** target output csv file to be named 'cleaned_data.csv'.  "}

In [27]:
prompt_5_specificity = create_following_prompts(prompt_4_specificity, message)
prompt_5_specificity

{'propriedade': 'specificity',
 'valor': 5,
 'prompt': "Write Python code using pandas to clean the 'data.csv' CSV file, prepare it for analysis. Name the final cleaned file 'cleaned_data.csv'. \n Specifically:\n 1. Remove unnecessary rows and columns including the header information.\n 2. Replace missing data values with specified ones like 0 or NaN.\n 3. Format the output as a concise and analyzable CSV format.\n\n Tools: Use pandas to perform this operation and use 'cleaned_data.csv' for the final file creation. "}

In [28]:
prompts_specificity = [prompt_1_specificity, prompt_2_specificity, prompt_3_specificity, prompt_4_specificity, prompt_5_specificity]
add_prompt_to_json("specificity", prompts_specificity)