# Docstring writer with LLMs (Azure OpenAI)

## Imports

In [1]:
import os
import pandas as pd
import openai
from openai import AzureOpenAI
import json
from tqdm import tqdm
import time

from utils import (
    load_config, 
    create_prompt, 
    list_all_filepaths,
    download_proxy
    )
from project_dirs import PROJECT_DIR, DATA_DIR, OUTPUT_DIR

%load_ext autoreload
%autoreload 3

# Make sure the output folder exists before any cell writes into it
# (exist_ok=True makes re-running this cell harmless).
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Config, Azure OpenAI params

In [2]:
# Load the project configuration and build the Azure OpenAI client.
cnf = load_config(cnf_dir=PROJECT_DIR, cnf_name='config.yml')


def _read_key_file(filename):
    """Return the content of PROJECT_DIR/keys/<filename> without surrounding whitespace.

    The file is closed as soon as it has been read. Stripping all surrounding
    whitespace (not just "\n") also removes a stray "\r" left by CRLF files.
    """
    with open(os.path.join(PROJECT_DIR, "keys", filename), "r", encoding="utf-8") as key_file:
        return key_file.read().strip()


# The deployment name must be the string stored in the config; the previous
# literal list ['deployment_name'] was passed as `model=` to the API.
deployment_name = cnf['deployment_name']
api_version = cnf['api_version']
api_key = _read_key_file("azure_openai_key.txt")
endpoint = cnf['endpoint']

# Module-level settings on the `openai` package.
# NOTE(review): the requests in this notebook go through the explicit `client`
# below, so these look like leftovers from the pre-1.0 API - confirm whether
# they are still needed.
openai.api_type = cnf['api_type']
openai.api_key = _read_key_file(cnf['openai_key_file'])
openai.api_base = endpoint
openai.api_version = api_version

# The API key is passed explicitly (read from the key file above), not taken
# from the AZURE_OPENAI_API_KEY environment variable.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=endpoint,
)


## Set proxy

In [3]:
# Route HTTP(S) traffic through the authenticated corporate proxy.
from urllib.parse import quote

# NOTE(review): presumably returns "host:port" (it is placed right after the
# "@" of the proxy URL below) - confirm against utils.download_proxy.
proxy_server_port = download_proxy(cnf['url_proxy'])

# Set the proxy server address and port
# proxy_server = 'proxyre12.group.credem.net'
# proxy_port = '8080'

# Set the proxy username and password
proxy_username = 'ut13419'
with open(os.path.join(PROJECT_DIR, "keys", "proxy_password.txt"), "r") as password_file:
    proxy_password = password_file.read().strip("\n")

# Percent-encode the credentials: a password containing characters such as
# "@", ":", "/" or "#" would otherwise produce a malformed proxy URL.
encoded_username = quote(proxy_username, safe='')
encoded_password = quote(proxy_password, safe='')
proxy_url = f'http://{encoded_username}:{encoded_password}@{proxy_server_port}'

# Set the environment variables (same proxy for both schemes)
os.environ['HTTP_PROXY'] = proxy_url
os.environ['HTTPS_PROXY'] = proxy_url


## Data, Analysis

In [6]:
# Collect the paths of the "py" files found under DATA_DIR.
filepaths = list_all_filepaths(common_dir=DATA_DIR, folder="", extension="py")

In [7]:
# Work on the first file only.
path = filepaths[0]

with open(path, mode='r', encoding='utf-8') as f:
    text = f.read()

# Cut the source at every top-level "def": element 0 is whatever precedes the
# first function, the remaining elements are function sources with the
# leading "def " removed by the split.
funcs = text.split(sep='\ndef ')

In [8]:
# Ask the model to rewrite every function and assemble the new module text.
imports = funcs[0]  # everything before the first top-level "def"
updated_funcs = []

for ix, func in enumerate(tqdm(funcs[1:])):
    # The split on '\ndef ' removed the keyword; put it back.
    func = 'def ' + func.strip()

    prompt = create_prompt(text=func)
    completion = client.chat.completions.create(
        model=deployment_name,  # e.g. gpt-35-instant
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        temperature=0,
        max_tokens=1000,
    )
    choice = completion.choices[0]
    # `content` can be None (e.g. filtered responses); treat that as empty text.
    completion_text = (choice.message.content or '').strip()

    # A reply cut off by max_tokens would be written as incomplete code.
    if choice.finish_reason == 'length':
        print(f"Function {ix} reached the max_tokens limit. Output may be truncated")

    # Make sure the func starts with def: drop any preamble before the first
    # "def " but keep everything after it. (The previous split-based version
    # lost the space after "def" and discarded the text following a second
    # "def " occurrence.)
    def_start = completion_text.find('def ')
    if def_start == -1:
        print(f"Function {ix} was not processed correctly. Writing raw output")
        updated_func = '\n\n' + completion_text
    else:
        updated_func = '\n\n' + completion_text[def_start:]

    updated_funcs.append(updated_func)

updated_functions = ''.join(updated_funcs)
final_str = f"{imports}{updated_functions}"

  0%|          | 0/14 [00:03<?, ?it/s]


APIConnectionError: Connection error.

In [41]:
# Persist the rewritten module.
output_path = os.path.join(OUTPUT_DIR, 'functions.py')

# The source was read as UTF-8, so write it back as UTF-8 too; the platform
# default encoding can fail on (or corrupt) non-ASCII characters.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(final_str)
    

# Tests

In [22]:
# Pick one split chunk for manual inspection (index 0 holds the text before
# the first function, so index 2 is the second function of the file).
func = funcs[2]

In [23]:
func

'get_cols_too_similar(data, threshold=0.95):\n    """\n    Find features with too many similar values.\n    :return: the pandas dataframe of sought features with the fraction of values which are similar, \n             as well as a list containing the most present value.\n    \n    :data: (pd.DataFrame) dataset\n    :threshold: (float, default=0.95) fraction of similar values, must be a number in [0,1] interval\n    """\n    \n    L = len(data)\n    \n    cols_counts = list()\n\n    for col in data.columns:\n        try:\n            unique_values, unique_counts = np.unique(data[col].values, return_counts=True)\n        except TypeError:\n            unique_values, unique_counts = np.unique(data[col].astype(str).values, return_counts=True)\n\n        idx_max = np.argmax(unique_counts)\n        cols_counts.append((col, unique_values[idx_max], unique_counts[idx_max]))\n    \n    colname_and_values = map(lambda x: (x[0], x[2]), cols_counts)\n    most_present_value = map(lambda x: x[1], co

In [24]:
# Run the prompt for a single function and keep the raw model reply.
func = ''.join(['def ', funcs[2].strip()])  # restore the "def " removed by the split

prompt = create_prompt(text=func)
user_message = {"role": "user", "content": prompt}
completion = client.chat.completions.create(
    model=deployment_name,  # e.g. gpt-35-instant
    messages=[user_message],
    temperature=0,
    max_tokens=1000,
)
completion_text = completion.choices[0].message.content

# No post-processing here on purpose: the reply is left untouched so that the
# following cells can inspect it as returned by the model.

In [25]:
completion_text

'def get_cols_too_similar(data: pd.DataFrame, threshold: float = 0.95) -> Tuple[pd.DataFrame, List]:\n    """\n    Find features with too many similar values.\n\n    Args:\n        data: (pd.DataFrame) dataset\n        threshold: (float, default=0.95) fraction of similar values, must be a number in [0,1] interval\n\n    Returns:\n        Tuple[pd.DataFrame, List]: A tuple containing the pandas dataframe of sought features with the fraction of values\n        which are similar, as well as a list containing the most present value.\n\n    This function takes a pandas dataframe and a threshold value as input and returns a tuple containing the pandas\n    dataframe of sought features with the fraction of values which are similar, as well as a list containing the most\n    present value. The function finds features with too many similar values by calculating the fraction of similar\n    values for each feature and returning the features with a fraction greater than or equal to the threshold 

In [26]:
# Dump the raw model reply for inspection. Write it as UTF-8 explicitly: the
# platform default encoding may not be able to represent every character the
# model returns.
with open(os.path.join(OUTPUT_DIR, 'f2.py'), 'w', encoding='utf-8') as f:
    f.write(completion_text)

In [28]:
# Count the pieces produced by splitting on the bare substring 'def'. It also
# matches inside words such as "default", so one function can give > 2 pieces.
len(completion_text.split('def'))

3

In [30]:
# Second piece of the split: it ends right before the next 'def' occurrence,
# so it is only a fragment of the function, not the whole function.
completion_text.split('def')[1]

' get_cols_too_similar(data: pd.DataFrame, threshold: float = 0.95) -> Tuple[pd.DataFrame, List]:\n    """\n    Find features with too many similar values.\n\n    Args:\n        data: (pd.DataFrame) dataset\n        threshold: (float, '

In [33]:
# Show the full response object; check `finish_reason` on the choice to see
# whether the reply ended normally or was cut off.
completion

ChatCompletion(id='chatcmpl-8hKFbS6Gog64JWixAZN4zPlcNO8OF', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='def get_cols_too_similar(data: pd.DataFrame, threshold: float = 0.95) -> Tuple[pd.DataFrame, List]:\n    """\n    Find features with too many similar values.\n\n    Args:\n        data: (pd.DataFrame) dataset\n        threshold: (float, default=0.95) fraction of similar values, must be a number in [0,1] interval\n\n    Returns:\n        Tuple[pd.DataFrame, List]: A tuple containing the pandas dataframe of sought features with the fraction of values\n        which are similar, as well as a list containing the most present value.\n\n    This function takes a pandas dataframe and a threshold value as input and returns a tuple containing the pandas\n    dataframe of sought features with the fraction of values which are similar, as well as a list containing the most\n    present value. The function finds features with too many sim