In [1]:
import os
import openai
import pandas as pd
import logging

# Read the API key from the environment so it is never stored in the notebook.
# os.getenv returns None when the variable is unset; API calls will then fail.
openai.api_key = os.getenv("OPENAI_API_KEY")

# logging.basicConfig opens the log file right away and raises FileNotFoundError
# when the parent directory does not exist, so make sure it is there first.
os.makedirs('../logs', exist_ok=True)

# Every record is followed by a row of asterisks so that multi-line prompts and
# responses stay visually separated in the log file.
logging.basicConfig(filename='../logs/chatbot_prototype.log',
                    format='%(asctime)s - %(levelname)s:\n%(message)s\n*************************************************************************\n\n',
                    datefmt="%m/%d/%Y %I:%M:%S %p %Z",
                    level=logging.INFO)

In [13]:
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def log_completion_call(args, running_prompt, gpt_response):
    """Write one completion round-trip to the root logger at INFO level.

    The record contains three sections, separated by rows of asterisks:
    the request parameters (``args``), the prompt that was sent
    (``running_prompt``) and the text that came back (``gpt_response``).
    """
    message = (
        f"Parameters:\n{args}\n"
        "***************\n"
        f"Prompt:\n{running_prompt}\n"
        "***************\n"
        f"Output:\n{gpt_response}"
    )
    logging.info(message)

# fine tuning

In [None]:
# Table that accumulates the (prompt, completion) examples for fine-tuning.
fine_tuning_data = pd.DataFrame(columns=["prompt", "completion"])


def add_row_to_fine_tuning_data(prompt, completion, fine_tuning_data=fine_tuning_data):
    """Append one (prompt, completion) example to ``fine_tuning_data`` in place.

    The default target is the DataFrame created just above; the default is
    bound when this ``def`` runs. The new row is stored under the label
    ``len(fine_tuning_data)``. Nothing is returned.
    """
    next_label = len(fine_tuning_data)
    new_row = [prompt, completion]
    fine_tuning_data.loc[next_label] = new_row

In [None]:
# Seed the fine-tuning set with short metric definitions, one prompt/completion
# pair per call. Re-running this cell appends the same rows again.
add_row_to_fine_tuning_data("Define MAE", "the sum of absolute errors divided by the sample size")
add_row_to_fine_tuning_data("Define MSE", "the sum of squared errors divided by the sample size")
add_row_to_fine_tuning_data("Define accuracy", "the fraction of observations that you predicted correctly")
add_row_to_fine_tuning_data("Define precision", "the fraction of predicted positives that are actually positives")
# Recall = true positives / actual positives. The previous wording
# ("actual positives that are actually positives") was circular and wrong.
add_row_to_fine_tuning_data("Define recall", "the fraction of actual positives that are predicted as positives")

In [None]:
fine_tuning_data

In [None]:
# Save the examples as CSV without the index column.
# NOTE(review): the relative "fine_tuning_data/" directory must already exist,
# otherwise the write fails — confirm it is created outside this notebook.
fine_tuning_data.to_csv("fine_tuning_data/fine_tuning_data_1.csv", index=False)

In [None]:
# Upload the prepared JSONL training file. The context manager closes the file
# handle as soon as the upload call returns (a bare open() would leave it open).
# NOTE(review): the "_prepared.jsonl" file is not produced in this notebook —
# presumably it comes from an external preparation step; confirm.
with open("fine_tuning_data/fine_tuning_data_1_prepared.jsonl") as training_file:
    uploaded_file = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )

# Display the API response for the upload.
uploaded_file


In [None]:
# Query the Files endpoint and display what it returns.
openai.File.list()

In [None]:
# Create a fine-tuning job on the "davinci" model.
# NOTE(review): the training_file id is hard-coded — presumably copied from an
# earlier upload response; it has to be replaced whenever the file is re-uploaded.
openai.FineTune.create(
    training_file="file-CTW0Uvb2hRnXwES7vv0KpiHr",
    model="davinci",
)

In [None]:
# Display the events of one fine-tuning job.
# NOTE(review): the job id is hard-coded — presumably copied from a previous
# FineTune.create response; update it for a new job.
openai.FineTune.list_events(id="ft-RQqocx8ATON9RT2GU0c2rsHw")

In [None]:
# Fetch and display the record of the same hard-coded fine-tuning job.
openai.FineTune.retrieve(id="ft-RQqocx8ATON9RT2GU0c2rsHw")

In [None]:

# openai.Model.list()


# Single topic, with no context about the use case

In [None]:
# Single-topic interviewer prompt: sets up the Sam/Amol role-play about MAE and
# tells the model to emit the literal sentinel "TOPIC COMPLETE" when the topic is
# finished. The cells below look for that sentinel in the model output.
# Note: a later cell reassigns `starting_prompt` with a multi-topic template.
starting_prompt = """
Sam is interviewing Amol for a job as a Data Scientist at a company called Urbint. 
Sam is friendly and empathetic. 
Sam wants to discuss what MAE is. 
Sam must not answer the questions himself.

Follow the following logic when discussing a topic: 
- State that we are going to discuss that topic 
- Create a question about how to do it 

After Amol answers a question about how to do something: 
- If he has answered it correctly, tell him that it is correct and create a follow up question about how it works. 
- If he answers a question incorrectly, tell him that it is incorrect and explain the correct answer. Then output "TOPIC COMPLETE" 

After Amol answers a follow up question about how it works: 
- If he has answered it correctly, tell him that it is correct. Then output "TOPIC COMPLETE" 
- If he answers it incorrectly, tell him that it is incorrect and explain the correct answer. Then output "TOPIC COMPLETE"
"""

## manual 

In [None]:
# Start a fresh manual conversation: no model turn and no user turn yet, and
# the transcript is the instruction prompt followed by the cue for Sam's first line.
user_input = gpt_response = ""
running_prompt = f"{starting_prompt}\nSam:"

In [None]:
# Manual loop step: append Sam's last reply and Amol's answer to the transcript,
# then cue Sam's next turn. This cell is not idempotent — each run appends
# another exchange, so run it exactly once per turn.
running_prompt += gpt_response + "\nAmol: " + user_input + "\nSam:"
# Display the transcript that will be sent next.
running_prompt

In [None]:
# Ask the model for Sam's next line given the transcript so far. Generation
# stops as soon as the model starts writing Amol's turn.
completion_response = openai.Completion.create(
    prompt=running_prompt,
    model="text-davinci-002",
    max_tokens=2125,
    temperature=0.7,
    frequency_penalty=0.59,
    stop=["Amol:", "Amol: "],
)
gpt_response = completion_response.choices[0].text

gpt_response

In [None]:
# Show Sam's reply as the input prompt and read Amol's answer from the user.
user_input = input(gpt_response)

# Echo the answer that was captured.
user_input

## automatic

In [None]:
# Automatic interview loop for the single-topic prompt:
#   while not all topics have been covered,
#     model turn  -> Sam asks a question or reacts to the last answer
#     user turn   -> read Amol's answer
# A topic ends when the model emits the "TOPIC COMPLETE" sentinel.

topic_id = 0

# Only one topic exists in this section, hence the `<= 0` bound.
while (topic_id <= 0):

    count = 0
    # running_prompt = topic_prompt.format(topics[topic_id])
    running_prompt = starting_prompt + "\nSam:"
    gpt_response = ""
    user_input = ""

    # Runs until the sentinel is seen; `count` only marks whether this is the
    # first turn (nothing to append to the transcript yet).
    while True:
        if count > 0:
            running_prompt += gpt_response + "\nAmol: " + user_input + "\nSam:"

        # Rough prompt-size estimate: about 5/3 tokens per space-separated word.
        estimated_prompt_tokens = int(len(running_prompt.split(' ')) * 5/3)

        gpt_response = openai.Completion.create(
            # model="text-davinci-002",
            model="davinci:ft-personal-2022-11-03-13-01-28",
            prompt=running_prompt,
            temperature=0.7,
            frequency_penalty=0.59,
            # Budget what is left of the 2049-token window, but never ask for
            # zero or a negative number of tokens once the transcript is long.
            max_tokens=max(1, 2049 - estimated_prompt_tokens),
            stop=["Amol:", "Amol: "]
        ).choices[0].text

        if "TOPIC COMPLETE" in gpt_response:
            # Show Sam's closing feedback (without the sentinel); otherwise the
            # user never sees whether the last answer was judged correct.
            print(gpt_response.split("TOPIC COMPLETE")[0])
            topic_id += 1
            break

        user_input = input(gpt_response)

        count += 1
    

# for multiple topics

In [2]:
# Multi-topic interviewer prompt template. It is filled with str.format:
#   {0} -> the interviewee's name (the cells below pass "Amol")
#   {1} -> the topic phrase taken from `topics`
# The model is told to end each topic with the literal sentinel "TOPIC COMPLETE".
starting_prompt = """
Sam is interviewing {0} for a job as a Data Scientist at a company called Urbint. 
Sam is friendly and empathetic.

One of the major use cases of Urbint is to prevent damage caused by different types of construction on gas pipes. Important indicators for the risk of a construction project are the type of work being done (i.e. installing a pole is not as risky as installing a sewer) and topological information about where the construction is being done. 
Urbint has given {0} a customer's dataset, which contains tickets, where each ticket contains binary information such as if a certain work type was done (work type are given numerical numbers and not named to hide proprietary Urbint information), topological information such as elevation and slope of elevation, the date the ticket was created and whether there was an accident (binary). 
{0} has to build a predictive model that will find whether there will be an accident. The model must be interpretable. The model doesn't need to have high performance. 
The relationship between the independent and dependant variables is nonlinear.

Sam wants to discuss how {0} {1}. Sam does not answer the questions himself.

Sam follows the following logic when discussing a topic: 
- State that he wants to discuss that topic 
- Create a question about how {0} approached it
- After {0} answers a question about how they approached it:
    - If {0} has answered it correctly, tell {0} that it is correct. Then output "TOPIC COMPLETE" 
    - If {0} answers a question incorrectly, tell {0} that it is incorrect and explain the correct answer. Then output "TOPIC COMPLETE".

"""

In [None]:
# Prompt rules that were taken out of `starting_prompt` (the follow-up-question
# flow), kept here for reference. {0} is the interviewee-name placeholder.
# The closing triple quote sits on its own line: writing `"TOPIC COMPLETE""""`
# ends the literal at the first three quotes and leaves a stray `"`, which is a
# SyntaxError.
removed = """
- After {0} answers a question about how they approached it:
    - If {0} has answered it correctly, tell {0} that it is correct and create a follow up question about why {0} chose that approach as opposed to other options. 
    - If {0} answers a question incorrectly, tell {0} that it is incorrect and explain the correct answer. Then output "TOPIC COMPLETE" 

- After {0} answers a follow up question about why {0} chose that approach as opposed to other options:
    - If {0} has answered it correctly, tell {0} that it is correct. Then output "TOPIC COMPLETE" 
    - If {0} answers it incorrectly, tell {0} that it is incorrect and explain the correct answer. Then output "TOPIC COMPLETE"
"""

In [3]:
# Topic phrases substituted into the prompt as "Sam wants to discuss how <name> <topic>."
# Entries carry no trailing whitespace, so the sentence does not render as
# "... the Problem ." with a stray space before the period.
topics = ["Formulated the Problem",
          "Selected the Algorithm",
          "Designed the Experiment",
          "Engineered the Features",
          "Selected the final Model",
          "Presented the Model to the client"]

## manual

In [9]:
# Start a fresh manual conversation for one topic (index 1 of `topics`):
# fill the template with the interviewee name and topic, then cue Sam's first line.
user_input = gpt_response = ""
running_prompt = "".join([starting_prompt.format("Amol", topics[1]), "\nSam:"])
running_prompt

'\nSam is interviewing Amol for a job as a Data Scientist at a company called Urbint. \nSam is friendly and empathetic.\n\nOne of the major use cases of Urbint is to prevent damage caused by different types of construction on gas pipes. Important indicators for the risk of a construction project are the type of work being done (i.e. installing a pole is not as risky as installing a sewer) and topological information about where the construction is being done. \nUrbint has given Amol a customer\'s dataset, which contains tickets, where each ticket contains binary information such as if a certain work type was done (work type are given numerical numbers and not named to hide proprietary Urbint information), topological information such as elevation and slope of elevation, the date the ticket was created and whether there was an accident (binary). \nAmol has to build a predictive model that will find whether there will be an accident. The model must be interpretable. The model doesn\'t ne

In [12]:
# Manual loop step: append Sam's last reply and Amol's answer to the transcript,
# then cue Sam's next turn. Not idempotent — every run appends another exchange,
# so run it exactly once per turn.
running_prompt += gpt_response + "\nAmol: " + user_input + "\nSam:"
# Display the transcript that will be sent next.
running_prompt

'\nSam is interviewing Amol for a job as a Data Scientist at a company called Urbint. \nSam is friendly and empathetic.\n\nOne of the major use cases of Urbint is to prevent damage caused by different types of construction on gas pipes. Important indicators for the risk of a construction project are the type of work being done (i.e. installing a pole is not as risky as installing a sewer) and topological information about where the construction is being done. \nUrbint has given Amol a customer\'s dataset, which contains tickets, where each ticket contains binary information such as if a certain work type was done (work type are given numerical numbers and not named to hide proprietary Urbint information), topological information such as elevation and slope of elevation, the date the ticket was created and whether there was an accident (binary). \nAmol has to build a predictive model that will find whether there will be an accident. The model must be interpretable. The model doesn\'t ne

In [13]:
# Ask the model for Sam's next line and drop any leading whitespace from it.
# Generation stops as soon as the model starts writing Amol's turn.
completion_response = openai.Completion.create(
    prompt=running_prompt,
    model="text-davinci-002",
    max_tokens=2125,
    temperature=0.7,
    frequency_penalty=0.59,
    stop=["Amol:", "Amol: "],
)
gpt_response = completion_response.choices[0].text.lstrip()
gpt_response

"That's correct. TOPIC COMPLETE."

In [11]:
# Show Sam's reply as the input prompt and read Amol's answer from the user.
user_input = input(gpt_response)

# Echo the answer that was captured.
user_input

I want to discuss how you approached selecting the algorithm for your predictive model. How did you go about doing that? I would use a random forest classifier because the relationship between the independent and dependant variables is nonlinear.


'I would use a random forest classifier because the relationship between the independent and dependant variables is nonlinear.'

## automatic

In [6]:
# Keyword arguments shared by every completion request in the automatic loop.
# They are unpacked into openai.Completion.create and also written to the log.
args = {
    "model": "text-davinci-002",
    "temperature": 0,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "max_tokens": 2125,
    # Stop generating once the model starts writing the interviewee's turn.
    "stop": ["Amol:", "Amol: "],
}

In [7]:
# Run the interview over every entry in `topics`, one topic at a time.
# For each topic the transcript starts from the filled-in prompt template and
# alternates model turn / user turn until the model emits "TOPIC COMPLETE".
topic_id = 0

while topic_id < len(topics):

    count = 0
    running_prompt = starting_prompt.format("Amol", topics[topic_id]) + "\nSam:"
    gpt_response = ""
    user_input = ""

    # Runs until the sentinel is seen; `count` only marks whether this is the
    # first turn (nothing to append to the transcript yet).
    while True:
        if count > 0:
            running_prompt += gpt_response + "\nAmol: " + user_input + "\nSam:"

        gpt_response = openai.Completion.create(
            # model="davinci:ft-personal-2022-11-03-13-01-28",
            prompt=running_prompt,
            **args
        ).choices[0].text.lstrip()

        # log query
        log_completion_call(args, running_prompt, gpt_response)

        if "TOPIC COMPLETE" in gpt_response:
            # Show Sam's closing feedback without the sentinel. print() returns
            # None, so its result is not assigned back to gpt_response; the
            # variable keeps the model's text.
            print(gpt_response.split("TOPIC COMPLETE")[0])
            topic_id += 1
            break

        user_input = input(gpt_response)

        count += 1

I want to discuss how you formulated the problem. How did you go about it? binary classification


That's correct. 


KeyboardInterrupt: Interrupted by user