# Chapter 4 - generating the prompt engineering report

In [1]:
import json
import os
import re

import openai
from docx import Document
from docx.shared import RGBColor
from langchain import OpenAI
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Read the API key from the environment so the secret is never stored in the notebook.
# NOTE: the key that used to be hard-coded in this cell has been exposed and should be revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    print("OPENAI_API_KEY is not set - requests to the OpenAI API will fail until it is.")

In [3]:
# Raw string: the backslashes of the Windows path are taken literally instead of
# being parsed as (invalid) escape sequences such as \D, \S, \Q and \C.
file_path = r'H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter 4 prompts.docx'

# Font colours that tag the transcript: green runs are prompts, purple runs are
# responses. Built as RGBColor objects up front so that comparisons against a
# run's `font.color.rgb` are like-for-like rather than tuple-vs-RGBColor.
PURPLE = RGBColor(112, 48, 160)
GREEN = RGBColor(112, 173, 71)

## Functions to read in the prompts file and split it into chats at each 'new chat' marker

In [4]:
def read_docx(file_path):
    """
    Load a .docx file and return its runs as (text, colour) pairs.

    Runs are collected paragraph by paragraph, in document order. The colour
    is the run's `font.color.rgb` value, which is None when no RGB colour is
    reported for the run.
    """
    document = Document(file_path)
    return [
        (run.text, run.font.color.rgb)
        for paragraph in document.paragraphs
        for run in paragraph.runs
    ]

In [10]:
def split_doc_into_chats_single_string(file_path, max_words=2000):
    """
    Split the transcript .docx into prompt/response pairs of at most `max_words` words.

    The runs returned by `read_docx` are classified by font colour: green runs
    are prompt text, purple runs are response text, and runs of any other
    colour are ignored. Text of one colour is accumulated (joined with spaces)
    until a run of the other colour appears. A run containing the new-chat
    marker closes the conversation in progress, so pairs never span two chats.

    Args:
        file_path: Path of the .docx transcript, passed to `read_docx`.
        max_words: Maximum number of whitespace-separated words allowed in each
            "prompt" or "response" string of the result. Must be positive.

    Returns:
        A flat list of {"prompt": str, "response": str} dicts in document
        order. A pair whose longer side exceeds `max_words` is emitted as
        several consecutive dicts; the shorter side is "" once its words run
        out. A response that has no preceding prompt in its chat is paired
        with "".

    Raises:
        ValueError: If `max_words` is not positive.
    """
    # The chunking loop steps by max_words: a step of 0 makes range() raise an
    # unhelpful error and a negative step yields nothing, which would silently
    # drop every chat. Reject both up front.
    if max_words <= 0:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    # Define the color codes for the prompt and response
    GREEN = RGBColor(112, 173, 71)
    PURPLE = RGBColor(112, 48, 160)
    new_chat_marker = "[Start of new chatgpt 4 chat with code interpreter]"

    # Extract the text along with color information
    text_color_tuples = read_docx(file_path)

    # Group the text by conversations
    chats = []
    current_chat = {"prompt": [], "response": []}
    current_prompt = ""
    current_response = ""
    for text, color in text_color_tuples:
        # Check for the start of a new chat
        if new_chat_marker in text:
            # Flush the turn in progress, then close the chat it belongs to.
            if current_prompt or current_response:
                current_chat["prompt"].append(current_prompt.strip())
                current_chat["response"].append(current_response.strip())
                current_prompt = ""
                current_response = ""
            if current_chat["prompt"] or current_chat["response"]:
                chats.append(current_chat)
                current_chat = {"prompt": [], "response": []}
        # Check for AI prompts and responses by color
        if color == GREEN:
            if current_response:
                # A response that arrived before any prompt has no partner yet.
                # Pad the prompt list so the two lists stay index-aligned;
                # otherwise every later prompt would be zipped with the
                # response that preceded it.
                if len(current_chat["prompt"]) <= len(current_chat["response"]):
                    current_chat["prompt"].append("")
                current_chat["response"].append(current_response.strip())
                current_response = ""
            current_prompt += " " + text
        elif color == PURPLE:
            if current_prompt:
                current_chat["prompt"].append(current_prompt.strip())
                current_prompt = ""
            current_response += " " + text

    # Add the last chat
    if current_prompt or current_response:
        current_chat["prompt"].append(current_prompt.strip())
        current_chat["response"].append(current_response.strip())
    if current_chat["prompt"] or current_chat["response"]:
        chats.append(current_chat)

    # Break up any chat that has over max_words words into smaller sections
    split_chats = []
    for chat in chats:
        # zip() stops at the shorter list, which discards the empty filler
        # string the flush steps above may leave at the end of one of them.
        for prompt, response in zip(chat["prompt"], chat["response"]):
            prompt_words = prompt.split()
            response_words = response.split()
            max_length = max(len(prompt_words), len(response_words))
            for i in range(0, max_length, max_words):
                # Slicing past the end of the shorter side yields "".
                split_chats.append({
                    "prompt": ' '.join(prompt_words[i:i + max_words]),
                    "response": ' '.join(response_words[i:i + max_words])
                })

    return split_chats

In [11]:
# Parse the transcript into prompt/response pairs using the default max_words.
# The review loop further down iterates over this list.
chats_single_string = split_doc_into_chats_single_string(file_path)
chats_single_string[:5]  # Display the first 5 prompt/response pairs as a sanity check

[{'prompt': 'How do i check the attached dataset in python?',
  'response': "ChatGPT Finished working Hide work python Copy code import pandas as pd # Load the dataset data = pd.read_csv( '/mnt/data/NI_32200.csv' ) # Display the first few rows of the dataset data.head() RESULT Company Name Company Number \\ 0 1TO1MUSIC LTD NI617461 1 1TO1MUSIC LTD NI617461 2 1TO1MUSIC LTD NI617461 3 ANDANTE PERCUSSION LTD NI049797 4 ANDANTE PERCUSSION LTD NI049797 Registered office address Company status \\ 0 Unit 24 Lowes Industrial Estate, 31 Ballynahin... Active 1 Unit 24 Lowes Industrial Estate , 31 Ballynahin... Active 2 Unit 24 Lowes Industrial Estate, 31 Ballynahin... Active 3 97 Bannfield Road, Ballyroney, Banbridge, Co D... Active 4 97 Bannfield Road, Ballyroney, Banbridge, Co D... Active Company type Incorporated on \\ 0 Private limited Company 21 March 2013 1 Private limited Company 21 March 2013 2 Private limited Company 21 March 2013 3 Private limited Company 24 February 2004 4 Private lim

## Using ChatGPT to assess each prompt and response to write pros and cons

In [12]:
# Review instructions passed to query_completion for every prompt/response pair.
# Plain (non-f) string: there are no placeholders to interpolate. The indentation
# inside the literal is part of the text and is kept exactly as it was.
instruction = """Review the prompt and response.
                The 'prompt' is from a human interacting with an AI large language model.
                The 'response' has been generated by the AI in response to the prompt.
                The human is using the AI to aid in cleaning a dataset
                for a data analysis project.
                
                The AI has two main tasks depending on the prompt: 
                to provide advice on what type of data cleaning to perform on the provided dataset;
                OR to provide the python code that the human can take and execute themselves to perform the data cleaning.
                
                Give a list of pros and cons based on how effective the prompt is 
                at obtaining useful information from the AI large language model.
                """

In [13]:
def query_completion(prompt, response, instruction, model="gpt-3.5-turbo"):
    """
    Ask an OpenAI chat model to review one prompt/response pair.

    Args:
        prompt: Prompt text; sent as a user message prefixed with "Prompt: ".
        response: Response text; sent as a user message prefixed with
            "AI Response: ".
        instruction: Review instructions; sent as the final user message.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo",
            the value that was previously hard-coded.

    Returns:
        The text content of the first choice in the completion.
    """
    # The key comes from the module-level `openai_api_key` and is applied to
    # the openai module on every call.
    openai.api_key = openai_api_key
    messages = [
        {"role": "system", "content": "You are an objective, analytical, and fair-minded assistant. You are able to approach discussions or arguments with an open mind, striving to understand the various perspectives involved. "},
        {"role": "user", "content": f"Prompt: {prompt}"},
        {"role": "user", "content": f"AI Response: {response}"},
        {"role": "user", "content": instruction},
    ]

    query_response = openai.ChatCompletion.create(
        model=model,
        messages=messages
    )

    return query_response.choices[0].message['content']


In [14]:
# Start an empty report document; later cells append the reviews and the original transcript to it.
doc = Document()

In [15]:
# Review every prompt/response pair and build the report: one API call per
# pair, with the prompt written in green followed by the model's pros and cons.
for single_chat in chats_single_string:
    # Get the prompt and response
    prompt = single_chat["prompt"]
    response = single_chat["response"]

    # Call the query_completion() function
    pros_cons_list = query_completion(prompt, response, instruction)

    # Print the pros and cons list
    print(pros_cons_list)

    # Add the prompt to the document in green
    para = doc.add_paragraph()
    run = para.add_run(f"Prompt: {prompt}")
    run.font.color.rgb = GREEN

    # Add the pros and cons list to the document
    doc.add_paragraph(f"Pros and Cons: {pros_cons_list}")

# Save the document. Raw string: the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences.
doc.save(r"H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter4PromptEngReport.docx")


Pros:
1. The prompt clearly specifies the objective of the human - to check the attached dataset in Python.
2. The prompt provides a clear context about the dataset being used for a data analysis project.
3. The prompt indicates that the human is seeking either advice on what type of data cleaning to perform or code to execute for data cleaning.

Cons:
1. The prompt lacks specific details about the dataset, such as its size, format, or any specific issues the human is encountering.
2. The prompt does not clearly indicate the desired outcome or purpose of checking the dataset.
3. The prompt does not specify the level of expertise the human possesses in Python or data cleaning techniques.

Overall, while the prompt provides some relevant information, it could be improved by including more specifics and clearly stating the desired outcome. This would help the AI generate a more accurate and tailored response.
Pros:
1. The prompt clearly specifies that the human is seeking assistance in pr

## Finally, append the original prompts and responses (as pasted from the AI chats) to the bottom of the report document

In [17]:
from docx.shared import RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# Re-open the transcript and append it, run by run, to the report document.
# Raw string: the backslashes of the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences.
source_doc = Document(r'H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter 4 prompts.docx')

for paragraph in source_doc.paragraphs:
    # Create a new paragraph in the target document
    new_paragraph = doc.add_paragraph()

    # Copy the alignment (and other properties if needed)
    new_paragraph.alignment = paragraph.alignment

    for run in paragraph.runs:
        # Create a new run in the new paragraph
        new_run = new_paragraph.add_run(run.text)

        # Copy the font style, including color. Only these six attributes are
        # carried over; any other run formatting is not copied.
        new_run.font.name = run.font.name
        new_run.font.size = run.font.size
        new_run.font.bold = run.font.bold
        new_run.font.italic = run.font.italic
        new_run.font.underline = run.font.underline
        new_run.font.color.rgb = run.font.color.rgb

# Save the new document (same path as the earlier save, so the file is overwritten)
doc.save(r"H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter4PromptEngReport.docx")