# Chapter 2 - generating the prompt engineering report

In [1]:
from docx import Document
from docx.shared import RGBColor
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
import openai
import json
import re

In [3]:
# Read the OpenAI API key from the environment instead of embedding the secret
# in the notebook. The value is None when OPENAI_API_KEY is not set.
# NOTE(review): the key that was previously hard-coded on this line has been
# exposed in the file history and should be revoked.
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# Path to the Word document holding the colour-coded prompts and responses.
# Written as a raw string so the Windows backslashes are never interpreted as
# escape sequences (the value is unchanged).
file_path = r'H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter 2 text\Chapter 2 Prompts.docx'

# Define the colors as per the user's definitions.
# (R, G, B) values of the run colours: GREEN1/GREEN2 are treated as prompt
# text and PURPLE as response text when the document is split into chats.
GREEN1 = (56, 118, 29)
GREEN2 = (39, 78, 19)
PURPLE = (103, 78, 167)

## Functions to read in the prompts file and split it into chats at each '<New chat>' marker

In [4]:
def read_docx(file_path):
    """
    Load a .docx file and return its runs as (text, color) tuples.

    Every run of every paragraph is visited in document order. The color is
    the run's ``font.color.rgb`` value, which is None when the run has no
    RGB color set.
    """
    document = Document(file_path)
    runs_with_color = []
    for paragraph in document.paragraphs:
        runs_with_color.extend(
            (run.text, run.font.color.rgb) for run in paragraph.runs
        )
    return runs_with_color

In [5]:
def split_doc_into_chats_single_string(file_path, max_words=2500):
    """
    Group the colored runs of a .docx file into chats and cap section length.

    A run containing "<New chat>" closes the chat in progress (if it holds any
    text). Runs colored GREEN1 or GREEN2 switch the current speaker to the
    prompt, PURPLE runs switch it to the response, and runs of any other color
    continue with the current speaker. Each run's text is appended, preceded
    by a space, to the current speaker's string; runs seen before any speaker
    has been set are ignored.

    Return a list of dicts with the keys "prompt" and "response". Each value
    is the accumulated string, or, when it holds more than max_words
    whitespace-separated words, a list of strings of at most max_words words.
    """
    def _sections(text):
        # Leave short text untouched; otherwise cut it into word-count chunks.
        tokens = text.split()
        if len(tokens) <= max_words:
            return text
        return [
            ' '.join(tokens[start:start + max_words])
            for start in range(0, len(tokens), max_words)
        ]

    # First pass: accumulate run text per speaker, one dict per chat.
    conversations = []
    active = {"prompt": "", "response": ""}
    speaker = None
    for run_text, run_color in read_docx(file_path):
        starts_new_chat = "<New chat>" in run_text
        if starts_new_chat and (active["prompt"] or active["response"]):
            # Close the chat in progress and start from a clean state.
            conversations.append(active)
            active = {"prompt": "", "response": ""}
            speaker = None

        # The run color decides who is speaking from this run onwards.
        if run_color in (GREEN1, GREEN2):
            speaker = "prompt"
        elif run_color == PURPLE:
            speaker = "response"

        if speaker is None:
            continue
        active[speaker] = active[speaker] + " " + run_text

    # Keep the final chat if it collected any text.
    if active["prompt"] or active["response"]:
        conversations.append(active)

    # Second pass: break over-long prompts/responses into sections.
    return [
        {role: _sections(chat[role]) for role in ("prompt", "response")}
        for chat in conversations
    ]

In [24]:
# Run the splitter on the prompts document at file_path. The resulting list
# (one dict per chat) is what the report-generation loop iterates over.
chats_single_string = split_doc_into_chats_single_string(file_path)
chats_single_string[:5]  # Display the first 5 chats as an example

[{'prompt': " <New chat> I'm working on a data analysis project and conducting a survey of  organisations  in the vintage banjo industry. I'm searching for companies in the UK that either manufacture banjos, refurbish vintage banjos, or sell vintage banjos. Can you identify a list of companies for me? Is there any downloadable data available at this website: https://www.vintagebanjomaker.com/ I'm looking for a list of banjo makers based in the UK as well as other useful information such as such as their size (number of employees, revenue, etc.), location, product range, pricing, and market share.",
  'response': ' <ChatGPT4-with  WebPilot  plugin> Based on the search results, here are some companies in the UK that manufacture, refurbish, or sell vintage banjos: : A company that has a history dating back to the 19th century. : This  company  offers custom handmade banjos, designed in collaboration with their future owners. : A stringed instrument maker based in London. : This  company  

## Using ChatGPT to assess each prompt and response to write pros and cons

In [50]:
# Industry being researched; interpolated into the instruction text below.
industry = "vintage banjo"
# Instruction sent to the model as the final user message for every
# prompt/response pair (typo "interating" corrected to "interacting").
instruction = f"""Review the prompt and response.
                The 'prompt' is from a human interacting with an AI large language model.
                The 'response' has been generated by the AI in response to the prompt.
                The human is using the AI to research the {industry} industry
                for a data analysis project.
                This section of the project is focused on obtaining data.
                Give a list of pros and cons based on how effective the prompt is 
                at obtaining useful data from the AI large language model."""

In [53]:
def query_completion(prompt, response, instruction, model="gpt-3.5-turbo"):
    """
    Ask an OpenAI chat model to assess one prompt/response pair.

    The request consists of a fixed system message followed by three user
    messages: the prompt, the AI response, and the instruction.

    Parameters:
        prompt: text of the original human prompt.
        response: text the AI produced for that prompt.
        instruction: what the model should do with the pair.
        model: name of the chat model to query; defaults to "gpt-3.5-turbo",
            the model this function previously hard-coded.

    Returns the message content of the first choice in the API result.
    """
    # The key is taken from the module-level openai_api_key on every call.
    openai.api_key = openai_api_key
    messages = [
        {"role": "system", "content": "You are an objective, analytical, and fair-minded assistant. You are able to approach discussions or arguments with an open mind, striving to understand the various perspectives involved. "},
        {"role": "user", "content": f"Prompt: {prompt}"},
        {"role": "user", "content": f"AI Response: {response}"},
        {"role": "user", "content": instruction},
    ]

    query_response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
    )

    return query_response.choices[0].message['content']


In [7]:
# New Word document that the report content is added to and saved from.
doc = Document()
# Colour applied to the prompt text in the report (same RGB values as GREEN2).
green = RGBColor(39,78,19)

In [55]:
# Loop through the chats_single_string list (one dict per chat)
for single_chat in chats_single_string:
    # Get the prompt and response
    prompts = single_chat["prompt"]
    responses = single_chat["response"]

    # Each value is either one string or a list of sections (when the text was
    # split for length); normalise both to lists.
    if isinstance(prompts, str):
        prompts = [prompts]
    if isinstance(responses, str):
        responses = [responses]

    # zip() stops at the shorter list, which would silently drop sections
    # whenever only one side was split. Repeat the last section of the shorter
    # side so that every section gets assessed.
    section_count = max(len(prompts), len(responses))
    prompts = prompts + prompts[-1:] * (section_count - len(prompts))
    responses = responses + responses[-1:] * (section_count - len(responses))

    # Iterate over each prompt-response pair
    for prompt, response in zip(prompts, responses):
        # Call the query_completion() function
        pros_cons_list = query_completion(prompt, response, instruction)

        # Print the pros and cons list
        print(pros_cons_list)

        # Add the prompt to the document in green
        para = doc.add_paragraph()
        run = para.add_run(f"Prompt: {prompt}")
        run.font.color.rgb = green

        # Add the pros and cons list to the document
        doc.add_paragraph(f"Pros and Cons: {pros_cons_list}")

# Save the document (raw string: the backslashes are literal path separators)
doc.save(r"H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter2PromptEngReport.docx")

Pros:
1. The prompt clearly states the objective of the project, which is to gather a list of companies in the vintage banjo industry in the UK and obtain specific information about them.
2. The prompt provides a specific website URL for the AI model to analyze and check for downloadable data.
3. The prompt includes specific data points that the researcher is interested in, such as company size, location, product range, pricing, and market share.

Cons:
1. The AI response starts by providing a list of companies in the vintage banjo industry, but it does not provide all the requested data points such as company size, revenue, and market share.
2. The response mentions that the Vintage Banjo Maker website does not have downloadable data, which means the researcher will need to manually gather the desired information.
3. There is no mention of the availability of specific data points like number of employees or revenue for each company, which may require additional research or direct cont

## Finally, write the original prompts and responses pasted from AI to the bottom of the document

In [57]:
# Add prompts and responses after the pros and cons.
# Load the existing prompts document (raw string: the backslashes are literal
# path separators, not escape sequences).
source_doc = Document(r'H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter 2 text\Chapter 2 Prompts.docx')

# Append the text of every source paragraph to the report; only the plain
# text is copied, so run-level formatting such as colour is not carried over.
for paragraph in source_doc.paragraphs:
    # Copy the text of each paragraph
    text = paragraph.text
    # Add the text to the new document
    doc.add_paragraph(text)

# Save the new document
doc.save(r"H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter2PromptEngReport.docx")

In [8]:
# This method maintains both the colour and spacing of the original prompts word doc when copying to the 'prompt eng report'.
# NOTE(review): the plain-text copy cell earlier in the notebook appends the
# same source paragraphs to `doc`; running both cells in one session would
# add the source content twice - confirm only one of them is meant to run.
from docx.shared import RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# Raw string: the backslashes are literal path separators, not escape sequences.
source_doc = Document(r'H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter 2 text\Chapter 2 Prompts.docx')

for paragraph in source_doc.paragraphs:
    # Create a new paragraph in the target document
    new_paragraph = doc.add_paragraph()

    # Copy the alignment (and other properties if needed)
    new_paragraph.alignment = paragraph.alignment

    for run in paragraph.runs:
        # Create a new run in the new paragraph
        new_run = new_paragraph.add_run(run.text)

        # Copy the font style, including color
        new_run.font.name = run.font.name
        new_run.font.size = run.font.size
        new_run.font.bold = run.font.bold
        new_run.font.italic = run.font.italic
        new_run.font.underline = run.font.underline
        new_run.font.color.rgb = run.font.color.rgb

# Save the new document
doc.save(r"H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter2PromptEngReport.docx")