This notebook generates the project report based on the vast amount of text gathered through manual prompting of various LLM's. The code breaks up the differnet chats from my 'Chapter 2 Prompts' document, summarises the information, and creates the report.

This is the simplified route I went with for Chapter 2, as I was having a difficult time completely automating the colection of suitable data having made numerous attempts at it.

An attempt to fully automate this chapter can be seen in the following notebook which I've included in the submission: 'Chapter 2 - Fully Automated attempt'.

In [1]:
from docx import Document
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
import openai
import json
import re

In [2]:
# Path to the file
file_path = 'H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter 2 text\Chapter 2 Prompts.docx'

# Open the document
doc = Document(file_path)

# Define RGB colors
GREEN1 = (56,118,29)
GREEN2 = (39,78,19)
PURPLE = (103,78,167)

In [3]:
openai_api_key = YOUR_API_KEY
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

## Function to extract headings and purple AI generated text

In [4]:
def extract_sections(doc, purple_rgb_color):
    # Placeholder for sections and the current section
    sections = []
    current_section = [] 

    # Iterate over the paragraphs
    for para in doc.paragraphs:
        # Check if the paragraph is a heading
        if para.style.name.startswith('Heading'):
            # If we have a current section, save it to sections
            if current_section:
                sections.append(current_section)
            # Start a new section with the heading
            current_section = [para.text]
        else:
            # Check if the paragraph contains purple text
            for run in para.runs:
                if run.font.color.rgb == purple_rgb_color:
                    current_section.append(para.text)
                    break  # We add the whole paragraph if we find a purple run, so we can break

    # Don't forget to add the last section
    if current_section:
        sections.append(current_section)

    return sections

## Function to convert list to dict, and join content to single strings

In [5]:
def sections_to_dict(sections):
    # Create a dictionary where the key is the heading and the content is the rest of the text
    sections_dict = {}

    # Iterate over the sections
    for section in sections:
        # The first element is the heading, the rest is the content
        heading = section[0]
        content = section[1:]
        # Join the content into a single string and store it in the dictionary
        sections_dict[heading] = " ".join(content)

    return sections_dict

## Function to count tokens in dict content, and split if necessary

In [6]:
def split_large_content(content_dict, token_limit):
    split_content_dict = {}  # Dictionary to store the split content

    for key, content in content_dict.items():
        tokens = llm.get_num_tokens(content)  # Checking token count

        # Check if tokens is greater than the token limit
        if tokens > token_limit:
            # Create a RecursiveCharacterTextSplitter object with the given parameters
            # 1 token is approximately 4 characters or 0.75 words, so we multiply the token limit by 4 to get the chunk size in characters
            text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", " ", ""], chunk_size=4*token_limit, chunk_overlap=500)
            # Split the content into smaller documents
            docs = text_splitter.create_documents([content])
            split_content_dict[key] = docs
            
            print(f"Section '{key}' is split into {len(docs)} documents.")
                  
        else:
            # If the content doesn't exceed the token limit, keep it as it is
            split_content_dict[key] = [content]
                  
            print(f"Section '{key}' is not split. It remains as a single document.")

    return split_content_dict

## Function to summarize dict paragraphs

In [8]:
def summarize_paragraphs_working(paragraph_dict, prompt):
    openai.api_key = openai_api_key
    
    doc = Document()
    
    for section, content_parts in paragraph_dict.items():
        doc.add_heading(section, 2)
        
        for part_index, paragraph in enumerate(content_parts, start=1):
            summarized_text = ""
        
            print(f"\nSummarizing {section} - Part {part_index}")
            print(f"Type of paragraph: {type(paragraph)}")
            print(f"Paragraph content: {paragraph}")
        
            # Explicitly convert paragraph to string
            paragraph_str = str(paragraph)
            
            
            doc_type = type(paragraph_str)
            print(f"THE TYPE IS: {doc_type}")
            
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0.2,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": paragraph_str},
                ]
            )
            # Append the model's reply (i.e., the summary of the paragraph) to the summarized_text string
            summarized_text = response['choices'][0]['message']['content']   

            doc.add_paragraph(f"{summarized_text}")
    
    # Save the document
    doc.save('H:\Documents\Software Development\QUB Software Development\Data_analysis_module\Chapter2ProjectReport.docx')  # replace with your desired path and filename


    return doc

In [9]:
import json

def summarize_paragraphs_json(paragraph_dict, prompt):
    openai.api_key = openai_api_key
    
    doc = Document()
    
    for section, content_parts in paragraph_dict.items():
        doc.add_heading(section, 2)
        
        for part_index, content in enumerate(content_parts, start=1):
            print(f"\nSummarizing {section} - Part {part_index}")
            print(f"Type of paragraph: {type(content)}")
            print(f"Paragraph content: {content}")

            # Summarize the content
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": content},
                ]
            )
            summarized_text = response['choices'][0]['message']['content']
            
            # Check if the summarized text is JSON (tabular data)
            try:
                table_data = json.loads(summarized_text)
                if isinstance(table_data, list) and isinstance(table_data[0], list): # Assuming table data is a list of lists
                    table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
                    for i, row in enumerate(table_data):
                        for j, cell in enumerate(row):
                            table.cell(i, j).text = str(cell)
            except json.JSONDecodeError:
                # Handle regular text
                doc.add_paragraph(f"{summarized_text}")

    # Save the document
    doc.save('H:\\Documents\\Software Development\\QUB Software Development\\Data_analysis_module\\Chapter2ProjectReport.docx')  # replace with your desired path and filename

    return doc



## Prompt

In [11]:
industry = "vintage banjo"
prompt = f"""You are an expert data analyst helping a business in the {industry} industry
            to improve their business and increase revenue.
            I have carried out an investigation in to the wider {industry} industry
            and collected some data for you.
            You must review the following extract from my investigation. 
            I want you to summarise the key findings, and extract important data.
            Focus on the facts, figures, and numbers.
            Only reference numbers present in the text.
            Your summarisation will be included in a report
            which will be presented to the owner of the business you are helping
            to give them an overview of the industry.
            """

# Start calling functions

--Start by extracting the AI generated text from the document containing all prompts and responses of the investigation

--Convert the list to a dict

--Split the sections of the dict if they exceed the specified number of tokens. (1 token = 4 characters approximately).

--split_dict will be in the following structure, depending on how many parts the content has to be divided into:
    split_dict = {
        "Section 1": ["Content part 1", "Content part 2", ..., "Content part n"],
        "Section 2": ["Content"],
        "Section 3": ["Content part 1", "Content part 2"],
        ...
    }

--summarize_paragraphs uses a nested for loop to loop through each part of the content for each section, and summarise it.
--After each loop, each section is individually written to the document to prevent the output of the function exceeding the token limit.

In [12]:
extracted_sections = extract_sections(doc, PURPLE)

In [13]:
sections_dict = sections_to_dict(extracted_sections)

In [14]:
split_dict = split_large_content(sections_dict, 3500)

Section '2.1 The Space Parts of Work' is not split. It remains as a single document.
,Section '2.1.1 Organizations' is split into 2 documents.
,Section '2.1.2 Projects' is split into 3 documents.
,Section '2.1.3 Tasks' is not split. It remains as a single document.
,Section '2.1.4 Careers' is split into 3 documents.
,Section '2.1.5 Jobs' is not split. It remains as a single document.
,Section '2.2 Lenses' is not split. It remains as a single document.
,Section '2.2.1 Financial' is not split. It remains as a single document.
,Section '2.2.2 Government Statistics' is not split. It remains as a single document.
,Section '2.2.3 Careers' is not split. It remains as a single document.
,Section '2.2.4 Research' is not split. It remains as a single document.
,Section '' is not split. It remains as a single document.
,Section '2.2.5. Professional Associations, Market Analysis' is not split. It remains as a single document.
,Section '2.3 Values' is not split. It remains as a single document.
,Se

In [15]:
for key, value in split_dict.items():
    print(f"Section: {key}")
    for i, doc in enumerate(value):
        print(f"Document {i+1}:")
        print(doc)
        print()

Section: 2.1 The Space Parts of Work
,Document 1:
,
,
,Section: 2.1.1 Organizations
,Document 1:
,page_content='<ChatGPT4-with WebPilot plugin> Based on the search results, here are some companies in the UK that manufacture, refurbish, or sell vintage banjos: : A company that has a history dating back to the 19th century. : This company offers custom handmade banjos, designed in collaboration with their future owners. : A stringed instrument maker based in London. : This company sells a variety of musical instruments, including early banjos made by various manufacturers. : A small UK-based company that handcrafts custom banjos. : This website lists various vintage and modern banjo makers. : This company offers handcrafted old-time banjos in the UK. : This shop sells a range of musical instruments, including banjos. They also have a section for. : This company stocks vintage Irish tenor banjos, 4 string banjos, 5 string banjos from top brands. Please note that the availability of vintag

In [16]:
# Function automatically pastes to the new document
summarised = summarize_paragraphs_working(split_dict, prompt)


,Summarizing 2.1 The Space Parts of Work - Part 1
,Type of paragraph: <class 'str'>
,Paragraph content: 
,THE TYPE IS: <class 'str'>
,
,Summarizing 2.1.1 Organizations - Part 1
,Type of paragraph: <class 'langchain.schema.document.Document'>
,Paragraph content: page_content='<ChatGPT4-with WebPilot plugin> Based on the search results, here are some companies in the UK that manufacture, refurbish, or sell vintage banjos: : A company that has a history dating back to the 19th century. : This company offers custom handmade banjos, designed in collaboration with their future owners. : A stringed instrument maker based in London. : This company sells a variety of musical instruments, including early banjos made by various manufacturers. : A small UK-based company that handcrafts custom banjos. : This website lists various vintage and modern banjo makers. : This company offers handcrafted old-time banjos in the UK. : This shop sells a range of musical instruments, including banjos. They als