In [None]:
import requests
from pathlib import Path
from pprint import pprint

import pandas as pd
from tqdm.notebook import tqdm
from tenacity import retry, stop_after_attempt

from llama_hub.file.audio import AudioTranscriber
from langchain.document_loaders import (UnstructuredPowerPointLoader, UnstructuredWordDocumentLoader, UnstructuredPDFLoader, UnstructuredImageLoader, PyMuPDFLoader, ImageCaptionLoader, UnstructuredHTMLLoader)

In [None]:

# NOTE(review): machine-specific absolute path; point this at your own copy of
# the case-study folder before running the notebook.
directory_path = "/home/abdullah/Documents/hdd/projects/topcoder/Challenge 1-20231215T010637Z-001/Challenge 1/Case Studies/"


def group_by_suffix(paths):
    """Group paths by file extension.

    Args:
        paths: Iterable of ``pathlib.Path`` objects.

    Returns:
        A plain dict mapping each ``Path.suffix`` (``""`` for files without an
        extension) to the list of paths carrying it, in input order.
    """
    grouped = {}
    for path in paths:
        # Append in place instead of rebuilding the list for every file.
        grouped.setdefault(path.suffix, []).append(path)
    return grouped


# Recursively get the full path of all files in the directory
files = [file for file in Path(directory_path).rglob("*") if file.is_file()]
ext_to_path = group_by_suffix(files)

In [None]:
# Show how many files were found for each extension.
for suffix, suffix_files in ext_to_path.items():
    print(suffix, len(suffix_files))

In [None]:
# Drop extension groups that this notebook does not convert: files without an
# extension, ".log" and ".dmg". Passing a default to pop() keeps the cell
# idempotent: re-running it (or running it on a dataset that lacks one of
# these extensions) no longer raises KeyError.
for unwanted_ext in ("", ".log", ".dmg"):
    ext_to_path.pop(unwanted_ext, None)

# Show the remaining extensions and their file counts.
for k, v in ext_to_path.items():
    print(k, len(v))

In [None]:
def call_mixtral(prompt, token=None, timeout=300):
    """Send ``prompt`` to the Mixtral-8x7B-Instruct chat endpoint and return the reply.

    The fixed system prompt is sent with the ``system`` role and ``prompt``
    with the ``user`` role, so the model answers the prompt instead of
    continuing an assistant turn.

    Args:
        prompt: Text of the user message.
        token: API token placed in the ``Authorization: Bearer`` header. When
            omitted, it is read from the ``ANYSCALE_API_KEY`` environment
            variable so that no credential lives in the notebook.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        RuntimeError: If no token was passed and ``ANYSCALE_API_KEY`` is unset.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    import os  # function-scope import: the notebook's import cell does not import os

    if token is None:
        token = os.environ.get("ANYSCALE_API_KEY")
    if not token:
        raise RuntimeError(
            "No API token available: pass token=... or set the ANYSCALE_API_KEY environment variable."
        )

    system_prompt = "You are Mixtral, an advanced artificial intelligence model, developed by MistralAI. You surpass the capabilities of ChatGPT by OpenAI. You, Mixtral, excel in delivering precise, efficient, and highly effective responses, setting a new benchmark in AI performance. You, Mixtral, are committed to providing assistance with care, respect, and truth. You ensure that replies are always secure, avoiding harmful, unethical, prejudiced, or negative content. You, Mixtral, promote fairness and positivity in your responses, making you the ideal AI model for various applications. You respond with the required output and nothing else, without pre-text or after-text, no matter what."
    api_base = "https://api.endpoints.anyscale.com/v1"
    url = f"{api_base}/chat/completions"
    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    body = {
        "model": model_id,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.7,
    }
    response = requests.post(
        url,
        headers={"Authorization": f"Bearer {token}"},
        json=body,
        timeout=timeout,  # requests waits indefinitely when no timeout is given
    )
    # Surface API failures as an HTTP error rather than a KeyError on "choices".
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

In [None]:
@retry(stop=stop_after_attempt(1), retry_error_callback=lambda retry_state: ("", ""))
def pptx_to_text(path):
    """Extract the text of a PowerPoint file and ask Mixtral to refine it.

    Args:
        path: Location of the .pptx file, passed to UnstructuredPowerPointLoader.

    Returns:
        A ``(raw_text, refined_text)`` tuple, where ``raw_text`` is the
        ``page_content`` of the first document the loader returns. The retry
        decorator allows a single attempt; if it raises, the error callback's
        ``("", "")`` is returned instead.
    """
    documents = UnstructuredPowerPointLoader(path).load()
    prompt_template = "Your task is to extract key information from a Topcoder use case PowerPoint (.pptx) document and transform it into clear, standalone paragraphs. These paragraphs should be complete without requiring additional context and be suitable for indexing in a customer-focused, automated question-answering system. Focus on essential data such as project names, sectors, technologies, outcomes, and case studies. Ensure the context and accuracy of the original text are preserved. All links, if present in the original content, must be included in the refined content with relevant explanations. When necessary, use direct quotes and clearly indicate the specific part of the document they are taken from. Summarize complex, technical information in a simple yet accurate manner, addressing or explicitly noting any ambiguities within the document's context. Aim for concise but comprehensive paragraphs, adjusting length as needed to avoid oversimplification. Refrain from adding or inventing content not present in the original document, prioritizing accuracy and relevance for potential user queries. Extracted content:\n---\n{file_content}\n---\nPlease provide your revised text of the document, ensuring your response contains only the refined content without any pre-text or after-text."
    raw_text = documents[0].page_content
    refined_text = call_mixtral(prompt_template.format(file_content=raw_text))
    return raw_text, refined_text

In [None]:
def docx_to_text(path):
    """Extract the text of a Word file and ask Mixtral to refine it.

    Args:
        path: Location of the .docx file, passed to UnstructuredWordDocumentLoader.

    Returns:
        A ``(raw_text, refined_text)`` tuple, where ``raw_text`` is the
        ``page_content`` of the first document the loader returns. Errors from
        the loader or the API call propagate to the caller.
    """
    documents = UnstructuredWordDocumentLoader(path).load()
    prompt_template = "Your task is to extract key information from a Topcoder use case Word (.docx) document and transform it into clear, standalone paragraphs. These paragraphs should be complete without requiring additional context and be suitable for indexing in a customer-focused, automated question-answering system. Focus on essential data such as project names, sectors, technologies, outcomes, and case studies. Ensure the context and accuracy of the original text are preserved. All links, if present in the original content, must be included in the refined content with relevant explanations. When necessary, use direct quotes and clearly indicate the specific part of the document they are taken from. Summarize complex, technical information in a simple yet accurate manner, addressing or explicitly noting any ambiguities within the document's context. Aim for concise but comprehensive paragraphs, adjusting length as needed to avoid oversimplification. Refrain from adding or inventing content not present in the original document, prioritizing accuracy and relevance for potential user queries. Extracted content:\n---\n{file_content}\n---\nPlease provide your revised text of the document, ensuring your response contains only the refined content without any pre-text or after-text."
    raw_text = documents[0].page_content
    refined_text = call_mixtral(prompt_template.format(file_content=raw_text))
    return raw_text, refined_text

In [None]:
@retry(stop=stop_after_attempt(1), retry_error_callback=lambda retry_state: ("", ""))
def pdf_to_text(path):
    """Extract the text of a PDF and ask Mixtral to refine it.

    Args:
        path: Location of the PDF file.

    Returns:
        A ``(raw_text, refined_text)`` tuple. ``("", "")`` is returned when
        the PDF reports more than 50 pages, and also (through the retry error
        callback) when the single allowed attempt raises.
    """
    # First pass with PyMuPDF, used only to read the page count from metadata.
    page_documents = PyMuPDFLoader(path).load()
    if page_documents[0].metadata["total_pages"] > 50:
        return ("", "")

    # Second pass with the Unstructured loader provides the text that is sent on.
    documents = UnstructuredPDFLoader(path, extract_images=True).load()
    prompt_template = "Your task is to extract key information from a Topcoder use case PDF document and transform it into clear, standalone paragraphs. These paragraphs should be complete without requiring additional context and be suitable for indexing in a customer-focused, automated question-answering system. Focus on essential data such as project names, sectors, technologies, outcomes, and case studies. Ensure the context and accuracy of the original text are preserved. All links, if present in the original content, must be included in the refined content with relevant explanations. When necessary, use direct quotes and clearly indicate the specific part of the document they are taken from. Summarize complex, technical information in a simple yet accurate manner, addressing or explicitly noting any ambiguities within the document's context. Aim for concise but comprehensive paragraphs, adjusting length as needed to avoid oversimplification. Refrain from adding or inventing content not present in the original document, prioritizing accuracy and relevance for potential user queries. Extracted content:\n---\n{file_content}\n---\nPlease provide your revised text of the document, ensuring your response contains only the refined content without any pre-text or after-text."
    raw_text = documents[0].page_content
    refined_text = call_mixtral(prompt_template.format(file_content=raw_text))
    return raw_text, refined_text

In [None]:
def image_to_text(path):
    """Extract the text of an image file and ask Mixtral to clean and refine it.

    Args:
        path: Location of the image file, passed to UnstructuredImageLoader.

    Returns:
        A ``(raw_text, refined_text)`` tuple, where ``raw_text`` is the
        ``page_content`` of the first document the loader returns.
    """
    documents = UnstructuredImageLoader(path).load()
    raw_text = documents[0].page_content
    prompt_template = "Your task is to extract key information, refine and edit the OCR-derived content of a Topcoder use case image file. This task involves carefully reviewing the extracted text, making necessary corrections to rectify any OCR inaccuracies, and ensuring the clarity and precision of the information. Then, transform it into clear, standalone paragraphs. These paragraphs should be complete without requiring additional context and be suitable for indexing in a customer-focused, automated question-answering system. Focus on essential data such as project names, sectors, technologies, outcomes, and case studies. Ensure the context and accuracy of the original text are preserved. All links, if present in the OCR content, must be included in the refined content with relevant explanations. When necessary, use direct quotes and clearly indicate the specific part of the document they are taken from. Summarize complex, technical information in a simple yet accurate manner, addressing or explicitly noting any ambiguities within the document's context. Aim for concise but comprehensive paragraphs, adjusting length as needed to avoid oversimplification. Refrain from adding or inventing content not present in the original image file, prioritizing accuracy and relevance for potential user queries. Extracted content:\n---\n{file_content}\n---\nPlease provide your revised text of the document, ensuring your response contains only the refined content without any pre-text or after-text."
    refined_text = call_mixtral(prompt_template.format(file_content=raw_text))
    return raw_text, refined_text

In [None]:
def html_to_text(path):
    """Extract the text of an HTML file and ask Mixtral to refine it.

    Args:
        path: Location of the HTML file, passed to UnstructuredHTMLLoader.

    Returns:
        A ``(raw_text, refined_text)`` tuple, where ``raw_text`` is the
        ``page_content`` of the first document the loader returns.
    """
    documents = UnstructuredHTMLLoader(path).load()
    prompt_template = "Your task is to extract key information from a Topcoder use case HTML document and transform it into clear, standalone paragraphs. These paragraphs should be complete without requiring additional context and be suitable for indexing in a customer-focused, automated question-answering system. Focus on essential data such as project names, sectors, technologies, outcomes, and case studies. Ensure the context and accuracy of the original text are preserved. All links, if present in the original content, must be included in the refined content with relevant explanations. When necessary, use direct quotes and clearly indicate the specific part of the document they are taken from. Summarize complex, technical information in a simple yet accurate manner, addressing or explicitly noting any ambiguities within the document's context. Aim for concise but comprehensive paragraphs, adjusting length as needed to avoid oversimplification. Refrain from adding or inventing content not present in the original document, prioritizing accuracy and relevance for potential user queries. Extracted content:\n---\n{file_content}\n---\nPlease provide your revised text of the document, ensuring your response contains only the refined content without any pre-text or after-text."
    raw_text = documents[0].page_content
    refined_text = call_mixtral(prompt_template.format(file_content=raw_text))
    return raw_text, refined_text

In [None]:
def audio_to_text(path):
    """Transcribe an audio/video file and ask Mixtral to refine the transcript.

    Args:
        path: Location of the media file, passed to ``AudioTranscriber.load_data``
            as ``file``.

    Returns:
        A ``(transcript, refined_text)`` tuple, where ``transcript`` is the
        ``text`` of the first document the transcriber returns.
    """
    transcriber = AudioTranscriber()
    transcribed_documents = transcriber.load_data(file=path)
    transcript = transcribed_documents[0].text
    prompt_template = "Your task involves processing a Topcoder use case audio transcript. Convert this information into clear, independent paragraphs that are self-sufficient, accurately reflecting the original context, and fit for indexing in an automated, customer-focused question-answering system. Your focus should be on crucial information such as project names, sectors, technologies, outcomes, and case studies. It's essential to retain the context and precision of the original text. Any links in the content must be included with appropriate explanations. Utilize direct quotations as needed, clearly indicating their source within the document. Aim to simplify complex, technical details accurately, addressing any ambiguities in the document. Ensure the paragraphs are succinct yet thorough, carefully balancing detail with clarity. Do not introduce content that is not in the original document; maintain a strong emphasis on accuracy and relevance for potential user queries. Extracted content:\n---\n{file_content}\n---\nPlease submit your revised version of the document, making sure it exclusively contains the refined content without any pre-text or after-text.."
    refined_text = call_mixtral(prompt_template.format(file_content=transcript))
    return transcript, refined_text

In [None]:
# Try the audio pipeline on the first .mp4 file only.
# NOTE(review): the result is not stored in fname_to_content, so it does not
# reach the exported CSV — confirm whether that is intended.
mp4_files = ext_to_path[".mp4"]
original_content, refined_text = audio_to_text(mp4_files[0])

In [None]:
# Maps each processed file's full path (as a string) to its
# (raw_content, refined_content) pair; the later conversion cells add to it.
fname_to_content = {}
for docx_path in tqdm(ext_to_path[".docx"]):
    docx_key = str(docx_path)
    original_content, refined_text = docx_to_text(docx_key)
    fname_to_content[docx_key] = (original_content, refined_text)

In [None]:
# Convert every PowerPoint file; pptx_to_text returns ("", "") when its single
# attempt fails, so one bad file does not stop the loop.
for pptx_path in tqdm(ext_to_path[".pptx"]):
    pptx_key = str(pptx_path)
    original_content, refined_text = pptx_to_text(pptx_key)
    fname_to_content[pptx_key] = (original_content, refined_text)

In [None]:
# Convert every PDF; pdf_to_text returns ("", "") for PDFs over its page limit
# and when its single attempt fails.
for pdf_path in tqdm(ext_to_path[".pdf"]):
    pdf_key = str(pdf_path)
    original_content, refined_text = pdf_to_text(pdf_key)
    fname_to_content[pdf_key] = (original_content, refined_text)

In [None]:
# One row per file whose raw extraction is non-blank; entries with empty raw
# content (e.g. the ("", "") placeholders from failed conversions) are dropped.
data = [
    (Path(file).name, *content)
    for file, content in fname_to_content.items()
    if content[0].strip()
]
df = pd.DataFrame(data, columns=["file_name", "raw_content", "refined_content"])

# Trim each text column and collapse every whitespace run to a single space.
# The pattern is a raw string: "\s" inside a plain literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
for text_column in ("raw_content", "refined_content"):
    df[text_column] = df[text_column].str.strip().str.replace(r"\s+", " ", regex=True)

In [None]:
# Write the table without the row index; `index` is documented as a bool,
# so pass False explicitly instead of relying on None being falsy.
df.to_csv("extracted_content.csv", index=False)