In [1]:
# Install the notebook's third-party dependencies. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel instead of a subshell.
%pip install -q pymupdf sentence-transformers transformers torch faiss-cpu openai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import fitz
import openai
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import faiss
import numpy as np
import textwrap

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Colab-only setup: mount Google Drive and read the OpenAI API key from Colab secrets.
# Outside Colab a different approach is needed; typically the API key is stored in
# an environment variable.
from google.colab import drive, userdata
# Mount Google Drive at /content/drive (the PDF path used later lives under this mount).
drive.mount('/content/drive')
# Read the Colab secret named 'ChatGPT' and use it as the OpenAI API key.
openai.api_key = userdata.get('ChatGPT')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
def extract_text_from_pdf(pdf_path, start_page=0, end_page=None):
    """Extract the plain text of a page range from a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        start_page: Zero-based index of the first page to read.
        end_page: Zero-based index of the last page to read (inclusive).
            ``None`` means "through the last page of the document".

    Returns:
        The text of the selected pages concatenated in page order, with no
        separator inserted between pages.

    Raises:
        Whatever the PDF library raises when the file cannot be opened or a
        requested page cannot be loaded; the document is closed in either case.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        if end_page is None:
            end_page = pdf_document.page_count - 1

        # Collect per-page text and join once instead of growing a string.
        page_texts = [
            pdf_document.load_page(page_num).get_text("text")
            for page_num in range(start_page, end_page + 1)
        ]
    finally:
        # Always release the document handle, even if a page fails to load.
        pdf_document.close()

    return "".join(page_texts)


# Function to chunk the extracted text
def chunk_text(text):
    """Split text into paragraph chunks on blank lines.

    Paragraphs are delimited by a double newline. Each chunk is stripped of
    surrounding whitespace, and chunks that are empty after stripping are
    dropped so that blank paragraphs are never embedded or retrieved.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty paragraph strings in their original order; an
        empty list when ``text`` contains no non-whitespace characters.
    """
    stripped_paragraphs = (paragraph.strip() for paragraph in text.split('\n\n'))
    return [paragraph for paragraph in stripped_paragraphs if paragraph]

# doc retrieval and reranking function
def retrieve_and_rerank_documents(query, documents, embedding_model, cross_encoder_model, cross_encoder_tokenizer, top_k=3):
    """Return the documents most relevant to ``query`` using retrieve-then-rerank.

    Stage 1 embeds every document and the query, then uses a FAISS flat L2
    index to pull a candidate pool of up to ``top_k * 5`` nearest documents.
    Stage 2 scores each (query, candidate) pair with the cross-encoder and
    keeps the highest-scoring candidates.

    Args:
        query: The query string.
        documents: Sequence of document strings to search.
        embedding_model: Object whose ``encode(texts, normalize_embeddings=True)``
            returns one embedding per text (a 1-D vector for a single string).
        cross_encoder_model: Callable taking the tokenizer output as keyword
            arguments and returning an object with a ``logits`` tensor.
            Assumes one relevance logit per (query, document) pair.
        cross_encoder_tokenizer: Callable that tokenizes paired query/document
            lists and returns a mapping of tensors.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents ordered from most to least relevant according
        to the cross-encoder. Fewer are returned when fewer documents exist;
        an empty list is returned for empty ``documents`` or ``top_k <= 0``.
    """
    documents = list(documents)
    if not documents or top_k <= 0:
        return []

    # doc encoding (FAISS expects float32 arrays)
    doc_embeddings = np.asarray(
        embedding_model.encode(documents, normalize_embeddings=True), dtype=np.float32
    )
    query_embedding = np.asarray(
        embedding_model.encode(query, normalize_embeddings=True), dtype=np.float32
    )

    # FAISS similarity search (Facebook AI Similarity Search). On unit-length
    # vectors, ranking by L2 distance is equivalent to ranking by cosine similarity.
    dimension = doc_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(doc_embeddings)

    # Retrieve more than top_k for re-ranking, but never more than exist:
    # FAISS pads missing neighbours with index -1, which would otherwise
    # silently select documents[-1].
    num_candidates = min(top_k * 5, len(documents))
    _, indices = faiss_index.search(query_embedding.reshape(1, -1), num_candidates)

    # candidate documents (skip any -1 padding defensively)
    retrieved_docs = [documents[idx] for idx in indices.flatten() if idx >= 0]
    if not retrieved_docs:
        return []

    # Cross-encoder reranking
    inputs = cross_encoder_tokenizer(
        [query] * len(retrieved_docs),
        retrieved_docs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    with torch.no_grad():
        # reshape(-1) keeps a 1-D score vector even for a single candidate,
        # where squeeze() would collapse it to a 0-dim tensor.
        scores = cross_encoder_model(**inputs).logits.reshape(-1)

    # top indices (k cannot exceed the number of candidates)
    num_results = min(top_k, len(retrieved_docs))
    top_indices = torch.topk(scores, k=num_results).indices.tolist()
    most_similar_documents = [retrieved_docs[idx] for idx in top_indices]

    return most_similar_documents

# response generation function
def generate_response_chatgpt(prompt, model="gpt-4", max_tokens=400, temperature=0.7):
    """Send a single-turn prompt to the OpenAI chat completions API.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Name of the chat model to use.
        max_tokens: Upper bound on the number of tokens in the reply.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first returned choice with surrounding whitespace
        stripped, or an empty string when the response has no choices or the
        message carries no text content.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    if not response.choices:
        return ""

    # `content` can be None (no text in the message); avoid calling .strip() on it.
    content = response.choices[0].message.content
    return content.strip() if content else ""

In [35]:
# Main
if __name__ == "__main__":
    # End-to-end run: extract pages from the manual, chunk them, retrieve and
    # re-rank the chunks most relevant to the query, ask the chat model to
    # answer from that context, and save the extracted text to Drive.

    pdf_path = '/content/drive/My Drive/Capstone/basics_of_strength_and_conditioning_manual.pdf'

    # Specify the range of pages (zero-based, inclusive)
    start_page = 48
    end_page = 49

    pdf_text = extract_text_from_pdf(pdf_path, start_page, end_page)

    # Split the extracted text into retrievable chunks. Without this step
    # `text_chunks` is undefined when the notebook runs on a fresh kernel.
    text_chunks = chunk_text(pdf_text)

    # Load pre-trained sentence transformer model for embedding
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Load cross-encoder model for re-ranking
    cross_encoder_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")
    cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")

    # Query
    query = "I'm really having a hard time with my squats, how can I improve?"

    # Retrieve and re-rank documents
    most_similar_documents = retrieve_and_rerank_documents(
        query,
        text_chunks,
        embedding_model,
        cross_encoder_model,
        cross_encoder_tokenizer,
        top_k=3
    )

    # Prompt: wrap each retrieved chunk to 100 columns and separate chunks
    # with a blank line to form the CONTEXT section.
    CONTEXT = ""
    for doc in most_similar_documents:
        wrapped_text = textwrap.fill(doc, width=100)
        CONTEXT += wrapped_text + "\n\n"

    prompt = f"""
    Assume the role of a physical trainer or athletics coach, giving short a short phrase of positive encouragement if the user expresses negative sentiment.
    Be sure to do more than just provide positive encouragement and include a short response using the following CONTEXT to answer the QUESTION at the end.
    If you don't know the answer, just say that you don't know; don't try to make up an answer."

    CONTEXT:
    {CONTEXT}
    QUESTION: {query}
    """

    response = generate_response_chatgpt(prompt)

    print("Generated Response:")
    print(response)

    # -> save pdf extracted text
    output_text_path = '/content/drive/My Drive/Capstone/extracted_text.txt'
    with open(output_text_path, 'w', encoding='utf-8') as text_file:
        text_file.write(pdf_text)

Generated Response:
Keep going, progress takes time! The key to improving squats is focusing on form and technique, as mentioned in the manual. Make sure you're properly positioned under the bar, engaging your core, and maintaining the correct torso angle throughout the lift. Also, remember to breathe properly - inhale deeply before starting your descent, and exhale at or near the top of the squat. Try incorporating a variety of abdominal exercises into your routine as well, as strong abdominals help maintain torso stability during the squat. Keep at it, you're doing great!
