In [2]:
pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting aiohttp (from openai==0.28)
  Using cached aiohttp-3.9.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.5 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->openai==0.28)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->openai==0.28)
  Using cached frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->openai==0.28)
  Using cached multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->openai==0.28)
  Using cached yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (31 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached aiohttp-3.9.5-cp312-cp312-macosx_11_0_arm64.whl (392 kB)
Using cach

In [3]:
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
import fitz  # PyMuPDF
from docx import Document
import ipywidgets as widgets
from IPython.display import display

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Load pre-trained models
# Sentence-embedding model used to embed both the documents and the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Credentials must never be stored in the notebook: read the key from the
# environment instead. NOTE(review): any key that was ever committed in this
# cell should be considered leaked and revoked.
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    # Fail here with a clear message rather than later inside the API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )

In [5]:
# Directory of files to be indexed
# Set the RAG_DOCS_DIR environment variable to point at your own folder. The
# original local path is kept as the fallback so existing setups keep working.
directory_path = os.environ.get(
    'RAG_DOCS_DIR',
    '/Users/alexandrageer/Desktop/rag txt files',
)

In [6]:
# Function to read text from different file types
def read_file(file_path):
    """Return the text content of a .txt, .pdf or .docx file.

    Args:
        file_path: Path of the file to read. The reader is chosen from the
            file extension, compared case-insensitively.

    Returns:
        The extracted text as a string, or "" when the extension is not one
        of the supported types.
    """
    lowered = file_path.lower()
    if lowered.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    if lowered.endswith('.pdf'):
        with fitz.open(file_path) as doc:
            # Concatenate the text of every page in order.
            return "".join(page.get_text() for page in doc)
    if lowered.endswith('.docx'):
        doc = Document(file_path)
        # Join the paragraph texts with newlines. The join expression must be
        # executed, not returned as a string literal.
        return "\n".join(para.text for para in doc.paragraphs)
    return ""

In [7]:
# Read and process files
# `documents` and `file_paths` are parallel lists: entry i of each refers to
# the same file. Sub-directories and files that yield no text are skipped.
documents = []
file_paths = []
for file_name in os.listdir(directory_path):
    file_path = os.path.join(directory_path, file_name)
    if not os.path.isfile(file_path):
        continue
    content = read_file(file_path)
    if not content:
        continue
    documents.append(content)
    file_paths.append(file_path)

In [8]:
# Generate embeddings
# Encode the full text of every collected document with the sentence-embedding
# model; the result is requested as a tensor (convert_to_tensor=True).
embeddings = model.encode(documents, convert_to_tensor=True)

In [9]:
# Convert embeddings to numpy array
# Guarded so the cell can be re-run safely: once `embeddings` is already a
# NumPy array it has no .cpu()/.detach(), so the conversion is skipped.
if not isinstance(embeddings, np.ndarray):
    embeddings = embeddings.cpu().detach().numpy()

In [10]:
# Indexing with FAISS
# The index is sized from the second axis of the embedding matrix, then all
# document vectors are added in the same order as `file_paths`.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

In [11]:
# Query system
def query_system(query, top_k=5):
    """Return the paths of the indexed files closest to `query`.

    Args:
        query: Text to embed and look up in the index.
        top_k: Maximum number of file paths to return.

    Returns:
        A list of at most `top_k` entries of `file_paths`, in the order the
        index returned them. The list is shorter when fewer documents are
        indexed, and empty when nothing can be retrieved.
    """
    # Never ask for more neighbours than there are indexed vectors.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = model.encode([query], convert_to_tensor=True).cpu().detach().numpy()
    _distances, indices = index.search(query_embedding, k)

    # FAISS pads missing neighbours with -1. Indexing file_paths with -1 would
    # silently return the last file, so such slots are dropped.
    return [file_paths[idx] for idx in indices[0] if 0 <= idx < len(file_paths)]

In [12]:
# Generate answer using retrieved documents
def generate_answer(query, top_k=5):
    """Answer `query` with gpt-3.5-turbo, using retrieved files as context.

    Args:
        query: The user's question.
        top_k: Number of files to retrieve via `query_system`.

    Returns:
        The content of the first chat completion choice, stripped of
        surrounding whitespace.
    """
    doc_paths = query_system(query, top_k)
    # Each retrieved file is re-read from disk and followed by a newline.
    combined_docs = ''.join(read_file(path) + '\n' for path in doc_paths)
    prompt = f"Context: {combined_docs}\n\nQuestion: {query}\n\nAnswer:"

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        max_tokens=200,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response['choices'][0]['message']['content'].strip()

In [13]:
# Create UI components
# Free-text box in which the user types the question to search for.
query_input = widgets.Text(
    description='Query:',
    placeholder='Type your query here',
    value='',
    disabled=False,
)

In [14]:
# Slider selecting how many files to retrieve (1 to 10, default 5). With
# continuous_update=False the value is only committed when the drag ends.
top_k_slider = widgets.IntSlider(
    description='Top K:',
    min=1,
    max=10,
    step=1,
    value=5,
    continuous_update=False,
)

In [15]:
# Button that launches the search; its click handler is attached separately.
search_button = widgets.Button(
    description='Search',
    icon='search',
    tooltip='Click to search',
    button_style='',
    disabled=False,
)

In [16]:
# Output widget that the search handler prints into, so results appear below
# the controls instead of in the cell that defined the handler.
output_area = widgets.Output()

def on_search_button_clicked(b):
    """Handle a click on the search button.

    Clears the output area, reads the query and top-k from the widgets, then
    prints the retrieved file paths followed by the generated answer. An
    empty or whitespace-only query is rejected before any retrieval or API
    call is made.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with output_area:
        output_area.clear_output()
        query = query_input.value.strip()
        top_k = top_k_slider.value

        # Guard: do not spend a retrieval and an API call on an empty query.
        if not query:
            print("Please enter a query.")
            return

        print(f"Query: {query}")
        print("Searching...")
        for result in query_system(query, top_k):
            print(result)

        # generate_answer performs its own retrieval for the same query.
        print("Generating answer...")
        answer = generate_answer(query, top_k)
        print("Generated Answer:")
        print(answer)

# Register the handler so that clicking the button runs the search.
search_button.on_click(on_search_button_clicked)

In [17]:
# Display UI components
# Render the query box, the top-k slider, the search button and the output
# area, in that order.
display(query_input, top_k_slider, search_button, output_area)

Text(value='', description='Query:', placeholder='Type your query here')

IntSlider(value=5, continuous_update=False, description='Top K:', max=10, min=1)

Button(description='Search', icon='search', style=ButtonStyle(), tooltip='Click to search')

Output()