In [1]:
!pip install sentence-transformers
!pip install openai

Defaulting to user installation because normal site-packages is not writeable
... (truncated) ...


In [2]:
# Smoke test: if both imports succeed, the packages are available to this kernel
import sentence_transformers
import openai

print("Libraries are installed successfully!")

Libraries are installed successfully!


In [3]:
!pip install sentence-transformers pinecone-client

import os
import numpy as np
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec, Index

# setup
# The Pinecone key is read from the environment so it is never stored in the notebook.
# NOTE(review): any key that was ever saved inside a notebook should be treated as
# leaked and rotated in the Pinecone console.
YOUR_API_KEY = os.environ.get("PINECONE_API_KEY")
if not YOUR_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
INDEX_NAME   = "chatbot-index"

# cleaned dir paths
TXT_CLEANED_DIR = './files/processed_txt_files'
M_CLEANED_DIR   = './files/processed_m_files'

# Initialize sentence transformers model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Connect to Pinecone (control plane: list / create / describe indexes)
pc = Pinecone(api_key=YOUR_API_KEY)

# Create the index if not already present.
# The index dimension must equal the length of the vectors produced by `model`.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index '{INDEX_NAME}' created!")
else:
    print(f"Index '{INDEX_NAME}' already exists.")

# Data plane: look up the index host and open a handle used for upserts and queries
desc = pc.describe_index(INDEX_NAME)
host = desc.host
index = Index(api_key=YOUR_API_KEY, host=host)

# Helper function for Chunking
def chunk_text(text, chunk_size=2000, overlap=200):
    """Split ``text`` into overlapping, fixed-width character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the chunk before it.

    Args:
        text: String to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order. The last chunk may be shorter
        than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. Without this check the window would never
            advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    # Distance between the start positions of two consecutive chunks
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Function to read files, chunk, embed, and upsert
def upsert_folder(folder_path, file_extension, chunk_size=2000, overlap=200, batch_size=100):
    """Chunk, embed and upsert every matching file of a folder into Pinecone.

    Each file is read as UTF-8, split with ``chunk_text`` and written to the
    module-level ``index``. Every record gets the id
    ``"<file_name>_chunk_<i>"`` and metadata holding the chunk text, the
    originating file name and the chunk position.

    Args:
        folder_path: Directory to scan (not recursive).
        file_extension: Only file names ending with this suffix are processed.
        chunk_size: Forwarded to ``chunk_text``.
        overlap: Forwarded to ``chunk_text``.
        batch_size: Number of chunks embedded and upserted per request. Chunks
            are sent in batches instead of one request per chunk to cut down
            on network round trips.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # sorted() makes the processing order (and the log output) reproducible
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(file_extension):
            continue

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        text_chunks = chunk_text(content, chunk_size=chunk_size, overlap=overlap)

        for batch_start in range(0, len(text_chunks), batch_size):
            batch_chunks = text_chunks[batch_start:batch_start + batch_size]
            # Encode the whole batch in one call; one embedding per chunk comes back
            batch_embeddings = model.encode(batch_chunks).tolist()

            records = []
            for offset, (chunk, embedding) in enumerate(zip(batch_chunks, batch_embeddings)):
                chunk_position = batch_start + offset
                metadata = {
                    "text": chunk,
                    "original_file": file_name,
                    "chunk_index": chunk_position
                }
                records.append((f"{file_name}_chunk_{chunk_position}", embedding, metadata))

            index.upsert(records)

        print(f"Upserted {len(text_chunks)} chunks for file: {file_name}")

# Upsert both directories
upsert_folder(TXT_CLEANED_DIR, '.txt', chunk_size=2000, overlap=200)
upsert_folder(M_CLEANED_DIR, '.m', chunk_size=2000, overlap=200)

print("All chunks upserted to Pinecone successfully!")


Defaulting to user installation because normal site-packages is not writeable
... (truncated) ...
All chunks upserted to Pinecone successfully!


In [4]:
# Retrieval sanity check: embed one sample question and list the three closest records
question = "Explain how wing rotation works for insect flight"
query_embedding = model.encode(question).tolist()
results = index.query(vector=query_embedding, top_k=3, include_metadata=True)

for match in results["matches"]:
    print(f"ID: {match['id']}")
    print(f"Score: {match['score']}")
    # Records stored without metadata have no chunk text to preview
    if "metadata" not in match:
        print("No metadata available for this match.\n")
        continue
    snippet = match["metadata"].get("text", "")
    print(f"Chunk Text: {snippet[:200]}...\n")


ID: Dickinson, Lehman, Sane, 1999, Wing Rotation and the Aerodynamic Basis of Insect Flight, science.284.5422.1954.txt
Score: 0.76667285
No metadata available for this match.

ID: Dickinson, Lehman, Sane, 1999, Wing Rotation and the Aerodynamic Basis of Insect Flight, science.284.5422.1954.txt_chunk_0
Score: 0.76667285
Chunk Text: research articles wing rotation and the aerodynamic basis of insect flight michael h dickinson1 fritzolaf lehmann2 sanjay p sane1 insects were the first animals to evolve active flight and remain unsu...

ID: Dickinson, Lehman, Sane, 1999, Wing Rotation and the Aerodynamic Basis of Insect Flight, science.284.5422.1954.txt_chunk_22
Score: 0.764329553
Chunk Text: flapping wings suggest that the aerodynamics of insect flight may be explained by the interaction of three distinct yet interactive mechanisms delayed stall rotational circulation and wake capture whe...



In [5]:
!pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
... (truncated) ...


In [6]:
pip install openai==0.28.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
... (truncated) ...
Note: you may need to restart the kernel to use updated packages.


In [None]:
import openai

# The OpenAI key is taken from the environment; it stays None when the variable is unset
openai.api_key = os.getenv("OPENAI_API_KEY")

def _retrieve_context(question, top_k=3):
    """Embed ``question`` and return the text of the closest Pinecone chunks, joined by blank lines."""
    query_embedding = model.encode(question).tolist()
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    passages = []
    for match in results["matches"]:
        # Matches stored without metadata carry no text to ground an answer on
        metadata = match["metadata"] if "metadata" in match else None
        if not metadata:
            continue
        passage = metadata.get("text", "")
        if passage:
            passages.append(passage)
    return "\n\n".join(passages)


def _build_messages(context_text, question):
    """Build the chat messages: grounding rules and context as system, the question as user."""
    fallback = "I don't know based on the research data provided."
    system_prompt = (
        "You are an expert assistant. Answer the user's question using only the "
        "context below. If the context does not contain the answer, respond with: "
        f"'{fallback}'\n\n"
        f"Context:\n{context_text}"
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]


def chatbot(top_k=3):
    """Run an interactive retrieval-augmented question/answer loop on stdin/stdout.

    For every question the closest ``top_k`` chunks are fetched from the
    module-level Pinecone ``index`` and passed as context to
    ``openai.ChatCompletion`` (``gpt-3.5-turbo``). The system prompt tells the
    model to answer only from that context and to reply with a fixed
    "I don't know" sentence otherwise, so off-topic questions are not answered
    from the model's general knowledge.

    The loop ends when the user types ``exit`` or ``quit`` (case-insensitive),
    or when input is interrupted / exhausted. Blank input is ignored. Errors
    from Pinecone or OpenAI are printed as the answer and the loop continues.

    Args:
        top_k: Number of chunks retrieved per question.
    """
    print("Chatbot ready! Type 'exit' or 'quit' to stop.\n")

    while True:
        try:
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupting the kernel or closing stdin ends the chat cleanly
            print("\nGoodbye!")
            break

        if user_input.lower() in ["exit", "quit"]:
            print("Goodbye!")
            break
        if not user_input:
            continue

        try:
            context_text = _retrieve_context(user_input, top_k=top_k)
        except Exception as e:
            print(f"Chatbot: Error querying Pinecone: {e}\n{'-'*60}\n")
            continue

        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=_build_messages(context_text, user_input),
                temperature=0.7,
            )
            answer = response.choices[0].message.content
        except Exception as e:
            # Best effort: report the failure and keep the session alive
            answer = f"Error calling OpenAI API: {e}"

        print(f"Chatbot: {answer}\n{'-'*60}\n")

# Start the interactive session; it runs until the user types 'exit' or 'quit'
chatbot()

Chatbot ready! Type 'exit' or 'quit' to stop.



User:  "Answer only based on the provided research data embeddings. If the information is not found, respond with: 'I don't know based on the research data provided.'" What is the capital of france?


Chatbot: I don't know based on the research data provided.
------------------------------------------------------------



User:  capital of india


Chatbot: New Delhi
------------------------------------------------------------



In [None]:
import pickle

# CAUTION: unpickling executes code stored in the file, so only load pickles you created yourself.
with open('./files/embeddings.pkl', 'rb') as pkl_file:
    embeddings = pickle.load(pkl_file)

print(f"Number of embeddings: {len(embeddings)}")

# Show the vector length of the first five entries as a quick sanity check
for position, (file_name, embedding) in enumerate(embeddings.items()):
    if position >= 5:
        break
    print(f"File: {file_name}, Embedding Shape: {len(embedding)}")


In [None]:
!pip install pinecone-client

In [None]:
from pinecone import Pinecone, ServerlessSpec

import os

# The Pinecone key is read from the environment so it is never stored in the notebook.
# NOTE(review): any key that was ever saved inside a notebook should be treated as
# leaked and rotated.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
pc = Pinecone(api_key=api_key)

index_name = "chatbot-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Report creation only when the index was actually created by this cell
    print(f"Index {index_name} created successfully!")
else:
    print(f"Index '{index_name}' already exists.")

In [None]:
pip install --upgrade pinecone-client

In [None]:
from pinecone import Pinecone, ServerlessSpec

import os

# Initialize the Pinecone control-plane client.
# The key is read from the environment so it is never stored in the notebook.
# NOTE(review): any key that was ever saved inside a notebook should be treated as
# leaked and rotated.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
pc = Pinecone(api_key=api_key)

index_name = "chatbot-index"

# Create the index if it does not exist yet
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

In [None]:
from pinecone import Pinecone, ServerlessSpec, Index
import pickle
import numpy as np

import os

# Control plane: create or check the index.
# The Pinecone key is read from the environment so it is never stored in the notebook.
# NOTE(review): any key that was ever saved inside a notebook should be treated as
# leaked and rotated.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
pc = Pinecone(api_key=api_key)

index_name = "chatbot-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index '{index_name}' created!")
else:
    print(f"Index '{index_name}' already exists.")

# Data plane: describe the index, connect to its host, then upsert
desc = pc.describe_index(index_name)
host = desc.host
index = Index(
    api_key=api_key,
    host=host
)

# Loading
# CAUTION: unpickling executes code stored in the file, so only load pickles you created yourself.
with open('files/embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

# Upsert
# NOTE(review): these records are keyed by file name and carry no metadata. They share
# "chatbot-index" with the per-chunk records, so chunk queries can return them without
# any text (see the "No metadata available" match in the sample query output).
# Consider a separate namespace or index for them.
records = [
    (file_name, vector.tolist() if isinstance(vector, np.ndarray) else vector)
    for file_name, vector in embeddings.items()
]

# Send vectors in batches rather than one request per vector
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(records[batch_start:batch_start + UPSERT_BATCH_SIZE])

print("Embeddings uploaded successfully!")

# Query with a random vector as a connectivity smoke test; seeded so reruns are comparable
query_vector = np.random.default_rng(0).random(384).tolist()
results = index.query(vector=query_vector, top_k=5, include_metadata=True)

print("Search Results:")
for match in results["matches"]:
    print(f"ID: {match['id']}, Score: {match['score']}")
