# Test

In [3]:
# Environment check: print the installed TensorFlow version.
import tensorflow as tf
print(tf.__version__)

2.18.0


In [4]:
# Environment check: print the installed Keras version.
import keras
print(keras.__version__)

3.7.0


In [1]:
import socket
import PyPDF2
import nltk
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Fetches the 'punkt' tokenizer data at import time.
# NOTE(review): nothing else in this cell calls into nltk, so this download
# looks unnecessary — confirm before removing.
nltk.download('punkt')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return the concatenated, stripped text.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text of all pages joined together (no separator) with leading
        and trailing whitespace removed. This function never raises: on any
        failure (missing file, unreadable PDF, ...) it returns a string that
        starts with "Error extracting text from PDF:" followed by the
        exception message.
    """
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # A page without a text layer may yield None/'' from
            # extract_text() depending on the PyPDF2 version; substitute ''
            # so one such page does not make join() fail for the whole file.
            text = ''.join((page.extract_text() or '') for page in reader.pages)
            return text.strip()
    except Exception as e:
        # The server code in this file detects failure by inspecting the
        # returned string, so the error is reported in-band, not re-raised.
        return f"Error extracting text from PDF: {e}"

# Step 2: Chunk text into smaller pieces
def chunk_text(document_text, chunk_size=500, overlap=50):
    """Split text into chunks of at most ``chunk_size`` characters.

    Each chunk is preferably ended at the last sentence terminator
    ('.', '!' or '?') inside its window. When a chunk is cut at such a
    boundary the next chunk starts right after it, so no text is skipped;
    when no terminator is available the window is cut mid-sentence and the
    next chunk re-reads the last ``overlap`` characters to keep context.
    The final chunk always runs to the end of the document.

    Args:
        document_text: The text to split. Empty/None yields an empty list.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks after a
            mid-sentence cut (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not document_text:
        return []

    terminators = ('.', '!', '?')
    total = len(document_text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        window = document_text[start:end]
        next_start = end - overlap
        # Only trim to a sentence boundary when more text follows; trimming
        # the last window would throw away the tail of the document.
        if end < total and not window.endswith(terminators):
            cut = max(window.rfind(mark) for mark in terminators)
            if cut != -1:
                end = start + cut + 1
                window = window[:cut + 1]
                # Resume exactly after the boundary so the text between the
                # cut and the old fixed-step position is not lost.
                next_start = end
        piece = window.strip()
        if piece:  # skip whitespace-only windows
            chunks.append(piece)
        if end >= total:
            break
        start = next_start
    return chunks

# Step 3: Create embeddings
def create_embeddings(chunks):
    """Encode the chunks with the 'all-MiniLM-L6-v2' sentence transformer.

    Args:
        chunks: The text chunks handed to the model's ``encode`` method.

    Returns:
        A ``(embeddings, model)`` tuple: the result of ``model.encode`` and
        the freshly loaded ``SentenceTransformer`` instance, so callers can
        reuse it for query encoding.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    return vectors, encoder

# Step 4: Build FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index from L2-normalised embedding vectors.

    Args:
        embeddings: Array-like of vectors; it is copied into a float32
            NumPy array, so the caller's data is not modified.

    Returns:
        A ``faiss.IndexFlatL2`` holding the normalised vectors.
    """
    vectors = np.array(embeddings).astype('float32')
    # Normalisation happens in place on the local float32 copy.
    faiss.normalize_L2(vectors)
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

# Step 5: Retrieve the most relevant chunk
def retrieve_relevant_chunk(query, model, index, chunks, max_distance=0.8):
    """Return the stored chunk closest to ``query``, or a fallback message.

    Args:
        query: The user's question as a string.
        model: Object whose ``encode([query])`` returns a NumPy array
            (the SentenceTransformer returned by ``create_embeddings``).
        index: FAISS index to search (built by ``create_faiss_index``).
        chunks: The chunk list the index was built from, in the same order.
        max_distance: Largest distance still accepted as a match.
            NOTE(review): FAISS documents IndexFlatL2 distances as squared
            L2, so on unit vectors 0.8 would correspond to a cosine
            similarity of about 0.6 — confirm the intended threshold.

    Returns:
        The best-matching chunk, or the string
        "No relevant information found for your query." when the index
        yields no usable hit or the best hit is farther than
        ``max_distance``.
    """
    query_embedding = model.encode([query]).astype('float32')
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k=1)

    best = int(indices[0][0])
    # FAISS reports "no result" as label -1 (e.g. an empty index). Without
    # this guard chunks[-1] would silently return the last chunk.
    if best < 0 or best >= len(chunks):
        return "No relevant information found for your query."

    if distances[0][0] > max_distance:  # Threshold for match quality
        return "No relevant information found for your query."

    return chunks[best]

# Server Functionality
def start_server(pdf_path):
    """Index a PDF and answer one query per TCP connection on 127.0.0.1:5000.

    The PDF is extracted, chunked, embedded and indexed once. The server
    then accepts connections one at a time, reads a single query (up to
    1024 bytes, UTF-8), replies with the most relevant chunk and closes the
    connection. A query of "exit" (case-insensitive) is answered with
    "Goodbye!" and stops the server.

    Args:
        pdf_path: Path of the PDF to serve answers from.

    Returns:
        An error message string if the PDF could not be read or contains no
        text; otherwise ``None`` once the server has been stopped.
    """
    document_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf reports failure in-band with this exact prefix.
    # Matching the prefix (rather than "Error" anywhere in the text) keeps
    # documents that merely contain the word "Error" usable.
    if not document_text or document_text.startswith("Error extracting text from PDF:"):
        return f"Error: {document_text}"

    chunks = chunk_text(document_text)
    if not chunks:
        return "The PDF contains no readable text."

    embeddings, model = create_embeddings(chunks)
    index = create_faiss_index(embeddings)

    # Socket setup
    host = "127.0.0.1"
    port = 5000
    # The context manager releases the port on every exit path, including
    # KeyboardInterrupt, so the cell can be re-run without a bind error.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
        server_socket.bind((host, port))
        server_socket.listen(1)
        print(f"Server listening on {host}:{port}")

        while True:
            client_socket, address = server_socket.accept()
            print(f"Connection from {address}")

            # Close the client socket even if handling the query raises.
            with client_socket:
                # Receive query
                query = client_socket.recv(1024).decode("utf-8").strip()
                if not query:
                    # Peer disconnected or sent only whitespace.
                    continue
                print(f"Received query: {query}")

                if query.lower() == "exit":
                    response = "Goodbye!"
                    client_socket.sendall(response.encode("utf-8"))
                    break

                # Get relevant chunk
                response = retrieve_relevant_chunk(query, model, index, chunks)
                # sendall: send() may transmit only part of a long chunk.
                client_socket.sendall(response.encode("utf-8"))

# Script entry point: runs only when this code is executed directly.
if __name__ == "__main__":
    pdf_path = "DatasetNLP.pdf"  # Replace with the actual path to your PDF
    # start_server returns its error message as a string instead of
    # printing it; surface it so a bad PDF path does not fail silently.
    result = start_server(pdf_path)
    if result:
        print(result)





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AsmaA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Server listening on 127.0.0.1:5000
Connection from ('127.0.0.1', 53821)
Received query: who is ramesses?


# Run

In [1]:
import socket
import PyPDF2
import nltk
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Fetches the 'punkt' tokenizer data at import time.
# NOTE(review): nothing else in this cell calls into nltk, and the recorded
# output of this cell shows the download failing with an SSL error while the
# server still starts — the download looks unnecessary; confirm before removing.
nltk.download('punkt')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return the concatenated, stripped text.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text of all pages joined together (no separator) with leading
        and trailing whitespace removed. This function never raises: on any
        failure (missing file, unreadable PDF, ...) it returns a string that
        starts with "Error extracting text from PDF:" followed by the
        exception message.
    """
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # A page without a text layer may yield None/'' from
            # extract_text() depending on the PyPDF2 version; substitute ''
            # so one such page does not make join() fail for the whole file.
            text = ''.join((page.extract_text() or '') for page in reader.pages)
            return text.strip()
    except Exception as e:
        # The server code in this file detects failure by inspecting the
        # returned string, so the error is reported in-band, not re-raised.
        return f"Error extracting text from PDF: {e}"

# Step 2: Chunk text into smaller pieces
def chunk_text(document_text, chunk_size=500, overlap=50):
    """Split text into chunks of at most ``chunk_size`` characters.

    Each chunk is preferably ended at the last sentence terminator
    ('.', '!' or '?') inside its window. When a chunk is cut at such a
    boundary the next chunk starts right after it, so no text is skipped;
    when no terminator is available the window is cut mid-sentence and the
    next chunk re-reads the last ``overlap`` characters to keep context.
    The final chunk always runs to the end of the document.

    Args:
        document_text: The text to split. Empty/None yields an empty list.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks after a
            mid-sentence cut (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not document_text:
        return []

    terminators = ('.', '!', '?')
    total = len(document_text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        window = document_text[start:end]
        next_start = end - overlap
        # Only trim to a sentence boundary when more text follows; trimming
        # the last window would throw away the tail of the document.
        if end < total and not window.endswith(terminators):
            cut = max(window.rfind(mark) for mark in terminators)
            if cut != -1:
                end = start + cut + 1
                window = window[:cut + 1]
                # Resume exactly after the boundary so the text between the
                # cut and the old fixed-step position is not lost.
                next_start = end
        piece = window.strip()
        if piece:  # skip whitespace-only windows
            chunks.append(piece)
        if end >= total:
            break
        start = next_start
    return chunks

# Step 3: Create embeddings
def create_embeddings(chunks):
    """Encode the chunks with the 'all-MiniLM-L6-v2' sentence transformer.

    Args:
        chunks: The text chunks handed to the model's ``encode`` method.

    Returns:
        A ``(embeddings, model)`` tuple: the result of ``model.encode`` and
        the freshly loaded ``SentenceTransformer`` instance, so callers can
        reuse it for query encoding.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    return vectors, encoder

# Step 4: Build FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index from L2-normalised embedding vectors.

    Args:
        embeddings: Array-like of vectors; it is copied into a float32
            NumPy array, so the caller's data is not modified.

    Returns:
        A ``faiss.IndexFlatL2`` holding the normalised vectors.
    """
    vectors = np.array(embeddings).astype('float32')
    # Normalisation happens in place on the local float32 copy.
    faiss.normalize_L2(vectors)
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

# Step 5: Retrieve the most relevant chunk
def retrieve_relevant_chunk(query, model, index, chunks, max_distance=0.8):
    """Return the stored chunk closest to ``query``, or a fallback message.

    Args:
        query: The user's question as a string.
        model: Object whose ``encode([query])`` returns a NumPy array
            (the SentenceTransformer returned by ``create_embeddings``).
        index: FAISS index to search (built by ``create_faiss_index``).
        chunks: The chunk list the index was built from, in the same order.
        max_distance: Largest distance still accepted as a match.
            NOTE(review): FAISS documents IndexFlatL2 distances as squared
            L2, so on unit vectors 0.8 would correspond to a cosine
            similarity of about 0.6 — confirm the intended threshold.

    Returns:
        The best-matching chunk, or the string
        "No relevant information found for your query." when the index
        yields no usable hit or the best hit is farther than
        ``max_distance``.
    """
    query_embedding = model.encode([query]).astype('float32')
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k=1)

    best = int(indices[0][0])
    # FAISS reports "no result" as label -1 (e.g. an empty index). Without
    # this guard chunks[-1] would silently return the last chunk.
    if best < 0 or best >= len(chunks):
        return "No relevant information found for your query."

    if distances[0][0] > max_distance:  # Threshold for match quality
        return "No relevant information found for your query."

    return chunks[best]

# Server Functionality
def start_server(pdf_path):
    """Index a PDF and serve retrieval queries over TCP on 127.0.0.1:5000.

    The PDF is extracted, chunked, embedded and indexed once. The server
    then accepts one client at a time and answers every query the client
    sends (up to 1024 bytes each, UTF-8) with the most relevant chunk. A
    query of "exit" (case-insensitive) is answered with "Goodbye!" and ends
    that client's session; the server keeps waiting for the next client
    until the process is interrupted.

    Args:
        pdf_path: Path of the PDF to serve answers from.

    Returns:
        None. Problems with the PDF are printed and the function returns
        without starting the server.
    """
    document_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf reports failure in-band with this exact prefix.
    # Matching the prefix (rather than "Error" anywhere in the text) keeps
    # documents that merely contain the word "Error" usable.
    if not document_text or document_text.startswith("Error extracting text from PDF:"):
        print(f"Error: {document_text}")
        return

    chunks = chunk_text(document_text)
    if not chunks:
        print("The PDF contains no readable text.")
        return

    embeddings, model = create_embeddings(chunks)
    index = create_faiss_index(embeddings)

    # Socket setup
    host = "127.0.0.1"
    port = 5000
    # The context manager releases the port on every exit path, including
    # KeyboardInterrupt, so the cell can be re-run without a bind error.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
        server_socket.bind((host, port))
        server_socket.listen(1)
        print(f"Server listening on {host}:{port}")

        while True:
            print("Waiting for a client connection...")
            client_socket, address = server_socket.accept()
            print(f"Connection established with {address}")

            try:
                while True:
                    # Receive query
                    query = client_socket.recv(1024).decode("utf-8").strip()
                    if not query:
                        break  # Handle empty query or client disconnection
                    print(f"Received query: {query}")

                    if query.lower() == "exit":
                        response = "Goodbye!"
                        client_socket.sendall(response.encode("utf-8"))
                        print(f"Connection with {address} closed.")
                        break

                    # Get relevant chunk
                    response = retrieve_relevant_chunk(query, model, index, chunks)
                    # sendall: send() may transmit only part of a long chunk.
                    client_socket.sendall(response.encode("utf-8"))
            except Exception as e:
                # Per-connection boundary: one failing client must not
                # take the whole server down.
                print(f"Error during communication: {e}")
            finally:
                client_socket.close()

# Script entry point: runs only when this code is executed directly.
if __name__ == "__main__":
    pdf_path = "DatasetNLP.pdf"  # Replace with your actual PDF file path
    # Builds the index, then stays inside start_server's accept loop.
    start_server(pdf_path)





[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


Server listening on 127.0.0.1:5000
Waiting for a client connection...
Connection established with ('127.0.0.1', 52746)
Received query: who is ramesses?
Received query: who is ramesses?
Waiting for a client connection...


# Voice

In [6]:
import socket
import PyPDF2
import nltk
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pyttsx3

# Fetches the 'punkt' tokenizer data at import time.
# NOTE(review): nothing else in this cell calls into nltk, so this download
# looks unnecessary — confirm before removing.
nltk.download('punkt')

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return the concatenated, stripped text.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text of all pages joined together (no separator) with leading
        and trailing whitespace removed. This function never raises: on any
        failure (missing file, unreadable PDF, ...) it returns a string that
        starts with "Error extracting text from PDF:" followed by the
        exception message.
    """
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # A page without a text layer may yield None/'' from
            # extract_text() depending on the PyPDF2 version; substitute ''
            # so one such page does not make join() fail for the whole file.
            text = ''.join((page.extract_text() or '') for page in reader.pages)
            return text.strip()
    except Exception as e:
        # The server code in this file detects failure by inspecting the
        # returned string, so the error is reported in-band, not re-raised.
        return f"Error extracting text from PDF: {e}"

# Step 2: Chunk text into smaller pieces
def chunk_text(document_text, chunk_size=500, overlap=50):
    """Split text into chunks of at most ``chunk_size`` characters.

    Each chunk is preferably ended at the last sentence terminator
    ('.', '!' or '?') inside its window. When a chunk is cut at such a
    boundary the next chunk starts right after it, so no text is skipped;
    when no terminator is available the window is cut mid-sentence and the
    next chunk re-reads the last ``overlap`` characters to keep context.
    The final chunk always runs to the end of the document.

    Args:
        document_text: The text to split. Empty/None yields an empty list.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks after a
            mid-sentence cut (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not document_text:
        return []

    terminators = ('.', '!', '?')
    total = len(document_text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        window = document_text[start:end]
        next_start = end - overlap
        # Only trim to a sentence boundary when more text follows; trimming
        # the last window would throw away the tail of the document.
        if end < total and not window.endswith(terminators):
            cut = max(window.rfind(mark) for mark in terminators)
            if cut != -1:
                end = start + cut + 1
                window = window[:cut + 1]
                # Resume exactly after the boundary so the text between the
                # cut and the old fixed-step position is not lost.
                next_start = end
        piece = window.strip()
        if piece:  # skip whitespace-only windows
            chunks.append(piece)
        if end >= total:
            break
        start = next_start
    return chunks

# Step 3: Create embeddings
def create_embeddings(chunks):
    """Encode the chunks with the 'all-MiniLM-L6-v2' sentence transformer.

    Args:
        chunks: The text chunks handed to the model's ``encode`` method.

    Returns:
        A ``(embeddings, model)`` tuple: the result of ``model.encode`` and
        the freshly loaded ``SentenceTransformer`` instance, so callers can
        reuse it for query encoding.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    return vectors, encoder

# Step 4: Build FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index from L2-normalised embedding vectors.

    Args:
        embeddings: Array-like of vectors; it is copied into a float32
            NumPy array, so the caller's data is not modified.

    Returns:
        A ``faiss.IndexFlatL2`` holding the normalised vectors.
    """
    vectors = np.array(embeddings).astype('float32')
    # Normalisation happens in place on the local float32 copy.
    faiss.normalize_L2(vectors)
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

# Step 5: Retrieve the most relevant chunk
def retrieve_relevant_chunk(query, model, index, chunks, max_distance=0.8):
    """Return the stored chunk closest to ``query``, or a fallback message.

    Args:
        query: The user's question as a string.
        model: Object whose ``encode([query])`` returns a NumPy array
            (the SentenceTransformer returned by ``create_embeddings``).
        index: FAISS index to search (built by ``create_faiss_index``).
        chunks: The chunk list the index was built from, in the same order.
        max_distance: Largest distance still accepted as a match.
            NOTE(review): FAISS documents IndexFlatL2 distances as squared
            L2, so on unit vectors 0.8 would correspond to a cosine
            similarity of about 0.6 — confirm the intended threshold.

    Returns:
        The best-matching chunk, or the string
        "No relevant information found for your query." when the index
        yields no usable hit or the best hit is farther than
        ``max_distance``.
    """
    query_embedding = model.encode([query]).astype('float32')
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k=1)

    best = int(indices[0][0])
    # FAISS reports "no result" as label -1 (e.g. an empty index). Without
    # this guard chunks[-1] would silently return the last chunk.
    if best < 0 or best >= len(chunks):
        return "No relevant information found for your query."

    if distances[0][0] > max_distance:  # Threshold for match quality
        return "No relevant information found for your query."

    return chunks[best]

# Step 6: Synthesize text-to-speech (TTS) and save audio
def synthesize_speech(text, output_file="output.wav"):
    """Render ``text`` to an audio file with pyttsx3.

    Args:
        text: The text to speak.
        output_file: Destination path handed to ``save_to_file``.

    Returns:
        ``output_file`` when synthesis completed without an exception,
        otherwise ``None`` (the error is printed, not raised).
    """
    try:
        tts = pyttsx3.init()
        tts.save_to_file(text, output_file)
        # save_to_file only queues the job; runAndWait processes the queue.
        tts.runAndWait()
    except Exception as err:
        print(f"Error during speech synthesis: {err}")
        return None
    else:
        return output_file

# Server Functionality
def start_server(pdf_path):
    """Index a PDF and serve text + speech answers over TCP on 127.0.0.1:1234.

    The PDF is extracted, chunked, embedded and indexed once. The server
    then accepts one client at a time. For every query (up to 1024 bytes,
    UTF-8) it sends, in this order: the best-matching chunk as UTF-8 text,
    the marker b"AUDIO_INCOMING", and the raw bytes of "response.wav"
    synthesised from that chunk. If synthesis fails the marker and audio
    are replaced by b"Error generating audio.". A query of "exit"
    (case-insensitive) is answered with "Goodbye!" and ends that client's
    session; the server keeps waiting for the next client until the
    process is interrupted.

    Args:
        pdf_path: Path of the PDF to serve answers from.

    Returns:
        None. Problems with the PDF are printed and the function returns
        without starting the server.
    """
    document_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf reports failure in-band with this exact prefix.
    # Matching the prefix (rather than "Error" anywhere in the text) keeps
    # documents that merely contain the word "Error" usable.
    if not document_text or document_text.startswith("Error extracting text from PDF:"):
        print(f"Error: {document_text}")
        return

    chunks = chunk_text(document_text)
    if not chunks:
        print("The PDF contains no readable text.")
        return

    embeddings, model = create_embeddings(chunks)
    index = create_faiss_index(embeddings)

    # Socket setup
    host = "127.0.0.1"
    port = 1234
    # The context manager releases the port on every exit path, including
    # KeyboardInterrupt, so the cell can be re-run without a bind error.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
        server_socket.bind((host, port))
        server_socket.listen(1)
        print(f"Server listening on {host}:{port}")

        while True:
            print("Waiting for a client connection...")
            client_socket, address = server_socket.accept()
            print(f"Connection established with {address}")

            try:
                while True:
                    # Receive query
                    query = client_socket.recv(1024).decode("utf-8").strip()
                    if not query:
                        break  # Handle empty query or client disconnection
                    print(f"Received query: {query}")

                    if query.lower() == "exit":
                        response = "Goodbye!"
                        client_socket.sendall(response.encode("utf-8"))
                        print(f"Connection with {address} closed.")
                        break

                    # Get relevant chunk
                    text_response = retrieve_relevant_chunk(query, model, index, chunks)
                    print(f"Text Response: {text_response}")

                    # NOTE(review): text, marker and audio are written
                    # back-to-back with no length prefix or delimiter, so
                    # the client cannot reliably tell where each part ends.
                    # The wire format is left as-is because the client is
                    # not visible here; consider adding framing on both
                    # sides.

                    # Step 1: Send text response (sendall: send() may
                    # transmit only part of a long payload).
                    client_socket.sendall(text_response.encode("utf-8"))
                    print("Text response sent.")

                    # Step 2: Convert text response to speech and send the audio file
                    audio_file = synthesize_speech(text_response, output_file="response.wav")
                    if audio_file:
                        # Inform the client that audio is being sent
                        client_socket.sendall(b"AUDIO_INCOMING")
                        print("Notifying client about incoming audio.")

                        # Send the audio file content to the client
                        with open(audio_file, "rb") as f:
                            audio_data = f.read()
                        client_socket.sendall(audio_data)
                        print("Audio response sent.")
                    else:
                        client_socket.sendall(b"Error generating audio.")
            except Exception as e:
                # Per-connection boundary: one failing client must not
                # take the whole server down.
                print(f"Error during communication: {e}")
            finally:
                client_socket.close()


# Script entry point: runs only when this code is executed directly.
if __name__ == "__main__":
    pdf_path = "DatasetNLP.pdf"  # Replace with your actual PDF file path
    # Builds the index, then stays inside start_server's accept loop.
    start_server(pdf_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AsmaA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Server listening on 127.0.0.1:1234
Waiting for a client connection...
Connection established with ('127.0.0.1', 55884)
Received query: who is ramesses?
Text Response: 1. Ramesses III  
Historical Context  
• Dynasty and Time Period : Ramesses III was the second pharaoh of the 20th Dynasty (1186 –1155 
BCE), during a time of increasing external threats and internal instability.  
• Major Accomplishments :  
o Defensive Campaigns : Successfully repelled invasions by the Sea Peoples, securing 
Egypt’s borders. This victory is recorded in the temple of Medinet Habu, where detailed 
battle reliefs portray the conflict.
Text response sent.
Notifying client about incoming audio.
Audio response sent.
Received query: who is ramesses?
Text Response: 1. Ramesses III  
Historical Context  
• Dynasty and Time Period : Ramesses III was the second pharaoh of the 20th Dynasty (1186 –1155 
BCE), during a time of increasing external threats and internal instability.  
• Major Accomplishments :  
o Defen