In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install dateparser
!pip install parsedatetime
!pip install chromadb
!pip install sentence-transformers
!pip install transformers
!pip install langchain
!pip install -U langchain-community

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
import dateparser
import parsedatetime
from datetime import datetime
from langchain.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForCausalLM, AutoTokenizer
import chromadb
import re

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract dates using spaCy
def extract_dates(paragraph):
    doc = nlp(paragraph)
    return [ent.text for ent in doc.ents if ent.label_ == "DATE"]

# Function to convert date strings into exact date objects
def convert_dates(extracted_dates):
    cal = parsedatetime.Calendar()
    converted_dates = []
    for date_str in extracted_dates:
        parsed_date = dateparser.parse(date_str)
        if parsed_date is None:
            time_struct, parse_status = cal.parse(date_str)
            if parse_status == 1:
                parsed_date = datetime(*time_struct[:6])
            else:
                parsed_date = None
        converted_dates.append((date_str, parsed_date))
    return converted_dates

# Function to validate email and phone number
def validate_user_info(name, phone, email):
    phone_valid = re.fullmatch(r'\+?\d{10,15}', phone)
    email_valid = re.fullmatch(r"[^@]+@[^@]+\.[^@]+", email)
    return phone_valid is not None, email_valid is not None

# Function to collect user info conversationally
def collect_user_info():
    name = input("Please provide your name: ")
    phone = input("Please provide your phone number: ")
    email = input("Please provide your email: ")

    phone_valid, email_valid = validate_user_info(name, phone, email)
    if not phone_valid:
        return "Invalid phone number format. Please try again."
    if not email_valid:
        return "Invalid email format. Please try again."

    return f"Thank you, {name}. We will call you at {phone} or email you at {email}."

# Function to load documents and create ChromaDB collection
def setup_document_collection(filepath):
    client = chromadb.Client()
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")  # Format: YYYYMMDD_HHMMSS
    collection_name = f"document_embeddings_{timestamp}"
    collection = client.create_collection(collection_name)
    loader = TextLoader(filepath)
    documents = loader.load()

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    texts = [doc.page_content for doc in documents]
    embeddings = embedding_model.encode(texts)

    for i, (embedding, text) in enumerate(zip(embeddings, texts)):
        collection.add(
            ids=[f"doc_{i}"],
            documents=[text],
            embeddings=[embedding.tolist()],
            metadatas=[{"text": text}]
        )
    return collection

# Function to setup the TinyLLaMA model and tokenizer
def setup_tiny_llama():
    model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    return model, tokenizer

# Function to generate answers using the TinyLLaMA model
def generate_answer(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt")
    # outputs = model.generate(inputs.input_ids, max_new_tokens=150, do_sample=True)
        # Generate response with a limit on max tokens and prevent excessive length
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=150,       # Control the length of the output
        do_sample=True,           # Sampling for variability
        num_return_sequences=1,   # Single output response
        temperature=0.7,           # Adjust temperature for more focused responses
        repetition_penalty=1.2   # Penalize repetitive responses
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function to answer a query by fetching relevant document context and generating a response
def answer_query(query, collection, model, tokenizer, embedding_model, history):
    # Update history with the new query
    history.append((query, ""))
    if len(history) > 5:
        history.pop(0)  # Keep only the latest 5 entries

    # Generate the query embedding
    query_embedding = embedding_model.encode([query])[0]
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=1
    )

    if results['metadatas']:
        # Extract the relevant context text
        context_text = results['metadatas'][0][0]['text']

        # Generate the answer without including the prompt in the output
        generated_answer = generate_answer(context_text, model, tokenizer)

        # Calculate the similarity score
        generated_embedding = embedding_model.encode([generated_answer])[0]
        similarity_score = util.cos_sim(query_embedding, generated_embedding).item() * 100  # Convert to percentage
        print(f"Similarity Score: {similarity_score}")

        # Update the history with the generated answer
        history[-1] = (query, generated_answer)  # Update the latest history entry with the response

        # Return answer if it meets the similarity threshold
        if similarity_score > 40:
            return generated_answer.strip()
        else:
            return "Content not found."
    else:
        return "Content not found."

# Function to book an appointment based on user input
def book_appointment(user_input):
    extracted_dates = extract_dates(user_input)
    converted_dates = convert_dates(extracted_dates)

    for original, converted in converted_dates:
        if converted:
            return f"Appointment successfully booked for {converted.strftime('%Y-%m-%d')}."

    return "Could not book the appointment. Please provide a valid date."

def chatbot_function(filepath):
    # Setting up document collection
    collection = setup_document_collection(filepath)
    tiny_llama_model, tiny_llama_tokenizer = setup_tiny_llama()
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    history = []  # Initialize the history for queries and responses

    # Interaction loop
    while True:
        query = input("You: ")
        if "call me" in query.lower():
            print(collect_user_info())
        elif "book appointment" in query.lower():
            appointment_message = book_appointment(query)
            print(appointment_message)
        else:
            answer = answer_query(query, collection, tiny_llama_model, tiny_llama_tokenizer, embedding_model, history)
            print(f"Chatbot: {answer}")

# Example usage
chatbot_function('/content/Agriculture(1).txt')


  from tqdm.autonotebook import tqdm, trange


You: call me
Please provide your name: testing
Please provide your phone number: 98
Please provide your email: testing@gmail.com
Invalid phone number format. Please try again.
You: call me
Please provide your name: testing
Please provide your phone number: 9878898878
Please provide your email: te
Invalid email format. Please try again.
You: call me
Please provide your name: testing
Please provide your phone number: 9878899667
Please provide your email: testing@gmail.com
Thank you, testing. We will call you at 9878899667 or email you at testing@gmail.com.
You: book appointment for Next Friday
Appointment successfully booked for 2024-10-18.
You: book appointment for Sunday
Appointment successfully booked for 2024-10-13.
You: Explain about Agriculture.
Similarity Score: 61.16095781326294
Chatbot: ﻿Agriculture, one of the oldest human activities, is the foundation of civilization and remains crucial to the sustenance and development of societies worldwide. This practice involves the cultiv