# Example Project: An LLM-Based Chatbot

This is a chatbot example using Transmilenio as the main topic.

## 1. Requirements

The following cells install the required dependencies into the workspace.

In [None]:
%pip install transformers faiss-cpu datasets fastapi uvicorn pymupdf accelerate nest_asyncio

In [None]:
%pip install transformers[torch]

## 2. Extract PDF Information

In [None]:
import fitz

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with no separator; an empty string when
        the document has no pages or no extractable text.

    Raises:
        Whatever ``fitz.open`` raises when the file is missing or unreadable.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to parse; the original left it open.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Example: extract the text of a sample PDF and print it as a quick sanity
# check of the extraction step. `path` and `text` become notebook-level names.
path = "./informacion_general.pdf"
text = extract_text_from_pdf(path)
print(text)    

## 3. Generate a Model from a Foundation Model via Fine-Tuning

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import fitz

def extract_text_from_pdf(pdf_path: str) -> str:
    """Read a PDF and return the text of all its pages as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Page texts concatenated in page order without a separator; an empty
        string when nothing can be extracted.

    Raises:
        Whatever ``fitz.open`` raises when the file is missing or unreadable.
    """
    # Open the document in a ``with`` block so it is always closed; building
    # the result with ``join`` avoids repeated string concatenation.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Load the foundation model and its matching tokenizer.
# The sequence-classification wrapper adds a classification head on top of
# DistilBERT; per the load-time message recorded in this notebook, that head's
# weights are newly initialized rather than loaded from the checkpoint.
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare the domain-specific data: the full text of one PDF.
path_concepts = "./informacion_general.pdf"
text_concepts = extract_text_from_pdf(path_concepts)

# Add a dummy label for the purpose of training
# In a real scenario, you should have actual labels for your data
# NOTE(review): the dataset holds a single example (the whole PDF text) with
# the single label 0, so this run only exercises the training pipeline; it is
# presumably not expected to teach the model anything useful — confirm intent.
dataset = Dataset.from_dict({"text": [text_concepts], "labels": [0]})

# tokenize data
def tokenize_function(concepts):
    """Tokenize the ``"text"`` column of a dataset batch.

    Used as the callback for ``Dataset.map``. Each text is truncated and
    padded to the tokenizer's maximum length, so all rows have equal length.
    """
    raw_texts = concepts["text"]
    encoded = tokenizer(raw_texts, truncation=True, padding="max_length")
    return encoded

# Tokenize the dataset; with batched=True, tokenize_function receives a batch
# of rows at a time instead of a single example.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# train model with fine-tuning
# NOTE(review): the run recorded in this notebook finished at global_step=3,
# far below warmup_steps=500, so training ends while the learning rate is
# presumably still warming up — confirm and lower warmup_steps for small data.
# NOTE(review): per_device_eval_batch_size is set, but no evaluation dataset
# is passed to the Trainer below.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Only a training set is supplied: no eval dataset and no metrics callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Runs the fine-tuning loop; outputs go to ./results as configured above.
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:00<00:00, 31.75 examples/s]


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.7817802429199219, metrics={'train_runtime': 14.3268, 'train_samples_per_second': 0.209, 'train_steps_per_second': 0.209, 'total_flos': 397402195968.0, 'train_loss': 0.7817802429199219, 'epoch': 3.0})

## 4. RAG

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModel, AutoTokenizer
import faiss
import numpy as np
import torch

# Embed definition
def embed(text, model, tokenizer):
    """Embed text as the mean of the model's final hidden states.

    Args:
        text: A string, or a list of strings, to embed.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``
            with shape (batch, tokens, hidden).
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors (called with ``return_tensors="pt"``, padding and
            truncation enabled).

    Returns:
        A NumPy array of shape (batch, hidden), one row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        mask = inputs.get("attention_mask")
        if mask is None:
            # No mask available: fall back to a plain mean over all tokens.
            pooled = hidden.mean(dim=1)
        else:
            # Average only over real tokens. A plain mean would also count
            # padding positions and skew the embeddings of shorter texts
            # whenever several texts are embedded in one batch.
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            pooled = (hidden * weights).sum(dim=1) / token_counts
    # Nothing here tracks gradients (no_grad), so no detach() is needed;
    # cpu() keeps the conversion valid if the model runs on another device.
    return pooled.cpu().numpy()

# Encoder used to embed documents and queries. This loads the plain
# "distilbert-base-uncased" checkpoint through AutoModel, not the
# classification model trained in section 3.
embedding_model_name = "distilbert-base-uncased"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

# Generator used to write the answer. Note that this rebinds the notebook-level
# names `model_name`, `model` and `tokenizer` that section 3 also assigns.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# If the tokenizer has no padding token, register '[PAD]' and grow the model's
# embedding matrix to match the enlarged vocabulary.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# The index dimension must match the width of the encoder's hidden states,
# because embed() returns vectors pooled from them.
embedding_dim = embedding_model.config.hidden_size

# Flat (exhaustive) index compared by L2 distance.
index = faiss.IndexFlatL2(embedding_dim)

# Knowledge base: PDF file paths. Vectors are added to the index in this order,
# so an index position maps back to an entry of this list.
documents = ["./technical_report.pdf"]

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Page texts joined in page order without a separator; an empty string
        when nothing can be extracted.

    Raises:
        Whatever ``fitz.open`` raises when the file is missing or unreadable.
    """
    import fitz  # PyMuPDF

    # ``with`` guarantees the document is closed after reading, so repeated
    # calls do not accumulate open file handles.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Embed every document and stack the vectors into one matrix whose row order
# follows `documents`, then add them to the index so that index position i
# corresponds to documents[i].
# NOTE(review): embed() tokenizes with truncation=True, so presumably only the
# leading part of each PDF (up to the tokenizer's maximum length) contributes
# to its vector — confirm, and consider chunking long documents.
embeddings = np.vstack([embed(extract_text_from_pdf(doc), embedding_model, embedding_tokenizer) for doc in documents])
index.add(embeddings)

# Extracted PDF text keyed by path, so each document is parsed at most once
# even though a response is generated per request.
_document_text_cache = {}

# Upper bound on the number of tokens generated for one reply.
MAX_NEW_TOKENS = 150


def _document_text(path: str) -> str:
    """Return the text of the PDF at *path*, extracting it only on first use."""
    if path not in _document_text_cache:
        _document_text_cache[path] = extract_text_from_pdf(path)
    return _document_text_cache[path]


def retrieve(query: str) -> list:
    """Return the entry of ``documents`` nearest to *query* in the index.

    Args:
        query: Free-text query to embed and search for.

    Returns:
        A list with the matching entries of ``documents`` (file paths); empty
        when the index holds no vectors.
    """
    query_embedding = embed(query, embedding_model, embedding_tokenizer)
    query_embedding = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
    _, neighbours = index.search(query_embedding, 1)
    # FAISS reports a missing neighbour as -1; without this filter that would
    # silently select documents[-1].
    return [documents[i] for i in neighbours[0] if i >= 0]


def generate_response(prompt: str) -> str:
    """Answer *prompt* with GPT-2, conditioned on the retrieved document text.

    The retrieved PDFs are read and their text (not their file paths) is
    placed in front of the prompt. The context is cut to fit the model's
    context window while the prompt is kept whole, and only the newly
    generated continuation is returned.

    Args:
        prompt: The user's question.

    Returns:
        The generated reply, without the context or the prompt echoed back.
    """
    context = " ".join(_document_text(path) for path in retrieve(prompt))

    # Reserve room for the reply, then give the prompt priority over the
    # context so the question itself is never truncated away.
    window = model.config.n_positions - MAX_NEW_TOKENS
    prompt_ids = tokenizer.encode(f" {prompt}")[-window:]
    context_budget = window - len(prompt_ids)
    context_ids = []
    if context and context_budget > 0:
        context_ids = tokenizer.encode(
            context, truncation=True, max_length=context_budget
        )

    input_ids = torch.tensor([context_ids + prompt_ids])
    output = model.generate(
        input_ids,
        attention_mask=torch.ones_like(input_ids),  # single unpadded sequence
        max_new_tokens=MAX_NEW_TOKENS,  # counts generated tokens only
        num_return_sequences=1,
        do_sample=True,  # required for temperature/top_k/top_p to take effect
        temperature=0.7,  # control randomness
        top_k=50,  # sample only among the 50 most likely tokens
        top_p=0.9,  # nucleus sampling threshold
        repetition_penalty=2.0,  # penalize repeated output
        pad_token_id=tokenizer.eos_token_id,  # configure the pad token
    )
    # generate() returns the input followed by the continuation; keep only
    # the continuation.
    generated_ids = output[0][input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Example of use: send one question through retrieval and generation and
# print the reply. `prompt` and `response` become notebook-level names.
prompt = "What do you know about bogota?"
response = generate_response(prompt)
print(response)

## 5. Deploy with FastAPI

In [None]:
import nest_asyncio
nest_asyncio.apply()

from fastapi import FastAPI
from pydantic import BaseModel

# Application object; the /chatbot route below is registered on it.
app = FastAPI()

# Request body for POST /chatbot.
class Query(BaseModel):
    # Text of the user's question, passed on to generate_response().
    prompt: str

@app.post("/chatbot")
def chatbot(query: Query):
    """Generate a chatbot reply for the submitted prompt.

    Args:
        query: Request body carrying the user's ``prompt``.

    Returns:
        ``{"response": <text>}`` on success. On failure, an HTTP 500 response
        whose JSON body is ``{"error": <message>}``.
    """
    import logging

    from fastapi.responses import JSONResponse

    try:
        response = generate_response(query.prompt)
    except Exception:
        # Keep the traceback in the server log and signal the failure with a
        # 500 status. Returning 200 made errors look like successes, and
        # echoing the raw exception text exposed internal details to clients.
        logging.getLogger(__name__).exception(
            "Failed to generate a chatbot response"
        )
        return JSONResponse(
            status_code=500,
            content={"error": "Failed to generate a response."},
        )
    return {"response": response}

# Start the API server when this code is run directly (not imported).
if __name__ == "__main__":
    import uvicorn
    # NOTE(review): host "0.0.0.0" listens on all network interfaces, so the
    # endpoint is reachable from other machines; use "127.0.0.1" for purely
    # local testing. The call presumably blocks until the server is stopped.
    uvicorn.run(app, host="0.0.0.0", port=8005)