In [3]:
!pip install faiss-cpu
!pip install langdetect
!pip install python-docx
# !pip install ebooklib
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [4]:
# Cell 1: Enhanced Environment Setup
# =================================
import os
import sys
import json
import re
# import fitz  # PyMuPDF
import docx
# import epub
from pathlib import Path
import numpy as np
import torch
import transformers
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)
from sentence_transformers import SentenceTransformer
import faiss
from langdetect import detect
from googletrans import Translator
from typing import List, Dict, Union, Optional

# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["NO_TF"] = "1"

print("\n=== Core Package Versions ===")
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


=== Core Package Versions ===
Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
PyTorch: 2.6.0+cu124
Transformers: 4.51.3
CUDA available: False


In [5]:
# Cell 2: Document Processing System
# =================================
class DocumentProcessor:
    """Handles document uploads and text extraction"""
    
    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        """Extract text from PDF files"""
        text = ""
        try:
            with fitz.open(file_path) as doc:
                for page in doc:
                    text += page.get_text()
        except Exception as e:
            print(f"Error reading PDF: {str(e)}")
        return text
    
    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """Extract text from DOCX files"""
        try:
            doc = docx.Document(file_path)
            return "\n".join([para.text for para in doc.paragraphs])
        except Exception as e:
            print(f"Error reading DOCX: {str(e)}")
            return ""
    
    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        """Extract text from TXT files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT: {str(e)}")
            return ""
            
    
    @staticmethod
    def extract_text_from_epub(file_path: str) -> str:
        """Extract text from EPUB files"""
        text = ""
        try:
            book = epub.read_epub(file_path)
            for item in book.get_items():
                if item.get_type() == epub.EpubHtml:
                    text += item.get_content().decode('utf-8')
        except Exception as e:
            print(f"Error reading EPUB: {str(e)}")
        return text
    
    @staticmethod
    def process_uploaded_file(file_path: str) -> str:
        """Process any supported file format"""
        ext = Path(file_path).suffix.lower()
        if ext == '.pdf':
            return DocumentProcessor.extract_text_from_pdf(file_path)
        elif ext == '.docx':
            return DocumentProcessor.extract_text_from_docx(file_path)
        elif ext == '.txt':
            return DocumentProcessor.extract_text_from_txt(file_path)
        elif ext == '.epub':
            return DocumentProcessor.extract_text_from_epub(file_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

In [6]:
# Cell 3: Knowledge Base Management
# ================================
class KnowledgeBase:
    """Manages the vector database and document storage"""
    
    def __init__(self):
        self.embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        self.index = None
        self.documents = []
        self.doc_embeddings = []
        
    def add_document(self, text: str, metadata: dict = None):
        """Add a document to the knowledge base"""
        if not text.strip():
            return
            
        # Split into chunks (adjust based on your needs)
        chunks = self._chunk_text(text)
        
        for chunk in chunks:
            self.documents.append({
                "text": chunk,
                "metadata": metadata or {}
            })
            
    
    def _chunk_text(self, text: str, chunk_size: int = 512) -> List[str]:
        """Split text into manageable chunks"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        
        for word in words:
            if current_length + len(word) + 1 <= chunk_size:
                current_chunk.append(word)
                current_length += len(word) + 1
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = len(word)
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
            
        return chunks
    
    def build_index(self):
        """Create FAISS index from document embeddings"""
        if not self.documents:
            raise ValueError("No documents to index")
            
        texts = [doc["text"] for doc in self.documents]
        self.doc_embeddings = self.embedder.encode(texts, show_progress_bar=True)
        
        # Create FAISS index
        dimension = self.doc_embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(self.doc_embeddings)
        
    
    def search(self, query: str, k: int = 3) -> List[Dict]:
        """Search for relevant documents"""
        if self.index is None:
            self.build_index()
            
        query_embedding = self.embedder.encode([query])
        distances, indices = self.index.search(query_embedding, k)
        
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx >= 0:  # FAISS may return -1 for invalid indices
                results.append({
                    "text": self.documents[idx]["text"],
                    "metadata": self.documents[idx]["metadata"],
                    "score": float(distance)
                })
        
        return results

In [7]:
from typing import List

In [8]:
# Cell 4: Multilingual Support
# ============================
class MultilingualSupport:
    """Handles language detection and translation"""
    
    def __init__(self):
        self.translator = Translator()
    
    def detect_language(self, text: str) -> str:
        """Detect language of input text"""
        try:
            return detect(text)
        except:
            return "en"  # Default to English
    
    def translate_to_english(self, text: str, src_lang: str = None) -> str:
        """Translate non-English text to English"""
        if not src_lang:
            src_lang = self.detect_language(text)
            
        if src_lang == 'en':
            return text
            
        try:
            translated = self.translator.translate(text, src=src_lang, dest='en')
            return translated.text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text
    
    def translate_from_english(self, text: str, dest_lang: str) -> str:
        """Translate English text to target language"""
        if dest_lang == 'en':
            return text
            
        try:
            translated = self.translator.translate(text, src='en', dest=dest_lang)
            return translated.text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text

# Cell 5: RAG System Integration
# ==============================
class AudreyRAGSystem:
    """Main RAG system for crypto wallet Q&A"""
    
    def __init__(self, model_name: str = "gpt2"):
        # Initialize components
        self.knowledge_base = KnowledgeBase()
        self.multilingual = MultilingualSupport()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Load language model
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
        ).to(self.device)
        
        # Initialize pipeline
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device == "cuda" else -1
        )
    
    def add_documents(self, file_paths: List[str]):
        """Process and add uploaded documents to knowledge base"""
        for file_path in file_paths:
            try:
                text = DocumentProcessor.process_uploaded_file(file_path)
                if text:
                    filename = Path(file_path).name
                    self.knowledge_base.add_document(text, {"source": filename})
                    print(f"Processed: {filename}")
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
        
        # Rebuild index after adding documents
        self.knowledge_base.build_index()
    
    def generate_response(self, query: str, max_length: int = 300) -> str:
        """Generate answer using RAG approach"""
        # Detect query language
        query_lang = self.multilingual.detect_language(query)
        
        # Translate non-English queries to English for retrieval
        if query_lang != 'en':
            english_query = self.multilingual.translate_to_english(query, query_lang)
        else:
            english_query = query
        
        # Retrieve relevant documents
        retrieved_docs = self.knowledge_base.search(english_query)
        context = "\n\n".join([doc["text"] for doc in retrieved_docs])
        
        # Prepare prompt with context
        prompt = f"""Answer the question based on the context below. If you don't know the answer, say you don't know.

Context: {context}

Question: {english_query}
Answer:"""
        
        # Generate response
        response = self.generator(
            prompt,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )[0]['generated_text']
        
        # Extract just the answer part
        answer = response.split("Answer:")[-1].strip()
        
        # Translate back to original language if needed
        if query_lang != 'en':
            answer = self.multilingual.translate_from_english(answer, query_lang)
        
        return answer
    
    def chat_interface(self):
        """Simple command-line chat interface"""
        print("\nWelcome to Audrey Crypto Wallet Assistant!")
        print("Type 'quit' to exit.\n")
        
        while True:
            # User query
            query = input("You: ")
            if query.lower() in ['quit', 'exit']:
                break
                
            # Document upload option
            if query.lower() == 'upload':
                file_paths = input("Enter file paths (comma separated): ").split(',')
                file_paths = [f.strip() for f in file_paths]
                self.add_documents(file_paths)
                print("Documents processed successfully!")
                continue
                
            # Get response
            response = self.generate_response(query)
            print(f"\nAudrey: {response}\n")

In [9]:
from your_module_name import KnowledgeBase  # Replace 'your_module_name' with the correct module name

ModuleNotFoundError: No module named 'your_module_name'

In [None]:
# Cell 6: Main Execution
# ======================
if __name__ == "__main__":
    # Initialize system
    audrey = AudreyRAGSystem()
    
    # Add some default crypto knowledge (optional)
    default_knowledge = [
        "A hardware wallet is a physical device that stores users' private keys offline.",
        "A software wallet is an application that stores private keys on internet-connected devices.",
        "Proof of Work (PoW) is a consensus mechanism that requires computational work to validate transactions.",
        "A private key is a secret number that allows cryptocurrency to be spent.",
        "A public key is derived from a private key and can be shared to receive cryptocurrency."
    ]
    
    for text in default_knowledge:
        audrey.knowledge_base.add_document(text)
    
    # Start chat interface
    audrey.chat_interface()