In [9]:
# Environment Setup
!pip install faiss-cpu langdetect python-docx googletrans==4.0.0-rc1 nltk \ sentence-transformers transformers torch datasets rank_bm25
!pip uninstall bitsandbytes -y
!pip install bitsandbytes
!pip install --upgrade transformers bitsandbytes

[0mCollecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.0
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.52.4


In [10]:
# First install the required packages if not already installed
# !pip install rank_bm25 googletrans==4.0.0-rc1 transformers sentence-transformers faiss-cpu nltk python-docx datasets

import bitsandbytes
import os
import sys
import json
import re
import docx
from pathlib import Path
import numpy as np
import torch
import transformers
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from sentence_transformers import SentenceTransformer
import faiss
from langdetect import detect
from googletrans import Translator
from typing import List, Dict, Union, Optional
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data for tokenization.
# 'punkt' serves older NLTK releases; newer releases make word_tokenize look up
# 'punkt_tab' instead, so both are requested (nltk is installed unpinned above).
nltk.download('punkt')
nltk.download('punkt_tab')

# Set environment variables
# Silences the tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Report the versions actually loaded in this kernel.
print("\n=== Core Package Versions ===")
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

class DocumentProcessor:
    """Plain-text extraction for the file formats the assistant accepts (.docx, .txt)."""

    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """Return the non-blank paragraphs of a DOCX file joined by newlines.

        Any failure is printed and reported as an empty string.
        """
        try:
            document = docx.Document(file_path)
            kept_paragraphs = []
            for paragraph in document.paragraphs:
                # Skip paragraphs that contain only whitespace.
                if paragraph.text.strip():
                    kept_paragraphs.append(paragraph.text)
            return "\n".join(kept_paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {str(e)}")
            return ""

    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        """Return the full contents of a UTF-8 text file.

        Any failure is printed and reported as an empty string.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                contents = handle.read()
        except Exception as e:
            print(f"Error reading TXT: {str(e)}")
            return ""
        return contents

    @staticmethod
    def process_uploaded_file(file_path: str) -> str:
        """Extract text from a file, choosing the reader by its (case-insensitive) extension.

        Raises:
            ValueError: if the extension is neither .docx nor .txt.
        """
        ext = Path(file_path).suffix.lower()
        readers = {
            '.docx': DocumentProcessor.extract_text_from_docx,
            '.txt': DocumentProcessor.extract_text_from_txt,
        }
        reader = readers.get(ext)
        if reader is None:
            raise ValueError(f"Unsupported file format: {ext}")
        return reader(file_path)

class KnowledgeBase:
    """Chunked document store searchable by embeddings (FAISS) and keywords (BM25).

    Documents are split into sentence-aligned chunks by ``add_document``;
    ``build_index`` embeds every chunk and builds both indexes;
    ``hybrid_search`` blends the two rankings.
    """

    def __init__(self):
        """Load the sentence-embedding model and start with an empty store."""
        # Improved embedding model
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None          # FAISS index; None until build_index() runs
        self.documents = []        # one {"text": ..., "metadata": ...} dict per chunk
        self.doc_embeddings = []   # normalized chunk embeddings, set by build_index()
        self.bm25 = None           # BM25 index; None until build_index() runs
        self.tokenized_docs = []   # BM25 token lists, parallel to self.documents

    def add_document(self, text: str, metadata: Optional[dict] = None):
        """Chunk ``text`` and append the chunks to the store.

        Args:
            text: Raw document text; empty or whitespace-only text is ignored.
            metadata: Optional metadata attached (as a copy) to every chunk.

        Adding chunks drops any existing indexes so that the next
        ``hybrid_search`` rebuilds them instead of searching stale data.
        """
        if not text or not text.strip():
            return

        chunks = self._chunk_text(text)

        for chunk in chunks:
            self.documents.append({
                "text": chunk,
                # Copy so chunks do not share one mutable metadata dict.
                "metadata": dict(metadata) if metadata else {}
            })

        if chunks:
            self.index = None
            self.bm25 = None

    def _chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 64) -> List[str]:
        """Split ``text`` into sentence-aligned chunks with a token-based overlap.

        Args:
            text: Text to split.
            chunk_size: Maximum number of tokens per chunk. A single sentence
                longer than this still becomes its own (oversized) chunk.
            overlap: Maximum number of trailing tokens (whole sentences only)
                repeated at the start of the next chunk.

        Returns:
            The list of chunk strings; sentences inside a chunk are joined by a space.
        """
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        chunks = []
        current = []          # (sentence, token_count) pairs of the chunk being built
        current_length = 0

        for sentence in sentences:
            if not sentence.strip():
                continue
            sentence_length = len(word_tokenize(sentence))
            if current and current_length + sentence_length > chunk_size:
                chunks.append(" ".join(s for s, _ in current))
                # Carry over only as many trailing sentences as fit both the
                # overlap budget and the room left beside the new sentence.
                current = self._overlap_tail(current, overlap, chunk_size - sentence_length)
                current_length = sum(n for _, n in current)
            current.append((sentence, sentence_length))
            current_length += sentence_length

        if current:
            chunks.append(" ".join(s for s, _ in current))

        return chunks

    @staticmethod
    def _overlap_tail(sentences, overlap: int, budget: int):
        """Return the trailing (sentence, token_count) pairs within min(overlap, budget) tokens."""
        limit = min(overlap, budget)
        tail = []
        used = 0
        for sentence, n_tokens in reversed(sentences):
            if used + n_tokens > limit:
                break
            tail.append((sentence, n_tokens))
            used += n_tokens
        tail.reverse()
        return tail

    def build_index(self):
        """Create the FAISS index and the BM25 index from the stored chunks.

        Raises:
            ValueError: if no documents have been added.
        """
        if not self.documents:
            raise ValueError("No documents to index")

        texts = [doc["text"] for doc in self.documents]

        # Build semantic index
        embeddings = self.embedder.encode(texts, show_progress_bar=True)
        # FAISS expects contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        faiss.normalize_L2(embeddings)  # Normalize so inner product equals cosine similarity
        index = faiss.IndexFlatIP(embeddings.shape[1])  # Using Inner Product for similarity
        index.add(embeddings)
        self.doc_embeddings = embeddings
        self.index = index

        # Build keyword index
        self.tokenized_docs = [self._preprocess_text(doc["text"]) for doc in self.documents]
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def _preprocess_text(self, text: str) -> List[str]:
        """Lower-case and tokenize ``text``, dropping stop words and non-alphanumeric tokens."""
        tokens = word_tokenize(text.lower())
        return [token for token in tokens if token not in ENGLISH_STOP_WORDS and token.isalnum()]

    @staticmethod
    def _min_max_normalize(scores) -> np.ndarray:
        """Rescale ``scores`` to [0, 1] without dividing by zero.

        When all scores are equal the result is all ones if the shared value is
        positive (every candidate matched equally well) and all zeros otherwise.
        """
        values = np.asarray(scores, dtype=np.float64)
        if values.size == 0:
            return values
        low = values.min()
        high = values.max()
        if high > low:
            return (values - low) / (high - low)
        return np.ones_like(values) if high > 0 else np.zeros_like(values)

    def hybrid_search(self, query: str, k: int = 3, alpha: float = 0.5) -> List[Dict]:
        """
        Perform hybrid search combining semantic and keyword-based approaches
        alpha = 0: pure keyword search
        alpha = 1: pure semantic search

        Args:
            query: Search query.
            k: Number of results to return (capped at the number of stored chunks).
            alpha: Weight of the semantic score; the keyword score gets ``1 - alpha``.

        Returns:
            Up to ``k`` dicts with keys "text", "metadata", "semantic_score",
            "keyword_score" and "combined_score", best match first. Both partial
            scores are min-max normalized within their own top-k candidates; a
            chunk missing from one candidate list scores 0 for that part.

        Raises:
            ValueError: if the indexes must be built and no documents exist.
        """
        if self.index is None or self.bm25 is None:
            self.build_index()

        k = min(k, len(self.documents))
        if k <= 0:
            return []

        # Semantic search
        query_embedding = np.ascontiguousarray(self.embedder.encode([query]), dtype=np.float32)
        faiss.normalize_L2(query_embedding)
        raw_semantic_scores, raw_semantic_indices = self.index.search(query_embedding, k)
        semantic_indices = np.asarray(raw_semantic_indices[0])
        semantic_scores = np.asarray(raw_semantic_scores[0])
        # Negative ids mark unfilled result slots; keep only real hits.
        found = semantic_indices >= 0
        semantic_indices = semantic_indices[found]
        semantic_scores = semantic_scores[found]

        # Keyword search
        tokenized_query = self._preprocess_text(query)
        keyword_scores = np.asarray(self.bm25.get_scores(tokenized_query), dtype=np.float64)
        top_keyword_indices = np.argsort(keyword_scores)[-k:][::-1]
        top_keyword_scores = keyword_scores[top_keyword_indices]

        # Normalize scores and key them by document id
        semantic_by_doc = {
            int(idx): float(score)
            for idx, score in zip(semantic_indices, self._min_max_normalize(semantic_scores))
        }
        keyword_by_doc = {
            int(idx): float(score)
            for idx, score in zip(top_keyword_indices, self._min_max_normalize(top_keyword_scores))
        }

        # Combine results
        combined_results = []
        for idx in sorted(set(semantic_by_doc) | set(keyword_by_doc)):
            if idx < 0 or idx >= len(self.documents):
                continue
            semantic_score = semantic_by_doc.get(idx, 0.0)
            keyword_score = keyword_by_doc.get(idx, 0.0)
            combined_score = alpha * semantic_score + (1 - alpha) * keyword_score

            combined_results.append({
                "text": self.documents[idx]["text"],
                "metadata": self.documents[idx]["metadata"],
                "semantic_score": float(semantic_score),
                "keyword_score": float(keyword_score),
                "combined_score": float(combined_score)
            })

        # Sort by combined score
        combined_results.sort(key=lambda x: x["combined_score"], reverse=True)

        return combined_results[:k]

class MultilingualSupport:
    """Language detection and translation to/from English via langdetect and googletrans."""

    # Codes accepted from the detector. The detector reports Chinese with region
    # suffixes ('zh-cn' / 'zh-tw'), so those are listed next to the bare 'zh'.
    SUPPORTED_LANGUAGES = frozenset(
        ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'zh', 'zh-cn', 'zh-tw', 'ja', 'ko']
    )

    # Maximum characters sent to the translator in one request.
    MAX_TRANSLATION_CHARS = 500

    def __init__(self):
        """Create the translator client."""
        self.translator = Translator()

    def detect_language(self, text: str) -> str:
        """Return the detected language code, or 'en' when detection fails or is unsupported."""
        try:
            lang = detect(text)
        except Exception:
            return "en"  # Default to English
        # Validate it's a supported language
        if lang in self.SUPPORTED_LANGUAGES:
            return lang
        return 'en'

    @staticmethod
    def _split_text(text: str, max_chars: int = 500) -> List[str]:
        """Split ``text`` into pieces of at most ``max_chars`` characters.

        Cuts are made at the last space or newline inside the limit so words are
        not broken; text without such whitespace is cut exactly at the limit.
        """
        pieces = []
        remaining = text
        while len(remaining) > max_chars:
            window = remaining[:max_chars + 1]
            cut = max(window.rfind(" "), window.rfind("\n"))
            if cut <= 0:
                cut = max_chars
            piece = remaining[:cut].rstrip()
            if piece:
                pieces.append(piece)
            remaining = remaining[cut:].lstrip()
        if remaining:
            pieces.append(remaining)
        return pieces

    def _translate(self, text: str, src: str, dest: str) -> str:
        """Translate ``text`` piece by piece; on any error print it and return ``text`` unchanged."""
        try:
            translated_parts = [
                self.translator.translate(part, src=src, dest=dest).text
                for part in self._split_text(text, self.MAX_TRANSLATION_CHARS)
            ]
            return " ".join(translated_parts)
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text

    def translate_to_english(self, text: str, src_lang: str = None) -> str:
        """Translate non-English text to English.

        Args:
            text: Text to translate.
            src_lang: Source language code; detected from ``text`` when omitted.

        Returns:
            The English translation, or ``text`` unchanged when it is already
            English or the translation fails (the error is printed).
        """
        if not src_lang:
            src_lang = self.detect_language(text)

        if src_lang == 'en':
            return text

        return self._translate(text, src=src_lang, dest='en')

    def translate_from_english(self, text: str, dest_lang: str) -> str:
        """Translate English text to ``dest_lang``.

        Returns:
            The translation, or ``text`` unchanged when ``dest_lang`` is 'en'
            or the translation fails (the error is printed).
        """
        if dest_lang == 'en':
            return text

        return self._translate(text, src='en', dest=dest_lang)

class CryptoWalletAssistant:
    """Retrieval-augmented Q&A assistant for crypto wallet topics with strict prompting.

    Combines a KnowledgeBase (retrieval), MultilingualSupport (query/answer
    translation) and a causal language model driven through a text-generation
    pipeline.
    """

    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.1"):
        """Load the tokenizer, model and generation pipeline, then seed the knowledge base.

        Args:
            model_name: Hugging Face model id. The default repository is gated;
                the recorded run of this notebook failed with an OSError until
                access is granted and the session is authenticated.
        """
        self.knowledge_base = KnowledgeBase()
        self.multilingual = MultilingualSupport()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        load_kwargs = {"device_map": "auto"}
        if self.device == "cuda":
            # Quantization config for memory efficiency
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
            load_kwargs["torch_dtype"] = torch.float16
        else:
            # 4-bit quantization is only requested when a CUDA device is present.
            print("CUDA not available: loading the model without 4-bit quantization "
                  "(slow and memory-hungry on CPU).")
            load_kwargs["torch_dtype"] = torch.float32

        # Load language model
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device_map="auto"
        )

        # Add default crypto knowledge
        self._initialize_default_knowledge()

    def _initialize_default_knowledge(self):
        """Add the built-in crypto facts to the knowledge base and build its indexes."""
        default_knowledge = [
            "Hardware wallets store private keys offline, ensuring top-tier security against cyber threats.",
            "Software wallets provide easy access to crypto but remain vulnerable to hacking and malware.",
            "Proof of Work (PoW) validates transactions via computational puzzles solved by miners, securing the network.",
            "Private keys are secret cryptographic codes enabling access to funds—losing them means losing assets.",
            "Public keys are derived from private keys, allowing users to receive cryptocurrency without exposing sensitive data.",
            "Cold storage keeps assets completely offline, minimizing risks from online attacks.",
            "Seed phrases generate wallet private keys and must be backed up securely to prevent irreversible loss.",
            "Multi-signature wallets require multiple authorizations for transactions, ideal for institutional security.",
            "ERC-20 and ERC-721 are common Ethereum standards for fungible and non-fungible tokens, respectively.",
            "Gas fees compensate blockchain validators for processing transactions efficiently.",
            "Hierarchical Deterministic (HD) wallets simplify key management by deriving all keys from a single root key.",
            "Smart contract wallets allow programmable execution of transactions based on predefined conditions.",
            "Two-factor authentication (2FA) strengthens security by requiring a secondary verification method.",
            "Hot wallets are designed for frequent transactions like trading and payments but are vulnerable to cyberattacks.",
            "Security measures such as multi-factor authentication (MFA) and end-to-end encryption are essential.",
            "Cold wallets provide secure long-term storage by keeping private keys offline, reducing the risk of cyber threats. However, they remain susceptible to physical theft.",
            "Examples of hot wallets include MetaMask (browser extension) and Coinbase Wallet (exchange-linked).",
            "Examples of cold wallets include Ledger Nano X (Bluetooth-enabled) and Trezor Model T (touchscreen).",
            "Best practices for wallet usage include keeping less than 10% of assets in hot wallets for liquidity and using multi-signature cold wallets for institutional security (e.g., 3-of-5 signatures).",
            "Wallet monitoring through blockchain explorers (e.g., Etherscan) enhances security by detecting unusual activity.",
            "Bitcoin's Proof of Work (PoW) consumes ~127 TWh annually, roughly equivalent to Norway's energy consumption.",
            "Ethereum's shift to Proof of Stake (PoS) reduced energy usage by 99.95% after the Merge in 2022, making it one of the most sustainable blockchain networks.",
            "Alternative consensus mechanisms include PoS (Ethereum, Cardano) and hybrid models like Solana's Proof-of-History (PoH).",
            "Renewable mining solutions leverage geothermal energy in Iceland and excess wind/solar power in Texas.",
            "Hardware improvements such as ENERGY STAR-certified servers help reduce energy waste in blockchain data centers.",
            "Validators stake cryptocurrency as collateral to validate transactions and propose blocks.",
            "PoS is over 99% more energy-efficient than PoW, making it the preferred method for eco-friendly blockchains.",
            "PoS networks achieve high transaction speeds, with Solana processing 65,000 TPS compared to Bitcoin's 7 TPS.",
            "Popular PoS-based cryptocurrencies include Ethereum (smart contracts & DeFi), Cardano (peer-reviewed security), and Polkadot (cross-chain interoperability).",
            "Risks of PoS include centralization, as wealthier validators gain disproportionate influence over the network.",
            "Liquidity concerns arise because staked tokens remain locked, such as Ethereum's minimum 32 ETH requirement for solo staking.",
            "Users can stake tokens directly or join liquid staking pools, such as Lido Finance, to earn rewards while maintaining token accessibility.",
            "Green mining initiatives include Marathon Digital's carbon offsets and Sweden's mining rigs repurposing heat for residential buildings.",
            "Upcoming regulations include EU MiCA's mandatory PoW energy disclosures by 2025 and U.S. SEC audits for crypto companies focusing on ESG compliance.",
            "Modular data centers are increasingly deployed near renewable energy sources, such as hydroelectric and wind farms.",
            "Networks using PoS achieve high transaction speeds, with Solana processing 65,000 transactions per second compared to Bitcoin's 7.",
            "Cryptocurrency wallet strategies differentiate between hot and cold wallets based on security and usability.",
            "Hot wallets optimize for frequent transactions, such as trading and payments, but remain vulnerable to cyberattacks, requiring multi-factor authentication (MFA) and end-to-end encryption.",
            "Cold wallets provide secure long-term storage by keeping private keys offline, reducing exposure to online threats, though they can still be physically compromised.",
            "Examples of hot wallets include MetaMask, a popular browser extension, and Coinbase Wallet, which integrates with exchange platforms.",
            "Cold wallet solutions like Ledger Nano X with Bluetooth connectivity and Trezor Model T with touchscreen functionality offer enhanced offline security.",
            "Best practices recommend limiting hot wallet funds to below 10% for liquidity and using multi-signature cold wallets for institutional protection, such as requiring 3-of-5 signatures for transaction approval.",
            "Wallet monitoring through blockchain explorers like Etherscan enables users to detect suspicious activity and enhance security.",
            "Bitcoin's Proof-of-Work (PoW) consensus mechanism consumes approximately 127 terawatt-hours annually, equivalent to Norway's total energy consumption.",
            "Ethereum transitioned from PoW to Proof-of-Stake (PoS) in 2022, reducing energy usage by 99.95% post-Merge, establishing itself as one of the most sustainable blockchain networks.",
            "Alternative consensus models to PoW include PoS networks like Cardano, which use validator staking instead of mining, and Solana's hybrid Proof-of-History (PoH) system for enhanced efficiency.",
        ]

        for text in default_knowledge:
            self.knowledge_base.add_document(text, {"source": "default_knowledge"})

        self.knowledge_base.build_index()

    def load_crypto_dataset(self, dataset_name: str = "crypto_qa"):
        """Load a Q&A dataset and add each question/answer pair to the knowledge base.

        Args:
            dataset_name: Dataset id passed to ``load_dataset``; only examples
                having both a 'question' and an 'answer' field are used.

        Errors are printed rather than raised.
        """
        try:
            # Example dataset - in practice you would use your own curated dataset
            dataset = load_dataset(dataset_name, split='train')

            for example in dataset:
                if 'question' in example and 'answer' in example:
                    # Add both question and answer as documents to capture different phrasings
                    self.knowledge_base.add_document(
                        f"Question: {example['question']}\nAnswer: {example['answer']}",
                        {"source": dataset_name}
                    )

            self.knowledge_base.build_index()
            print(f"Loaded and processed {len(dataset)} examples from {dataset_name}")
        except Exception as e:
            print(f"Error loading dataset: {str(e)}")

    def train_on_crypto_data(self, dataset_name: str = "crypto_qa", epochs: int = 3):
        """Fine-tune the model on a crypto Q&A dataset.

        Each example becomes one causal-LM training sequence
        ("Question ... [Context ...] Answer ... <eos>") whose labels equal its
        input ids, with padding positions masked out of the loss.

        Args:
            dataset_name: Dataset id passed to ``load_dataset``; must provide
                'question' and 'answer' columns, 'context' is optional.
            epochs: Number of training epochs.

        Errors are printed rather than raised.
        NOTE(review): a model loaded with 4-bit quantization presumably cannot be
        trained by Trainer without attaching adapters; confirm before relying on this.
        """
        try:
            dataset = load_dataset(dataset_name, split='train')

            # Preprocess dataset for fine-tuning
            def preprocess_function(examples):
                questions = examples['question']
                contexts = examples.get('context', [''] * len(questions))
                texts = []
                for q, c, a in zip(questions, contexts, examples['answer']):
                    prompt = f"Question: {q}\nContext: {c}" if c else f"Question: {q}"
                    texts.append(f"{prompt}\nAnswer: {a}{self.tokenizer.eos_token}")

                model_inputs = self.tokenizer(texts, max_length=512, truncation=True, padding="max_length")

                # Labels must have the same length as input_ids for a causal LM;
                # -100 removes padding positions from the loss.
                model_inputs["labels"] = [
                    [token if keep == 1 else -100 for token, keep in zip(ids, mask)]
                    for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
                ]
                return model_inputs

            tokenized_dataset = dataset.map(
                preprocess_function,
                batched=True,
                remove_columns=dataset.column_names
            )

            # Training arguments
            training_args = TrainingArguments(
                output_dir="./results",
                num_train_epochs=epochs,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                warmup_steps=500,
                weight_decay=0.01,
                logging_dir="./logs",
                logging_steps=10,
                fp16=True,
                gradient_accumulation_steps=2,
                save_steps=10_000,
                save_total_limit=2
            )

            # Initialize Trainer
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset,
            )

            # Start training
            print("Starting fine-tuning on crypto dataset...")
            trainer.train()
            print("Fine-tuning completed successfully!")

        except Exception as e:
            print(f"Error during fine-tuning: {str(e)}")

    def add_documents(self, file_paths: List[str]):
        """Extract text from each file, add it to the knowledge base and rebuild the indexes.

        Files that fail to process are reported and skipped.
        """
        for file_path in file_paths:
            try:
                text = DocumentProcessor.process_uploaded_file(file_path)
                if text:
                    filename = Path(file_path).name
                    self.knowledge_base.add_document(text, {"source": filename})
                    print(f"Processed: {filename}")
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")

        self.knowledge_base.build_index()

    def generate_response(self, query: str, max_length: int = 300) -> str:
        """Answer ``query`` from retrieved context using a strict prompt.

        Args:
            query: User question in any supported language.
            max_length: Maximum number of tokens to generate for the answer
                (the prompt is not counted against this budget).

        Returns:
            The first line of the generated answer with bracketed citations
            removed, translated back to the query language when it was not English.
        """
        query_lang = self.multilingual.detect_language(query)

        if query_lang != 'en':
            english_query = self.multilingual.translate_to_english(query, query_lang)
        else:
            english_query = query

        # Use hybrid search for better retrieval
        retrieved_docs = self.knowledge_base.hybrid_search(english_query, alpha=0.7)
        context = "\n\n".join([doc["text"] for doc in retrieved_docs])

        # Strict prompt to prevent hallucinations and off-topic answers
        prompt = f"""You are a specialized Crypto Wallet Assistant. Answer the question strictly based on the provided context about cryptocurrency wallets and blockchain technology. 

Follow these rules:
1. If the question is not related to cryptocurrency wallets, blockchain, or digital assets, respond with: "I specialize only in cryptocurrency wallet topics."
2. If the answer isn't found in the context, say "I don't have information about that specific aspect of cryptocurrency wallets."
3. Be precise and technical in your answers.
4. Never provide financial advice or price predictions.
5. Always prioritize security considerations in your answers.

Context: {context}

Question: {english_query}

Answer:"""

        outputs = self.generator(
            prompt,
            # Budget applies to generated tokens only; the prompt alone is long
            # enough to use up a total-length limit of this size.
            max_new_tokens=max_length,
            # Return just the continuation instead of prompt + continuation.
            return_full_text=False,
            num_return_sequences=1,
            temperature=0.3,  # Lower temperature for more deterministic answers
            top_p=0.85,
            do_sample=True,
            repetition_penalty=1.2,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id
        )

        # Extract only the answer part
        answer = outputs[0]['generated_text'].strip()

        # Clean up any trailing incomplete sentences or repetition
        answer = re.sub(r'\[.*?\]', '', answer)  # Remove any citations if present
        answer = answer.split('\n')[0].strip()  # Take only the first line if multiple

        # Answer in the language the question was asked in
        if query_lang != 'en':
            answer = self.multilingual.translate_from_english(answer, query_lang)

        return answer

    def chat_interface(self):
        """Run the interactive command-line chat loop.

        Commands: 'quit'/'exit' leave the loop, 'upload' adds documents, 'train'
        starts fine-tuning; anything else is answered by ``generate_response``.
        Blank input is ignored, and end-of-input or Ctrl-C also leaves the loop.
        """
        print("\n=== Crypto Wallet Assistant ===")
        print("Specialized in cryptocurrency wallet security and blockchain technology")
        print("Type 'quit' to exit, 'upload' to add documents, or 'train' to fine-tune on crypto data\n")

        while True:
            try:
                query = input("User: ").strip()
            except (EOFError, KeyboardInterrupt):
                break

            if not query:
                continue

            command = query.lower()
            if command in ['quit', 'exit']:
                break

            if command == 'upload':
                raw_paths = input("Enter file paths (comma separated): ").split(',')
                file_paths = [f.strip() for f in raw_paths if f.strip()]
                self.add_documents(file_paths)
                print("Documents processed successfully!")
                continue

            if command == 'train':
                self.train_on_crypto_data()
                continue

            response = self.generate_response(query)
            print(f"\nAssistant: {response}\n")


=== Core Package Versions ===
Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
PyTorch: 2.6.0+cu124
Transformers: 4.51.3
CUDA available: False


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Example usage
# Builds the assistant with its default model and starts the interactive chat loop.
# NOTE: the recorded run of this cell stopped with an OSError because the default
# model repository is gated; access to it and an authenticated session are needed first.
if __name__ == "__main__":
    assistant = CryptoWalletAssistant()
    assistant.chat_interface()

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1.
401 Client Error. (Request ID: Root=1-6841bf00-138478756352acb714c9bdbc;a5c68ddd-814b-4b5a-aa4b-5500fa08f845)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.