In [10]:
# Environment Setup
!pip install faiss-cpu langdetect python-docx googletrans==4.0.0-rc1
!pip install sentence-transformers transformers torch datasets

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [11]:
import os
import sys
import json
import re
import docx
from pathlib import Path
import numpy as np
import torch
import transformers
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)
from sentence_transformers import SentenceTransformer
import faiss
from langdetect import detect
from googletrans import Translator
from typing import List, Dict, Union, Optional

# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("\n=== Core Package Versions ===")
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

class DocumentProcessor:
    """Handles document text extraction (simplified version)"""
    
    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """Extract text from DOCX files"""
        try:
            doc = docx.Document(file_path)
            return "\n".join([para.text for para in doc.paragraphs])
        except Exception as e:
            print(f"Error reading DOCX: {str(e)}")
            return ""
    
    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        """Extract text from TXT files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT: {str(e)}")
            return ""
    
    @staticmethod
    def process_uploaded_file(file_path: str) -> str:
        """Process supported file formats"""
        ext = Path(file_path).suffix.lower()
        if ext == '.docx':
            return DocumentProcessor.extract_text_from_docx(file_path)
        elif ext == '.txt':
            return DocumentProcessor.extract_text_from_txt(file_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

class KnowledgeBase:
    """Manages the vector database and document storage"""
    
    def __init__(self):
        self.embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        self.index = None
        self.documents = []
        self.doc_embeddings = []
        
    def add_document(self, text: str, metadata: dict = None):
        """Add a document to the knowledge base"""
        if not text.strip():
            return
            
        chunks = self._chunk_text(text)
        
        for chunk in chunks:
            self.documents.append({
                "text": chunk,
                "metadata": metadata or {}
            })
            
    def _chunk_text(self, text: str, chunk_size: int = 512) -> List[str]:
        """Split text into manageable chunks"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0
        
        for word in words:
            if current_length + len(word) + 1 <= chunk_size:
                current_chunk.append(word)
                current_length += len(word) + 1
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = len(word)
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
            
        return chunks
    
    def build_index(self):
        """Create FAISS index from document embeddings"""
        if not self.documents:
            raise ValueError("No documents to index")
            
        texts = [doc["text"] for doc in self.documents]
        self.doc_embeddings = self.embedder.encode(texts, show_progress_bar=True)
        
        dimension = self.doc_embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(self.doc_embeddings)
        
    def search(self, query: str, k: int = 3) -> List[Dict]:
        """Search for relevant documents"""
        if self.index is None:
            self.build_index()
            
        query_embedding = self.embedder.encode([query])
        distances, indices = self.index.search(query_embedding, k)
        
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx >= 0:
                results.append({
                    "text": self.documents[idx]["text"],
                    "metadata": self.documents[idx]["metadata"],
                    "score": float(distance)
                })
        
        return results

class MultilingualSupport:
    """Handles language detection and translation"""
    
    def __init__(self):
        self.translator = Translator()
    
    def detect_language(self, text: str) -> str:
        """Detect language of input text"""
        try:
            return detect(text)
        except:
            return "en"  # Default to English
    
    def translate_to_english(self, text: str, src_lang: str = None) -> str:
        """Translate non-English text to English"""
        if not src_lang:
            src_lang = self.detect_language(text)
            
        if src_lang == 'en':
            return text
            
        try:
            translated = self.translator.translate(text, src=src_lang, dest='en')
            return translated.text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text
    
    def translate_from_english(self, text: str, dest_lang: str) -> str:
        """Translate English text to target language"""
        if dest_lang == 'en':
            return text
            
        try:
            translated = self.translator.translate(text, src='en', dest=dest_lang)
            return translated.text
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text

class CryptoWalletAssistant:
    """Main RAG system for crypto wallet Q&A"""
    
    def __init__(self, model_name: str = "gpt2"):
        self.knowledge_base = KnowledgeBase()
        self.multilingual = MultilingualSupport()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Load language model
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
        ).to(self.device)
        
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device == "cuda" else -1
        )
        
        # Add default crypto knowledge
        self._initialize_default_knowledge()
    
    def _initialize_default_knowledge(self):
        """Add default crypto knowledge to the knowledge base"""
        default_knowledge = [
            "A hardware wallet is a physical device that stores users' private keys offline.",
            "A software wallet is an application that stores private keys on internet-connected devices.",
            "Proof of Work (PoW) is a consensus mechanism that requires computational work to validate transactions.",
            "A private key is a secret number that allows cryptocurrency to be spent.",
            "A public key is derived from a private key and can be shared to receive cryptocurrency.",
            "Ledger is a popular hardware wallet brand that provides secure storage for crypto assets.",
            "Cold storage refers to keeping cryptocurrency completely offline for maximum security.",
            "A seed phrase (or recovery phrase) is a set of words that can regenerate all private keys in a wallet."
        ]
        
        for text in default_knowledge:
            self.knowledge_base.add_document(text, {"source": "default_knowledge"})
        
        self.knowledge_base.build_index()
    
    def add_documents(self, file_paths: List[str]):
        """Process and add uploaded documents to knowledge base"""
        for file_path in file_paths:
            try:
                text = DocumentProcessor.process_uploaded_file(file_path)
                if text:
                    filename = Path(file_path).name
                    self.knowledge_base.add_document(text, {"source": filename})
                    print(f"Processed: {filename}")
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
        
        self.knowledge_base.build_index()
    
    def generate_response(self, query: str, max_length: int = 300) -> str:
        """Generate answer using RAG approach"""
        query_lang = self.multilingual.detect_language(query)
        
        if query_lang != 'en':
            english_query = self.multilingual.translate_to_english(query, query_lang)
        else:
            english_query = query
        
        retrieved_docs = self.knowledge_base.search(english_query)
        context = "\n\n".join([doc["text"] for doc in retrieved_docs])
        
        prompt = f"""Answer the question based on the context below. Keep your response concise and technical.
If you don't know the answer, say you don't know.

Context: {context}

Question: {english_query}
Answer:"""
        
        response = self.generator(
            prompt,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )[0]['generated_text']
        
        answer = response.split("Answer:")[-1].strip()
        
        if query_lang != 'en':
            answer = self.multilingual.translate_from_english(answer, query_lang)
        
        return answer
    
    def chat_interface(self):
        """Simple command-line chat interface"""
        print("\nWelcome to Crypto Wallet Assistant!")
        print("Type 'quit' to exit or 'upload' to add documents.\n")
        
        while True:
            query = input("User: ")
            if query.lower() in ['quit', 'exit']:
                break
                
            if query.lower() == 'upload':
                file_paths = input("Enter file paths (comma separated): ").split(',')
                file_paths = [f.strip() for f in file_paths]
                self.add_documents(file_paths)
                print("Documents processed successfully!")
                continue
                
            response = self.generate_response(query)
            print(f"\nAssistant: {response}\n")


=== Core Package Versions ===
Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
PyTorch: 2.6.0+cu124
Transformers: 4.51.3
CUDA available: False


In [None]:
# Main execution
if __name__ == "__main__":
    assistant = CryptoWalletAssistant()
    assistant.chat_interface()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Welcome to Crypto Wallet Assistant!
Type 'quit' to exit or 'upload' to add documents.



User:  Explain PoW


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Assistant: Proof of Work is a consensus mechanism that requires computational work to validate transactions.

A seed phrase (or recovery phrase) is a set of words that can regenerate all private keys in a wallet.

A software wallet is an application that stores private keys on internet-



User:  what is a consensus mechanism


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Assistant: a consensus mechanism is a mechanism that is not only for a consensus mechanism but also for a mechanism that can be used to implement an entire system.

A consensus mechanism is an algorithm that is used to determine the exact rules of a system.

The consensus mechanism is a mechanism that can be used to implement a whole system.

Questions about a consensus mechanism:

What is a consensus mechanism?

What is a proof of work mechanism?

What is a proof of work mechanism (PoW)?

What is a proof of work mechanism (PoW) for a protocol?

What is a proof of work mechanism (PoW) for a security policy?

How can a consensus mechanism be used to implement an entire system?

How can a consensus mechanism be used to implement a whole system?

What is a consensus mechanism?

A consensus mechanism is a mechanism that can be used



User:  it can be used for what


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Assistant: It is a security measure



User:  is consensus mechanism a mechanism that can be used for secuirty measure?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Assistant: Yes, consensus mechanism is a mechanism to validate transactions.

Fact: consensus mechanism is an algorithm that can be used to validate transactions.



User:  what transactions are we talking about


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Assistant: we're talking about the proof of work.

Proof of work is a new type of proof of work, which means that a proof of work is not just a proof of work for a specific chain of transactions, but also a proof of work for a chain of transactions that does not have any previous proof of work.

The proof of work chain is the chain of transactions that contains all of the assets that are in the chain of transactions.

The proof of work chain is composed of two parts.

The first part is the chain of transactions that contains all of the assets in the chain.

The second part is the chain of transactions that does not contain the assets that were in the chain.

This is the second part of the proof of work chain.

The second part is the proof of work chain that contains all of the assets that were in the chain.

Proof of work is also known as a block chain.



User:  what chain are we talking about regarding PoW


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Assistant: a block chain is a cryptographic protocol (or protocol) that is built on top of a network of nodes, which is what we're talking about here.

A block chain is an encrypted block, which is the chain of data that is sent to a computer.

A block is an encrypted block is the block that is sent to the computer.

There are two main types of blocks:

BEGINNING: A block is created by a computer (or network) that is connected to the internet. This is called a "computer".

BEGINNING ENDING: A block is created by a computer (or network) that is connected to the internet. This is called a "computer".

It's not a "computer", it's a computer that runs on a computer.

The main purpose of a computer is to process data, which is the data that can be sent to

