In [None]:
# !pip install sentence-transformers pinecone-client rank-bm25 transformers accelerate bitsandbytes

import pinecone
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Initialize components
class IdeaRAG:
    """Answer questions from a list of idea strings using hybrid retrieval.

    Retrieval combines a dense search (sentence embeddings stored in a
    Pinecone index) with a lexical BM25 search over the same ideas; the two
    rankings are merged with reciprocal rank fusion.  The retrieved ideas are
    then placed in a prompt for a FLAN-T5 model that generates the answer.

    Constructing an instance loads both models, connects to Pinecone and
    upserts every idea into the index.
    """

    EMBEDDER_NAME = 'all-MiniLM-L6-v2'
    GENERATOR_NAME = "google/flan-t5-small"
    # Number of vectors sent to Pinecone per upsert request.
    UPSERT_BATCH_SIZE = 100
    # Damping constant of reciprocal rank fusion (60 is the customary value).
    RRF_K = 60

    def __init__(self, ideas, *, api_key=None, environment=None,
                 index_name="your-index-name"):
        """Load the models, connect to Pinecone and index ``ideas``.

        Args:
            ideas: Iterable of idea strings to search over.
            api_key: Pinecone API key.  Defaults to the ``PINECONE_API_KEY``
                environment variable, then to the ``"YOUR_KEY"`` placeholder.
            environment: Pinecone environment (only used by legacy clients
                that expose ``pinecone.init``).  Defaults to the
                ``PINECONE_ENVIRONMENT`` environment variable, then to the
                ``"YOUR_ENV"`` placeholder.
            index_name: Name of an existing Pinecone index.

        Raises:
            ValueError: If ``ideas`` is empty.
        """
        import os

        self.ideas = list(ideas)
        if not self.ideas:
            raise ValueError("IdeaRAG needs at least one idea to index")

        if torch.cuda.is_available():
            self.device = "cuda"
        elif torch.backends.mps.is_available():
            self.device = "mps"
        else:
            self.device = "cpu"

        self.embedder = SentenceTransformer(self.EMBEDDER_NAME, device=self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.GENERATOR_NAME)
        if self.device == "cuda":
            # 4-bit loading goes through bitsandbytes, so it is only requested
            # when a CUDA device is present.  The deprecated ``load_in_4bit``
            # keyword is replaced by an explicit quantization config.
            from transformers import BitsAndBytesConfig

            self.generator = AutoModelForSeq2SeqLM.from_pretrained(
                self.GENERATOR_NAME,
                device_map="auto",
                quantization_config=BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                ),
            )
        else:
            # On MPS/CPU load the (small) model unquantized in its default
            # precision and move it to the selected device.
            self.generator = AutoModelForSeq2SeqLM.from_pretrained(
                self.GENERATOR_NAME
            ).to(self.device)

        # Initialize Pinecone
        if api_key is None:
            api_key = os.environ.get("PINECONE_API_KEY", "YOUR_KEY")
        if environment is None:
            environment = os.environ.get("PINECONE_ENVIRONMENT", "YOUR_ENV")
        if hasattr(pinecone, "Pinecone"):
            # Newer client releases expose a ``Pinecone`` class instead of
            # the module-level ``init`` function.
            self.index = pinecone.Pinecone(api_key=api_key).Index(index_name)
        else:
            pinecone.init(api_key=api_key, environment=environment)
            self.index = pinecone.Index(index_name)

        # Prepare hybrid search
        self._prepare_hybrid_search()

    def _prepare_hybrid_search(self):
        """Build the BM25 index and upsert idea embeddings into Pinecone.

        Each idea is stored under its list position (as a string id) with the
        original text kept in the ``"text"`` metadata field.
        """
        # Create BM25 corpus (whitespace tokenization, same as for queries)
        tokenized_ideas = [idea.split() for idea in self.ideas]
        self.bm25 = BM25Okapi(tokenized_ideas)

        # Upload ideas to Pinecone
        embeddings = self.embedder.encode(self.ideas)
        records = [
            (str(idx), emb.tolist(), {"text": idea})
            for idx, (emb, idea) in enumerate(zip(embeddings, self.ideas))
        ]
        # Send the records in bounded batches so a large corpus does not end
        # up in a single oversized request.
        for start in range(0, len(records), self.UPSERT_BATCH_SIZE):
            self.index.upsert(vectors=records[start:start + self.UPSERT_BATCH_SIZE])

    @staticmethod
    def _reciprocal_rank_fusion(rankings, top_k, k=60):
        """Merge several best-first rankings into one scored list.

        Every text receives ``1 / (k + rank)`` from each ranking it appears
        in (rank starting at 1); the contributions are summed.  Only ranks
        are used, so rankings whose raw scores live on different scales can
        be combined safely.

        Args:
            rankings: Iterable of sequences of texts, each ordered best first.
            top_k: Maximum number of results to return.
            k: Damping constant; larger values flatten rank differences.

        Returns:
            List of ``(text, fused_score)`` tuples, best first, without
            duplicate texts.  Empty when ``top_k`` is not positive.
        """
        if top_k <= 0:
            return []
        fused = {}
        for ranking in rankings:
            seen = set()
            rank = 0
            for text in ranking:
                if text in seen:
                    # A repeated text inside one ranking counts only once.
                    continue
                seen.add(text)
                rank += 1
                fused[text] = fused.get(text, 0.0) + 1.0 / (k + rank)
        ordered = sorted(fused.items(), key=lambda item: item[1], reverse=True)
        return ordered[:top_k]

    def _hybrid_search(self, query, top_k=5):
        """Return the ``top_k`` ideas most relevant to ``query``.

        Args:
            query: Question or search string.
            top_k: Maximum number of ideas to return.

        Returns:
            List of ``(text, score)`` tuples sorted by descending fused score.
        """
        if top_k <= 0:
            return []
        # Over-fetch from each retriever so fusion has candidates to choose from.
        candidates = top_k * 2

        # Vector search
        query_embedding = self.embedder.encode(query).tolist()
        vector_results = self.index.query(
            vector=query_embedding, top_k=candidates, include_metadata=True
        )
        vector_ranking = [
            match['metadata']['text'] for match in vector_results['matches']
        ]

        # BM25 search
        bm25_scores = self.bm25.get_scores(query.split())
        best_first = sorted(
            range(len(bm25_scores)),
            key=lambda i: bm25_scores[i],
            reverse=True,
        )
        # An idea scoring exactly 0 received no weight from any query term,
        # so it is left out instead of padding the lexical ranking.
        bm25_ranking = [
            self.ideas[i] for i in best_first[:candidates] if bm25_scores[i] != 0
        ]

        # Combine, deduplicate and sort
        return self._reciprocal_rank_fusion(
            [vector_ranking, bm25_ranking], top_k, k=self.RRF_K
        )

    def generate_response(self, query):
        """Generate an answer to ``query`` grounded in the retrieved ideas.

        Args:
            query: The user's question.

        Returns:
            The decoded model output as a string.
        """
        # Retrieve relevant ideas
        context = [text for text, _ in self._hybrid_search(query)]

        # Generate response
        input_text = f"Answer based on these ideas: {', '.join(context)}\n\nQuestion: {query}"
        # Send the inputs to wherever the model actually lives; with
        # ``device_map="auto"`` that need not equal ``self.device``.
        inputs = self.tokenizer(input_text, return_tensors="pt").to(self.generator.device)

        # Unpacking passes the attention mask along with the input ids.
        outputs = self.generator.generate(
            **inputs,
            max_new_tokens=150,
            num_beams=4,
            early_stopping=True
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Placeholder corpus: swap in the real idea texts to be searched.
ideas = ["1,2,3,4,5,6,7,8"]

# Build the retrieval + generation pipeline over the corpus.
rag = IdeaRAG(ideas)

# Example usage: ask a single question and show the generated answer.
question = "How can shorthand writing improve creative thinking?"
response = rag.generate_response(question)
print(response)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`