In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer
import pandas as pd
from datasets import load_dataset
import faiss
import numpy as np


In [3]:
retriever_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


In [16]:
# og embedding with just one document
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the retriever model
retriever_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Load data from the CSV file
df = pd.read_csv("./151_ideas_updated.csv", usecols=[0, 1, 2, 3, 4, 5])

# Ensure the correct column is used for passages
df.columns = df.columns.str.strip()  # Remove any extra spaces in column names
passages = df["Ideas"].dropna().tolist()  # Replace "Ideas" with the actual column name if different

# Encode the passages
passage_embeddings = retriever_model.encode(passages, convert_to_tensor=True)

# Print the shape of the embeddings
print("Encoded passages:", passage_embeddings.shape)


Encoded passages: torch.Size([150, 768])


In [17]:
# embedding with 2 documents
import pandas as pd

# Load the existing structured CSV
structured_df = pd.read_csv("./151_ideas_updated.csv", usecols=[0, 1, 2, 3, 4, 5])

# Load the less structured CSV
less_structured_df = pd.read_csv('./151qa2.csv', usecols=['text'])

# Normalize text to lowercase for consistent processing
less_structured_df['text'] = less_structured_df['text'].str.lower()

# Initialize lists to hold processed questions and answers
questions = []
answers = []

# Temporary storage for current question and answer
current_question = None
current_answer = None

# Iterate through the rows
for _, row in less_structured_df.iterrows():
    text = row['text'].strip()
    if text.startswith('q:'):
        # If there's a current question, finalize it without an answer
        if current_question:
            questions.append(current_question)
            answers.append(None)
        # Start a new question
        current_question = text[2:].strip()
        current_answer = None
    elif text.startswith('a:'):
        # If there's an answer, associate it with the current question
        current_answer = text[2:].strip()
        if current_question:  # Only add if there is a question
            questions.append(current_question)
            answers.append(current_answer)
            current_question = None
            current_answer = None
    else:
        # Ignore rows that are neither questions nor answers
        pass

# Handle any trailing question without an answer
if current_question:
    questions.append(current_question)
    answers.append(None)

# Combine questions and answers into a structured DataFrame
new_data = pd.DataFrame({
    'Question': questions,
    'Answer': answers
})

# Add placeholder columns to match the structure of the existing DataFrame
for col in structured_df.columns:
    if col not in new_data.columns:
        new_data[col] = None  # Fill with None or another placeholder value

# Append the new data to the existing structured DataFrame
updated_df = pd.concat([structured_df, new_data], ignore_index=True)

# Save the updated DataFrame to a new CSV
updated_df.to_csv("./updated_ideas.csv", index=False)

# Inspect the first few rows of the updated DataFrame
print(updated_df.head())
# Ensure the correct column is used for passages
df.columns = df.columns.str.strip()  # Remove any extra spaces in column names
passages = df["Ideas"].dropna().tolist()  # Replace "Ideas" with the actual column name if different

# Encode the passages
passage_embeddings = retriever_model.encode(passages, convert_to_tensor=True)


                                               Ideas    Theme a    Theme-b  \
0  1) Maximize the Beauty - fully channel the bea...        fun   rational   
1  2) Full Expression - it takes a lot of effort ...  inspiring  intuitive   
2  3) Expect Rising - this means our expectations...   rational   negative   
3  4) The Power of Pettiness - is the idea that p...      chill  inspiring   
4  5) Various meditation - I am a big fan of medi...      chill        fun   

     Theme-c Unnamed: 4                                         Unnamed: 5  \
0   positive   personal  theme ideas - rough, rational, intuitive, posi...   
1   positive   personal  top 5 idea themes- rational, positive, inspiri...   
2   negative        NaN                                                NaN   
3   positive        NaN                                                NaN   
4  inspiring        NaN                                                NaN   

  Question Answer  
0      NaN    NaN  
1      NaN    NaN  
2 

In [20]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def load_and_encode_passages(csv_path, text_columns, model_name="BAAI/bge-base-en-v1.5", batch_size=32):
    """
    Optimized function to load passages and create embeddings
    
    Parameters:
    - csv_path: Path to the CSV file
    - text_columns: List of column names to combine for embeddings
    - model_name: Sentence transformer model to use
    - batch_size: Number of passages to encode in each batch
    
    Returns:
    - passages: List of cleaned passages
    - passage_embeddings: Tensor of embeddings
    """
    # Load data with error handling
    try:
        df = pd.read_csv(csv_path)
        df.columns = df.columns.str.strip()
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return None, None
    
    # Combine specified text columns into a single passage
    df['combined_text'] = df[text_columns].fillna('').agg(' '.join, axis=1).str.strip()
    passages = df['combined_text'].dropna().tolist()
    
    # Check if passages are empty
    if not passages:
        print("No valid passages found in the specified columns.")
        return None, None
    
    # Initialize model with device optimization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    retriever_model = SentenceTransformer(model_name).to(device)
    
    # Encode passages in batches with progress tracking
    passage_embeddings = []
    for i in tqdm(range(0, len(passages), batch_size), desc="Encoding Passages"):
        batch = passages[i:i+batch_size]
        batch_embeddings = retriever_model.encode(
            batch, 
            convert_to_tensor=True, 
            device=device, 
            show_progress_bar=False
        )
        passage_embeddings.append(batch_embeddings)
    
    # Combine batch embeddings
    passage_embeddings = torch.cat(passage_embeddings, dim=0)
    
    # Print diagnostic information
    print(f"Model: {model_name}")
    print(f"Total Passages: {len(passages)}")
    print(f"Embedding Shape: {passage_embeddings.shape}")
    print(f"Device: {device}")
    
    return passages, passage_embeddings

# Usage
csv_path = "updated_ideas.csv"
passages, embeddings = load_and_encode_passages(
    csv_path, 
    text_columns=["Question", "Answer"],  # Specify the columns to combine
    model_name="BAAI/bge-base-en-v1.5", 
    batch_size=32
)


Encoding Passages: 100%|██████████| 9/9 [00:09<00:00,  1.10s/it]

Model: BAAI/bge-base-en-v1.5
Total Passages: 271
Embedding Shape: torch.Size([271, 768])
Device: cpu





In [None]:
# metrics - benchmarks etc  embedding with 2 documents
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def load_and_encode_passages(csv_path, text_column, model_name="BAAI/bge-base-en-v1.5", batch_size=32):
    """
    Optimized function to load passages and create embeddings
    
    Parameters:
    - csv_path: Path to the CSV file
    - text_column: Name of the column containing passages
    - model_name: Sentence transformer model to use
    - batch_size: Number of passages to encode in each batch
    
    Returns:
    - passages: List of cleaned passages
    - passage_embeddings: Tensor of embeddings
    """
    # Load data with error handling
    try:
        df = pd.read_csv(csv_path, usecols=[0, 1, 2, 3, 4, 5])
        df.columns = df.columns.str.strip()
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return None, None
    
    # Clean and filter passages
    passages = df[text_column].dropna().tolist()
    
    # Check if passages are empty
    if not passages:
        print("No valid passages found in the specified column.")
        return None, None
    
    # Initialize model with device optimization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    retriever_model = SentenceTransformer(model_name).to(device)
    
    # Encode passages in batches with progress tracking
    passage_embeddings = []
    for i in tqdm(range(0, len(passages), batch_size), desc="Encoding Passages"):
        batch = passages[i:i+batch_size]
        batch_embeddings = retriever_model.encode(
            batch, 
            convert_to_tensor=True, 
            device=device, 
            show_progress_bar=False
        )
        passage_embeddings.append(batch_embeddings)
    
    # Combine batch embeddings
    passage_embeddings = torch.cat(passage_embeddings, dim=0)
    
    # Print diagnostic information
    print(f"Model: {model_name}")
    print(f"Total Passages: {len(passages)}")
    print(f"Embedding Shape: {passage_embeddings.shape}")
    print(f"Device: {device}")
    
    return passages, passage_embeddings

# Usage
csv_path = "151_ideas_updated.csv"
passages, embeddings = load_and_encode_passages(
    csv_path, 
    text_column="Ideas"
)

Encoding Passages: 100%|██████████| 5/5 [00:31<00:00,  6.39s/it]


Model: BAAI/bge-base-en-v1.5
Total Passages: 150
Embedding Shape: torch.Size([150, 768])
Device: cpu


In [None]:
# metrics - benchmarks etc
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def load_and_encode_passages(csv_path, text_column, model_name="BAAI/bge-base-en-v1.5", batch_size=32):
    """
    Optimized function to load passages and create embeddings
    
    Parameters:
    - csv_path: Path to the CSV file
    - text_column: Name of the column containing passages
    - model_name: Sentence transformer model to use
    - batch_size: Number of passages to encode in each batch
    
    Returns:
    - passages: List of cleaned passages
    - passage_embeddings: Tensor of embeddings
    """
    # Load data with error handling
    try:
        df = pd.read_csv(csv_path, usecols=[0, 1, 2, 3, 4, 5])
        df.columns = df.columns.str.strip()
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return None, None
    
    # Clean and filter passages
    passages = df[text_column].dropna().tolist()
    
    # Check if passages are empty
    if not passages:
        print("No valid passages found in the specified column.")
        return None, None
    
    # Initialize model with device optimization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    retriever_model = SentenceTransformer(model_name).to(device)
    
    # Encode passages in batches with progress tracking
    passage_embeddings = []
    for i in tqdm(range(0, len(passages), batch_size), desc="Encoding Passages"):
        batch = passages[i:i+batch_size]
        batch_embeddings = retriever_model.encode(
            batch, 
            convert_to_tensor=True, 
            device=device, 
            show_progress_bar=False
        )
        passage_embeddings.append(batch_embeddings)
    
    # Combine batch embeddings
    passage_embeddings = torch.cat(passage_embeddings, dim=0)
    
    # Print diagnostic information
    print(f"Model: {model_name}")
    print(f"Total Passages: {len(passages)}")
    print(f"Embedding Shape: {passage_embeddings.shape}")
    print(f"Device: {device}")
    
    return passages, passage_embeddings

# Usage
csv_path = "151_ideas_updated.csv"
passages, embeddings = load_and_encode_passages(
    csv_path, 
    text_column="Ideas"
)

Encoding Passages: 100%|██████████| 5/5 [00:26<00:00,  5.39s/it]

Model: BAAI/bge-base-en-v1.5
Total Passages: 150
Embedding Shape: torch.Size([150, 768])
Device: cpu





In [1]:
import faiss
import numpy as np

# Ensure the embeddings are converted to a NumPy array
passage_embeddings_np = passage_embeddings.cpu().numpy()  # Convert tensor to NumPy array

# Create a FAISS index
index = faiss.IndexFlatL2(passage_embeddings_np.shape[1])  # Dimensionality of the embeddings

# Add embeddings to the index
index.add(passage_embeddings_np)

print("Number of embeddings in index:", index.ntotal)


NameError: name 'passage_embeddings' is not defined

In [21]:
def retrieve_and_generate_philosophical(query, top_k=5, min_tokens=700, max_length=2000):
    # Retrieve embeddings
    query_embedding = retriever_model.encode([query], convert_to_tensor=True).cpu().numpy()
    _, indices = index.search(query_embedding, top_k)
    retrieved_passages = [passages[i] for i in indices[0]]
    
    # Philosophical prompt construction
    philosophical_prompt = f"""Philosophical Meditation on: "{query}"

Contextual Fragments of Existence:
{' '.join(retrieved_passages)}

Phenomenological Exploration Directive:

Engage in a profound philosophical discourse that transcends mere information retrieval. Your response should:

- Interrogate the ontological foundations of the retrieved knowledge
- Deconstruct epistemological assumptions inherent in the passages
- Explore the liminal spaces between understanding and mystery
- Invoke philosophical traditions: existentialism, phenomenology, hermeneutics
- Reveal the metaphysical undercurrents beneath empirical observations
- Challenge the boundaries of human comprehension
- Illuminate the inherent paradoxes of knowledge and perception

Philosophical Meditation Guidelines:
- Embrace uncertainty as a mode of understanding
- Treat each fragment as a gateway to deeper existential inquiry
- Resist the temptation of definitive conclusions
- Navigate the terrain between rationality and the ineffable
- Unveil the poetic subtext of intellectual exploration

Synthesize a profound philosophical reflection that transforms retrieved information into a meditation on human experience, consciousness, and the sublime:"""
    
    # Generation with enhanced philosophical parameters
    input_ids = generator_tokenizer.encode(philosophical_prompt, return_tensors="pt", truncation=True)
    outputs = generator_model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_tokens,
        temperature=0.85,  # Higher creativity for philosophical exploration
        do_sample=True,
        top_k=120,  # Broader conceptual sampling
        top_p=0.96,
        no_repeat_ngram_size=3,
        early_stopping=False,
        length_penalty=1.7,  # Encourage expansive philosophical discourse
        repetition_penalty=1.3  # Reduce repetitive philosophical language
    )
    
    return generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
query = "the nature of human consciousness and technological existence"
philosophical_response = retrieve_and_generate_philosophical(query)
print(philosophical_response)

NameError: name 'index' is not defined

In [None]:
def retrieve_and_generate(query, top_k=5, min_tokens=500, max_length=1500):
    # Encode the query
    query_embedding = retriever_model.encode([query], convert_to_tensor=True).cpu().numpy()

    # Retrieve top_k passages
    _, indices = index.search(query_embedding, top_k)
    retrieved_passages = [passages[i] for i in indices[0]]

    # Combine retrieved passages
    input_text = f"{query} {' '.join(retrieved_passages)} Discuss failures, beauty, and the certainty of death with examples."

    # Generate response
    input_ids = generator_tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    outputs = generator_model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_tokens,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2,
        early_stopping=False,
        length_penalty=1.2  # Encourage longer responses
    )

    return generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
query = "list points that mention neech"
print(retrieve_and_generate(query, min_tokens=500, max_length=1000))




a feminist, but not enough to be feminist. I see half the world suffering from cynicism that is stunting civilization. gitty up! Update 2-19-19 Feminism is important to me because I've seen half of the population suffering... but if that’s not sufficient to have feminism, I don’t know what is. Prejudice is inherently dumb. If you see something say something, right? We see its extreme importance in all of our lives. We saw it».'....................)??.............................................? fa  ne -', just., 'MtB – slut..,.“””..: Patriarchy?...? (.t:..!...and d'' ::)? and! e.g. when i, are &?; ;. (M.W.B. more than one, and have the p.- t,.&. in mtb, this ’m:-;&& o. and how l. — and for...........–...no,, it,...to... and even, at yo...,........... h.s. the. [ed.;]:(...) the beauty.â») as tru.o;; or, we see:?!.» () in every.’: [)-. but. that. just r.a. or w. on this:,;)..C.self.T.A..com;-;...the.p;, as. all. people.i.P.E. of."B" v..S.M:E in our hearts.R.• and to. ".>. »" in: "D.f.m.