In [3]:
# A 
import pandas as pd
from collections import defaultdict
import hashlib

# Step 1: Load data from CSV into a DataFrame
df = pd.read_csv('151_ideas_updated2.csv')

# Convert DataFrame to a list of dictionaries; "id" is the row position within
# the 'Ideas' column and "content" is the raw cell value.
documents = [{"id": idx, "content": idea} for idx, idea in enumerate(df['Ideas'])]

# Step 2: Creating an Inverted Index manually (token -> list of document ids).
# Tokens are the raw whitespace-separated pieces of the text: case and
# punctuation are kept, so "Beauty" and "beauty." are different keys from "beauty".
inverted_index = defaultdict(list)

for doc in documents:
    doc_id = doc["id"]
    content = doc["content"]
    if not isinstance(content, str):
        # pandas reads empty cells as NaN (a float), which has no .split(); skip.
        continue
    # dict.fromkeys() drops repeated tokens while keeping first-seen order, so a
    # document id is stored once per token instead of once per occurrence.
    for word in dict.fromkeys(content.split()):
        inverted_index[word].append(doc_id)

# Step 3: Using Hash Tables for Fast Retrieval
def hash_text(text):
    """Return the hexadecimal MD5 digest of ``text``.

    The string is encoded with ``str.encode()`` (UTF-8 by default) before hashing.
    """
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()

# Lookup table: document id -> document text
hash_table = {doc["id"]: doc["content"] for doc in documents}

# Example Query
query = "beauty"
query_words = query.split()

# Retrieval using Inverted Index: union of the posting lists of every query word.
# .get() returns an empty tuple for words that are not in the index and does not
# insert a new key into the defaultdict.
retrieved_docs = set()
for word in query_words:
    retrieved_docs.update(inverted_index.get(word, ()))

# Output Retrieved Documents (sorted so the listing order is stable between runs)
print("Documents containing the query words:")
for doc_id in sorted(retrieved_docs):
    print(f"Document ID: {doc_id}, Content: {hash_table[doc_id]}")

Documents containing the query words:
Document ID: 0, Content: 1) Maximize the Beauty - fully channel the beauty with in. Maybe ask what makes this moment beautiful? See if beauty can be increased in every situation. MtB also could be taken as a use of reason and also a disciplining of the senses to focus on beauty (i.e. all the pretty flowers, all the pretty birds). (Update 3-16-21 -This is one of the first 4 points created because they were on the top of my mind when I first started this project. Maximize the Beauty and Full Expression still are some of the concepts I am most influenced by, create an ethos around, consider to be primary to my concept of virtue, and aim to implement as much as possible.) 
Document ID: 143, Content: 145) this moment - forget everything else, block out other prevailing opinions on how you should act - just maximize the beauty of this moment - like it's the last moment before you die - or even more graphic b4 you are unjustly put b4 a firing squad but ha

In [None]:
#B

In [5]:
# c
import pandas as pd
import dspy
from collections import defaultdict
import hashlib

# Step 1: Load data from CSV into a DataFrame
df = pd.read_csv('151_ideas_updated2.csv')

# Convert DataFrame to a list of dictionaries ("id" = row position, "content" = cell value)
documents = [{"id": idx, "content": idea} for idx, idea in enumerate(df['Ideas'])]

# Step 2: Creating an Inverted Index in plain Python (token -> list of document ids).
# NOTE: this cell imports dspy, but nothing in the index construction uses it.
inverted_index = defaultdict(list)
for doc in documents:
    doc_id = doc["id"]
    content = doc["content"]
    if not isinstance(content, str):
        # Empty CSV cells arrive as NaN (a float) and cannot be split; skip them.
        continue
    # Iterate over distinct tokens only, so each posting list holds a document
    # id at most once (dict.fromkeys keeps first-seen order).
    for word in dict.fromkeys(content.split()):
        inverted_index[word].append(doc_id)

# Step 3: Using Hash Tables for Fast Retrieval
def hash_text(text):
    """Hash ``text`` with MD5 and return the digest as a hex string."""
    encoded = text.encode()  # str.encode() defaults to UTF-8
    return hashlib.md5(encoded).hexdigest()

# Direct lookup from document id to document text
hash_table = dict((doc["id"], doc["content"]) for doc in documents)

# Example Query
query = "quick brown fox"
query_words = query.split()

# Retrieval using Inverted Index: gather the ids listed under each query word
# that exists in the index; unknown words contribute nothing.
retrieved_docs = set()
for word in query_words:
    if word not in inverted_index:
        continue
    retrieved_docs.update(inverted_index[word])

# Output Retrieved Documents
print("Documents containing the query words:")
for doc_id in retrieved_docs:
    content = hash_table[doc_id]
    print(f"Document ID: {doc_id}, Content: {content}")


Documents containing the query words:


In [6]:
# d

import pandas as pd
from collections import defaultdict
import hashlib
import json

# Step 1: Load data from CSV into a DataFrame
df = pd.read_csv('151_ideas_updated2.csv')

# Convert DataFrame to a list of dictionaries ("id" = row position, "content" = cell value)
documents = [{"id": idx, "content": idea} for idx, idea in enumerate(df['Ideas'])]

# Step 2: Creating an Inverted Index manually (token -> list of document ids)
inverted_index = defaultdict(list)
for doc in documents:
    doc_id = doc["id"]
    content = doc["content"]
    if not isinstance(content, str):
        # Empty CSV cells arrive as NaN (a float) and cannot be split; skip them.
        continue
    # De-duplicate tokens per document (order preserved) so a posting list
    # never contains the same document id twice.
    for word in dict.fromkeys(content.split()):
        inverted_index[word].append(doc_id)

# Step 3: Using Hash Tables for Fast Retrieval
def hash_text(text):
    """Compute the MD5 checksum of ``text`` and return it in hexadecimal form."""
    hasher = hashlib.new("md5", text.encode())
    return hasher.hexdigest()

# Lookup table: document id -> document text
hash_table = {doc["id"]: doc["content"] for doc in documents}

# Write the hash table to a JSON file. The encoding is stated explicitly so the
# file does not depend on the platform's default text encoding. JSON object
# keys are always strings, so the integer ids are written as "0", "1", ...
with open('hash_table.json', 'w', encoding='utf-8') as file:
    json.dump(hash_table, file)

# Example Query
query = "quick brown fox"
query_words = query.split()

# Retrieval using Inverted Index: union of the posting lists of every query word.
# .get() yields an empty tuple for unknown words without adding keys to the index.
retrieved_docs = set()
for word in query_words:
    retrieved_docs.update(inverted_index.get(word, ()))

# Output Retrieved Documents (sorted so the listing order is stable between runs)
print("Documents containing the query words:")
for doc_id in sorted(retrieved_docs):
    print(f"Document ID: {doc_id}, Content: {hash_table[doc_id]}")


Documents containing the query words:


In [None]:
# E - Alpha 
import pandas as pd
from collections import defaultdict
import re
import nltk
import json
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime

# Make sure the NLTK data used for stop-word removal and lemmatization is present
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Step 1: Load data from CSV into a DataFrame
df = pd.read_csv('articles.csv')

# Extract the original numbering: the digits of a leading "N)" prefix.
# Rows whose text does not start with such a prefix get -1.
leading_number = df['Ideas'].str.extract(r'^(\d+)\)', expand=False)
df['Original_Number'] = leading_number.fillna(-1).astype(int)

# Preprocessing function
# NLTK resources shared by every preprocess_text() call; filled on first use so
# the stop-word list is read and the lemmatizer is built only once.
_PREPROCESS_CACHE = {}

def preprocess_text(text):
    """Normalize ``text`` for indexing.

    Steps: lower-case, delete every character that is neither a word character
    nor whitespace, drop English stop words, lemmatize each remaining word.

    Args:
        text: The raw text. Any non-string value (e.g. NaN) is accepted.

    Returns:
        The processed words joined by single spaces, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE["lemmatizer"] = WordNetLemmatizer()
    stop_words = _PREPROCESS_CACHE["stop_words"]
    lemmatizer = _PREPROCESS_CACHE["lemmatizer"]

    text = text.lower()
    # Punctuation is removed, not replaced by a space: "3-16-21" becomes "31621".
    text = re.sub(r'[^\w\s]', '', text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(words)

# Apply preprocessing (removing the number first): strip a leading "N)" prefix
# and any whitespace after it, then normalize what is left.
number_prefix = r'^\d+\)\s*'
df['Ideas_without_number'] = df['Ideas'].str.replace(number_prefix, '', regex=True)
df['Cleaned_Ideas'] = df['Ideas_without_number'].map(preprocess_text)

# Create the hash table keyed by the original idea number.
# Rows that share a key (all rows without a number use -1) overwrite one
# another, so only the last such row is kept.
hash_table = {
    record['Original_Number']: {
        "original_content": record['Ideas'],
        "cleaned_content": record['Cleaned_Ideas'],
    }
    for _, record in df.iterrows()
}

# Step 2: Creating an Inverted Index (cleaned token -> list of original idea numbers)
inverted_index = defaultdict(list)
for doc_id, data in hash_table.items():
    if doc_id == -1:  # ignore the non numbered entries
        continue
    # dict.fromkeys() removes repeated tokens while keeping first-seen order, so
    # an idea number is stored once per token instead of once per occurrence.
    for word in dict.fromkeys(data["cleaned_content"].split()):
        inverted_index[word].append(doc_id)

# Example Query (run through the same preprocessing as the documents)
query = "beauty"
query = preprocess_text(query)
query_words = query.split()

# Retrieval using Inverted Index: union of the posting lists of every query word.
# .get() yields an empty tuple for unknown words without adding keys to the index.
retrieved_docs = set()
for word in query_words:
    retrieved_docs.update(inverted_index.get(word, ()))

# Print the hash table (improved formatting, with original number)
print("\nHash Table:")
for key, value in hash_table.items():
    if key == -1:
        # the -1 entry stands for rows without a number and is not shown
        continue
    print(f"ID: {key}")
    print(f"  Original Content: {value['original_content']}")
    print(f"  Cleaned Content: {value['cleaned_content']}")
    print("-" * 20)

# Output Retrieved Documents (using original content and number)
print("\nDocuments containing the query words:")
for doc_id in retrieved_docs:
    if doc_id == -1:  # important to avoid printing non existent keys
        continue
    print(f"Original Number: {doc_id}, Content: {hash_table[doc_id]['original_content']}")

# Example of how to use the hash table for embedding creation (after pre-filtering):
# collect the cleaned text of every retrieved, numbered document.
docs_to_embed = []
for doc_id in retrieved_docs:
    if doc_id != -1:
        docs_to_embed.append(hash_table[doc_id]['cleaned_content'])
print(f"\nDocuments to be embedded: {docs_to_embed}")


# Define the directory for storing hash tables
output_dir = "tablisi"
os.makedirs(output_dir, exist_ok=True)  # Ensure the directory exists

# Build the filename from a timestamp (format: YYYYMMDD_HHMMSS). Names are
# distinct per second; two saves within the same second use the same file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"hash_table_{timestamp}.json"
output_path = os.path.join(output_dir, filename)

# Write the hash table to the JSON file. ensure_ascii=False keeps non-ASCII
# characters readable, which is why the file is opened as UTF-8.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(hash_table, f, indent=4, ensure_ascii=False)

# One confirmation line is enough; the path is the only information it carries.
print(f"Hash table saved to: {output_path}")


Hash Table:
ID: 1
  Original Content: 1) Maximize the Beauty - fully channel the beauty with in. Maybe ask what makes this moment beautiful? See if beauty can be increased in every situation. MtB also could be taken as a use of reason and also a disciplining of the senses to focus on beauty (i.e. all the pretty flowers, all the pretty birds). (Update 3-16-21 -This is one of the first 4 points created because they were on the top of my mind when I first started this project. Maximize the Beauty and Full Expression still are some of the concepts I am most influenced by, create an ethos around, consider to be primary to my concept of virtue, and aim to implement as much as possible.) 
  Cleaned Content: maximize beauty fully channel beauty maybe ask make moment beautiful see beauty increased every situation mtb also could taken use reason also disciplining sens focus beauty ie pretty flower pretty bird update 31621 one first 4 point created top mind first started project maximize beauty 

In [1]:
# f
import pandas as pd
from collections import defaultdict
import hashlib
import re

# Step 1: Read the ideas CSV into a DataFrame (the text lives in its 'Ideas' column)
df = pd.read_csv(filepath_or_buffer='151_ideas_updated2.csv')

# Preprocessing function
def preprocess_text(text):
    """Lower-case ``text`` and strip its punctuation.

    Non-string values (e.g. NaN from an empty cell) are mapped to "".
    """
    if isinstance(text, str):
        lowered = text.lower()  # Lowercasing for consistency
        # Delete every character that is neither a word character nor whitespace
        return re.sub(r'[^\w\s]', '', lowered)
    return ""

# Apply preprocessing to the 'Ideas' column
df['Cleaned_Ideas'] = df['Ideas'].apply(preprocess_text)

# Convert DataFrame to a list of dictionaries (using cleaned text);
# "id" is the row position within the column.
documents = [{"id": idx, "content": idea} for idx, idea in enumerate(df['Cleaned_Ideas'])]

# Step 2: Creating an Inverted Index (cleaned token -> list of document ids)
inverted_index = defaultdict(list)

for doc in documents:
    doc_id = doc["id"]
    # dict.fromkeys() removes repeated tokens while keeping first-seen order, so
    # a document id is stored once per token instead of once per occurrence.
    for word in dict.fromkeys(doc["content"].split()):
        inverted_index[word].append(doc_id)

# Step 3: Using Hash Tables for Fast Retrieval (no longer needed for retrieval now that we are using inverted index)
# def hash_text(text):  # Hash is not needed for retrieval, only for unique doc ID if you don't have one
#     return hashlib.md5(text.encode()).hexdigest()

# hash_table = {doc["id"]: doc["content"] for doc in documents}

# Example Query
query = "beauty"
query = preprocess_text(query) # Preprocess the query as well
query_words = query.split()

# Retrieval using Inverted Index: union of the posting lists of every query word.
# .get() yields an empty tuple for unknown words without adding keys to the index.
retrieved_docs = set()
for word in query_words:
    retrieved_docs.update(inverted_index.get(word, ()))

# Output Retrieved Documents (sorted so the listing order is stable between runs)
print("Documents containing the query words:")
for doc_id in sorted(retrieved_docs):
    # doc_id is a row position (it was produced by enumerate), so the original
    # text is fetched by position with .iloc rather than by index label.
    print(f"Document ID: {doc_id}, Content: {df['Ideas'].iloc[doc_id]}") # Access original content

Documents containing the query words:
Document ID: 0, Content: 1) Maximize the Beauty - fully channel the beauty with in. Maybe ask what makes this moment beautiful? See if beauty can be increased in every situation. MtB also could be taken as a use of reason and also a disciplining of the senses to focus on beauty (i.e. all the pretty flowers, all the pretty birds). (Update 3-16-21 -This is one of the first 4 points created because they were on the top of my mind when I first started this project. Maximize the Beauty and Full Expression still are some of the concepts I am most influenced by, create an ethos around, consider to be primary to my concept of virtue, and aim to implement as much as possible.) 
Document ID: 72, Content: 73) The Joy of Sacrifice - Sacrificing things is really just adding other things, but doesn’t it feel good to stop doing something so you can increase the love you have for someone in such a solid way. Real sacrifice leading to an increase in real love to th

In [None]:
# g

import pandas as pd
from collections import defaultdict
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK data used for stop-word removal and lemmatization is present
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Step 1: Read the ideas CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='151_ideas_updated2.csv')

# Preprocessing function (optimized)
# NLTK resources shared by every preprocess_text() call; filled on first use so
# the stop-word list is read and the lemmatizer is built only once.
_PREPROCESS_CACHE = {}

def preprocess_text(text):
    """Normalize ``text`` for indexing.

    Steps: lower-case, delete every character that is neither a word character
    nor whitespace, drop English stop words, lemmatize each remaining word.

    Args:
        text: The raw text. Any non-string value (e.g. NaN) is accepted.

    Returns:
        The processed words joined by single spaces, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE["lemmatizer"] = WordNetLemmatizer()
    stop_words = _PREPROCESS_CACHE["stop_words"]
    lemmatizer = _PREPROCESS_CACHE["lemmatizer"]

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(words)  # Return as a string

# Apply preprocessing to the 'Ideas' column
df['Cleaned_Ideas'] = df['Ideas'].apply(preprocess_text)

# Create the hash table (optimized for embedding):
# DataFrame index label -> original text and cleaned text of that row.
hash_table = {}
for idx, row in df.iterrows():
    hash_table[idx] = {"original_content": row['Ideas'], "cleaned_content": row['Cleaned_Ideas']}

# Step 2: Creating an Inverted Index (still useful for pre-filtering)
inverted_index = defaultdict(list)
for doc_id, data in hash_table.items():
    # dict.fromkeys() removes repeated tokens while keeping first-seen order, so
    # a document id is stored once per token instead of once per occurrence.
    for word in dict.fromkeys(data["cleaned_content"].split()):
        inverted_index[word].append(doc_id)

# Example Query (run through the same preprocessing as the documents)
query = "beauty"
query = preprocess_text(query)
query_words = query.split()

# Retrieval using Inverted Index (pre-filtering): union of the posting lists of
# every query word. .get() yields an empty tuple for unknown words without
# adding keys to the index.
retrieved_docs = set()
for word in query_words:
    retrieved_docs.update(inverted_index.get(word, ()))

# Print the hash table (for inspection)
print("Hash Table:")
for key, value in hash_table.items():
    # str() keeps the slice valid when the original cell is not text (e.g. NaN),
    # matching preprocess_text(), which also tolerates non-string values.
    preview = str(value['original_content'])[:50]
    print(f"ID: {key}, Original: {preview}..., Cleaned: {value['cleaned_content']}") #truncated for readability

# Sort once so the listing and the embedding input share a stable order
matching_ids = sorted(retrieved_docs)

# Output Retrieved Documents (using original content)
print("\nDocuments containing the query words:")
for doc_id in matching_ids:
    print(f"Document ID: {doc_id}, Content: {hash_table[doc_id]['original_content']}")

# Example of how to use the hash table for embedding creation (after pre-filtering)
docs_to_embed = [hash_table[doc_id]['cleaned_content'] for doc_id in matching_ids]
print(f"\nDocuments to be embedded: {docs_to_embed}")

Hash Table:
ID: 0, Original: 1) Maximize the Beauty - fully channel the beauty ..., Cleaned: 1 maximize beauty fully channel beauty maybe ask make moment beautiful see beauty increased every situation mtb also could taken use reason also disciplining sens focus beauty ie pretty flower pretty bird update 31621 one first 4 point created top mind first started project maximize beauty full expression still concept influenced create ethos around consider primary concept virtue aim implement much possible
ID: 1, Original: 2) Full Expression - it takes a lot of effort for ..., Cleaned: 2 full expression take lot effort one understand comfortable channel real expression hold people back shyness distraction inability focus want express
ID: 2, Original: 3) Expect Rising - this means our expectations are..., Cleaned: 3 expect rising mean expectation constantly rising kind line give em inch theyll take mile also related law diminishing return law diminishing return state productive process adding 