In [None]:
# E - Alpha updated 
import pandas as pd
from collections import defaultdict
import re
import nltk
import json
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime


# Check current directory and files
print("Current Directory:", os.getcwd())
print("Files in Directory:", os.listdir())

# Attempt to load CSV
try:
    df = pd.read_csv('articles.csv', quotechar='"', lineterminator='\n', skip_blank_lines=True)
    print("CSV loaded successfully. First few rows:")
    print(df.head())
except FileNotFoundError:
    print("Error: 'articles.csv' not found. Check the file location.")
    exit()
except pd.errors.ParserError as e:
    print(f"Error parsing CSV: {e}")
    exit()
except Exception as e:
    print(f"Unexpected error: {e}")
    exit()

# Ensure required column exists
if 'articles' not in df.columns:
    print("Error: Column 'articles' not found in the CSV file.")
    exit()

# Proceed with processing...


nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Extract the original numbering (handling missing numbers correctly)
df['Original_Number'] = df['article'].str.extract(r'^(\d+)\)').fillna(-1).astype(int)

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

# Apply preprocessing (removing the number first)
df['articles_without_number'] = df['articles'].str.replace(r'^\d+\)\s*', '', regex=True)
df['Cleaned_articles'] = df['articles_without_number'].apply(preprocess_text)

# Create the hash table (handling missing numbers correctly)
hash_table = {}
for _, row in df.iterrows():
    key = row['Original_Number']
    hash_table[key] = {
        "original_content": row['articles'],
        "cleaned_content": row['Cleaned_articles']
    }

# Step 2: Creating an Inverted Index
inverted_index = defaultdict(list)
for doc_id, data in hash_table.items():
    if doc_id != -1:  # ignore the non numbered entries
        for word in data["cleaned_content"].split():
            inverted_index[word].append(doc_id)

# Example Query
query = "beauty"
query = preprocess_text(query)
query_words = query.split()

# Retrieval using Inverted Index
retrieved_docs = set()
for word in query_words:
    if word in inverted_index:
        retrieved_docs.update(inverted_index[word])

# Print the hash table (improved formatting, with original number)
print("\nHash Table:")
for key, value in hash_table.items():
    if key != -1:
        print(f"ID: {key}")
        print(f"  Original Content: {value['original_content']}")
        print(f"  Cleaned Content: {value['cleaned_content']}")
        print("-" * 20)

# Output Retrieved Documents (using original content and number)
print("\nDocuments containing the query words:")
for doc_id in retrieved_docs:
    if doc_id != -1:  # important to avoid printing non existent keys
        print(f"Original Number: {doc_id}, Content: {hash_table[doc_id]['original_content']}")

# Example of how to use the hash table for embedding creation (after pre-filtering)
docs_to_embed = [hash_table[doc_id]['cleaned_content'] for doc_id in retrieved_docs if doc_id != -1]
print(f"\nDocuments to be embedded: {docs_to_embed}")


# Define the directory for storing hash tables
output_dir = "tablisi"
os.makedirs(output_dir, exist_ok=True)  # Ensure the directory exists

# Create a unique filename using a timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # Format: YYYYMMDD_HHMMSS
filename = f"hash_table_{timestamp}.json"
output_path = os.path.join(output_dir, filename)

# Write the hash table to the JSON file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(hash_table, f, indent=4, ensure_ascii=False)

print(f"Hash table saved to: {output_path}")
print(f"\nHash table written to: {output_path}")  # confirmation message

In [1]:
# just hash module
import pandas as pd
import re
import nltk
import json
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime

# Download necessary NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize words
    return " ".join(words)

# Step 1: Load data from CSV
try:
    df = pd.read_csv('articles.csv', quotechar='"', lineterminator='\n', skip_blank_lines=True)
except FileNotFoundError:
    print("Error: 'articles.csv' not found.")
    exit()
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    exit()

# Ensure the required column exists
if 'articles' not in df.columns:
    print("Error: Column 'articles' not found in the CSV file.")
    exit()

# Extract original numbering if present (adjust logic if numbering isn't relevant)
df['Original_Number'] = df['articles'].str.extract(r'^(\d+)\)').fillna(-1).astype(int)

# Preprocess the 'articles' column
df['Cleaned_articles'] = df['articles'].apply(preprocess_text)

# Create the hash table
hash_table = {
    idx: {
        "original_content": row['articles'],
        "cleaned_content": row['Cleaned_articles']
    }
    for idx, row in df.iterrows()
}

# Save the hash table to a JSON file
output_dir = "tablisi"
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "hash_table.json")
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(hash_table, f, indent=4, ensure_ascii=False)

print(f"Hash table saved to: {output_path}")


ParserError: Error tokenizing data. C error: Expected 1 fields in line 8, saw 2



NameError: name 'df' is not defined

: 

In [None]:
# just query and inverted indexing
import json
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load the hash table from the JSON file
hash_table_path = "tablisi/hash_table_<timestamp>.json"  # Replace <timestamp> with the correct file
with open(hash_table_path, 'r', encoding='utf-8') as f:
    hash_table = json.load(f)

# Rebuild the inverted index
inverted_index = defaultdict(list)
for doc_id, data in hash_table.items():
    doc_id = int(doc_id)
    if doc_id != -1:  # Ignore non-numbered entries
        for word in data["cleaned_content"].split():
            inverted_index[word].append(doc_id)

# Preprocessing function for queries
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

# Example Query
query = "beauty"
query = preprocess_text(query)
query_words = query.split()

# Retrieval using Inverted Index
retrieved_docs = set()
for word in query_words:
    if word in inverted_index:
        retrieved_docs.update(inverted_index[word])

# Output Retrieved Documents
print("\nDocuments containing the query words:")
for doc_id in retrieved_docs:
    if doc_id != -1:  # Ignore invalid keys
        print(f"Original Number: {doc_id}, Content: {hash_table[str(doc_id)]['original_content']}")
