In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet
import nltk
import re
import pickle
import os

# 📥 NLTK resources used by the WordNet synonym lookup further down.
nltk.download('wordnet')
nltk.download('omw-1.4')

# 📄 Load your CSV file (replace path if needed)
csv_path = "amibot.csv"  # Ensure it has columns: 'Field', 'Value'

# Try UTF-8 first and fall back to Windows-1252 on a decode failure.
# The encoding that actually worked is remembered so the status message
# below can report it truthfully.
csv_encoding = "utf-8"
try:
    df = pd.read_csv(csv_path, encoding=csv_encoding)
except UnicodeDecodeError:
    csv_encoding = "cp1252"  # fallback encoding
    df = pd.read_csv(csv_path, encoding=csv_encoding)

# Fail early with a readable message instead of a bare KeyError later on
# when the rows are read by column name.
missing_columns = {"Field", "Value"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{csv_path} is missing required column(s): {sorted(missing_columns)}"
    )

print(f"✅ Loaded CSV with encoding: {csv_encoding}; columns: {list(df.columns)}")

# ✅ Cleaning & synonym expansion
def correct_typos(text):
    """Return *text* trimmed, lower-cased and stripped of punctuation.

    Despite the name, no spelling correction happens here: the function
    only normalizes the string. Every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed.
    """
    lowered = text.strip().lower()
    return re.sub(r"[^\w\s]", "", lowered)

def expand_with_synonyms(text, max_synonyms=2):
    """Append up to *max_synonyms* WordNet synonyms after each word of *text*.

    Args:
        text: Whitespace-separated words (already cleaned by the caller).
        max_synonyms: Maximum number of synonyms appended per word.
            Values <= 0 disable expansion and only normalize whitespace.

    Returns:
        A single space-joined string in which every word is followed by
        its selected synonyms. Words without synonyms are kept unchanged.
    """
    words = text.split()
    if max_synonyms <= 0:
        return " ".join(words)

    expanded_words = []
    for word in words:
        # A dict is used as an insertion-ordered set: synonyms are kept in
        # the order WordNet yields them. Slicing a plain set of strings
        # would pick different synonyms on different interpreter runs
        # because string hashing is randomized per process.
        candidates = {}
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                name = lemma.name().replace("_", " ")
                # The word itself is not a useful expansion of itself.
                if name.lower() != word.lower():
                    candidates.setdefault(name, None)
            if len(candidates) >= max_synonyms:
                break  # enough collected; later synsets cannot change the pick
        chosen = list(candidates)[:max_synonyms]
        expanded_words.append(" ".join([word, *chosen]))
    return " ".join(expanded_words)

# ✅ Prepare field variants
# Each 'Field' cell holds one or more comma-separated phrasings that all
# map to the same 'Value' response.
field_map = {}  # not populated in this cell; kept so the name stays defined
variant_to_response = {}  # expanded query text -> response
query_list = []  # expanded query texts, in the order they will be embedded

for field, response in zip(df["Field"], df["Value"]):
    # Empty CSV cells arrive as NaN (a float): they have no .split() and
    # would make a meaningless response, so such rows are skipped.
    if pd.isna(field) or pd.isna(response):
        continue

    # str() guards against purely numeric cells that pandas parsed as numbers.
    for variant in str(field).split(","):
        cleaned = correct_typos(variant)  # trims, lower-cases, drops punctuation
        expanded = expand_with_synonyms(cleaned)
        if not expanded:
            # Blank or punctuation-only variant: nothing to embed.
            continue
        query_list.append(expanded)
        variant_to_response[expanded] = response

# ✅ Generate embeddings: one vector per entry of query_list
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(query_list)  # shape: [N, 384]

# ✅ Save everything into the amibot_data folder (created on first run)
os.makedirs("amibot_data", exist_ok=True)
np.save("amibot_data/field_embeddings.npy", embeddings)

# The query texts and the query -> response mapping are pickled side by
# side with the embedding matrix.
for pickle_path, payload in (
    ("amibot_data/query_list.pkl", query_list),
    ("amibot_data/variant_to_response.pkl", variant_to_response),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

print("✅ All precomputed and saved.")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 44: invalid start byte