In [None]:
# 📦 Install required packages (run once)
!pip install -q sentence-transformers rapidfuzz nltk

# 📥 Imports
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz
from nltk.corpus import wordnet
import nltk
import re
import torch
import pickle
import os

# 📌 Download NLTK corpus (WordNet is used for synonym expansion)
nltk.download('wordnet')
nltk.download('omw-1.4')

# 🧠 Load transformer model
print("📥 Loading transformer model...")
model = SentenceTransformer("paraphrase-MiniLM-L3-v2")

# 📄 Load your CSV file (replace path if needed)
csv_path = "amibot.csv"  # Ensure it has columns: 'Field', 'Value'
csv_encoding = 'utf-8'
try:
    df = pd.read_csv(csv_path, encoding=csv_encoding)
except UnicodeDecodeError:
    # Files exported from Excel on Windows are often cp1252 rather than UTF-8.
    csv_encoding = 'cp1252'
    df = pd.read_csv(csv_path, encoding=csv_encoding)

# Fail early with a readable message instead of a KeyError during preprocessing.
missing_columns = {"Field", "Value"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{csv_path} is missing required column(s): {sorted(missing_columns)}"
    )

print(f"✅ Loaded CSV with encoding: {csv_encoding}; columns: {list(df.columns)}")

# 📚 Preprocess data
# Each 'Field' cell holds one or more comma-separated phrasings of a question.
# Every phrasing becomes a lowercase lookup key that points at the row's 'Value'.
field_variants = []  # unique phrasings in first-seen order; index i matches row i of field_embeddings
field_map = {}       # phrasing -> answer

for field_str, value in zip(df["Field"], df["Value"]):
    # A blank CSV cell is read as NaN (a float), which has no .split().
    if pd.isna(field_str):
        continue
    for raw_variant in str(field_str).split(","):
        variant = raw_variant.strip().lower()
        if not variant:
            continue
        # Encode each phrasing only once. If several rows declare the same
        # phrasing, the answer of the last such row is kept.
        if variant not in field_map:
            field_variants.append(variant)
        field_map[variant] = value

if not field_variants:
    raise ValueError("No usable entries found in the 'Field' column; nothing to index.")

field_embeddings = model.encode(field_variants, convert_to_tensor=True)

# 🔧 Function: Correct typos (basic spell fix using regex for now)
def correct_typos(text):
    """Normalize raw user text before matching.

    Despite the name, no spelling correction happens here: the text is
    trimmed, lowercased, and every character that is neither a word
    character nor whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    normalized = text.strip().lower()
    return re.sub(r"[^\w\s]", "", normalized)

# 🔧 Function: Expand with synonyms using WordNet
def expand_with_synonyms(text, max_synonyms=2, synset_lookup=None):
    """Append a few synonyms after each word of ``text``.

    Synonyms are taken in the order the lookup returns them, so the same
    input always produces the same expansion. (Slicing a ``set`` would pick
    synonyms in hash order, which changes from one interpreter run to the
    next.) The word itself is never counted as one of its own synonyms.

    Args:
        text: Whitespace-separated, already normalized query text.
        max_synonyms: Maximum number of synonyms added per word.
        synset_lookup: Callable mapping a word to an iterable of synsets,
            each exposing ``lemmas()`` whose items expose ``name()``.
            Defaults to ``wordnet.synsets``.

    Returns:
        The expanded text: each word followed by up to ``max_synonyms``
        synonyms, all joined by single spaces.
    """
    if synset_lookup is None:
        synset_lookup = wordnet.synsets

    expanded_words = []
    for word in text.split():
        synonyms = []  # ordered and de-duplicated
        for syn in synset_lookup(word):
            for lemma in syn.lemmas():
                # Multi-word lemmas are stored with underscores.
                candidate = lemma.name().replace("_", " ")
                if candidate.lower() == word.lower() or candidate in synonyms:
                    continue
                synonyms.append(candidate)
        expanded_words.append(" ".join([word] + synonyms[:max_synonyms]))
    return " ".join(expanded_words)

# 🤖 Function: Get AmiBot response
def get_response(user_input, model, field_variants, field_embeddings, field_map, threshold=0.55, fuzz_threshold=55):
    """Answer a free-text question from the field/value knowledge base.

    The query is normalized, expanded with synonyms, embedded, and compared
    with every stored variant by cosine similarity. The closest variant is
    accepted if either its cosine similarity or its fuzzy string score
    reaches the corresponding threshold.

    Args:
        user_input: Raw question typed by the user.
        model: Encoder exposing ``encode(text, convert_to_tensor=True)``.
        field_variants: Known phrasings; index i corresponds to row i of
            ``field_embeddings``.
        field_embeddings: Embeddings of ``field_variants``.
        field_map: Mapping of phrasing -> answer.
        threshold: Minimum cosine similarity for a match (inclusive).
        fuzz_threshold: Minimum fuzzy score for a match (inclusive).

    Returns:
        A formatted reply: the matched answer with both scores, or a
        fallback message asking the user to rephrase.
    """
    no_match_reply = "\n🤖 Sorry, I’m not sure what you meant.\nPlease rephrase your question."

    original_input = user_input.strip()
    # Nothing to match on or against: taking argmax of an empty similarity
    # tensor would raise, so answer with the fallback instead.
    if not original_input or not field_variants:
        return no_match_reply

    corrected_input = correct_typos(original_input)
    if not corrected_input.strip():
        # The input consisted of punctuation only.
        return no_match_reply
    expanded_input = expand_with_synonyms(corrected_input)

    query_embedding = model.encode(expanded_input, convert_to_tensor=True)
    similarities = util.cos_sim(query_embedding, field_embeddings)[0]

    best_idx = int(similarities.argmax())
    best_score = float(similarities[best_idx])
    best_field = field_variants[best_idx]
    best_answer = field_map[best_field]

    # Compare normalized text on both sides so punctuation ("name?" vs
    # "name") does not stop tokens from matching.
    fuzzy_score = fuzz.token_set_ratio(corrected_input, correct_typos(best_field))

    if best_score >= threshold or fuzzy_score >= fuzz_threshold:
        return f"\n✅ Matched: '{best_field}'\n📐 Semantic: {best_score:.2f}, 🔤 Fuzzy: {fuzzy_score}\n👉 {best_answer}"
    return f"\n🤖 Sorry, I’m not sure what you meant.\n💡 Did you mean: '{best_field}'?\nPlease rephrase your question."

# 💾 Save necessary components for Flask app
save_dir = "amibot_data"
os.makedirs(save_dir, exist_ok=True)

# NOTE: pickle files can execute code when loaded; the serving app should only
# load these files from this trusted directory.
pickled_artifacts = {
    "df.pkl": df,
    "field_variants.pkl": field_variants,
    "field_map.pkl": field_map,
}
for artifact_name, artifact in pickled_artifacts.items():
    with open(os.path.join(save_dir, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

# Store the tensor on the CPU so torch.load() works on a host without the
# device (e.g. a GPU) the embeddings were computed on.
torch.save(field_embeddings.cpu(), os.path.join(save_dir, "field_embeddings.pt"))

print(f"💾 Saved df.pkl, field_variants.pkl, field_map.pkl, and field_embeddings.pt to '{save_dir}/'")

# 🧪 Test in Notebook (example)
# NOTE: this loop blocks on input(), so "Run All" pauses here until 'exit' is typed.
while True:
    try:
        user_input = input("\nAsk AmiBot (type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session without a traceback.
        break
    if not user_input:
        continue  # ignore blank lines instead of querying the model
    if user_input.lower() == "exit":
        break
    response = get_response(user_input, model, field_variants, field_embeddings, field_map)
    print(response)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


📥 Loading transformer model...
✅ Loaded CSV with encoding: Index(['Field', 'Value'], dtype='object')
💾 Saved df.pkl, field_variants.pkl, and field_embeddings.pt to 'amibot_data/'



Ask AmiBot (type 'exit' to quit):  name



✅ Matched: 'name'
📐 Semantic: 0.44, 🔤 Fuzzy: 100.0
👉 I’m Amritanshu Mishra — this bot speaks from my data, my words, my world.



Ask AmiBot (type 'exit' to quit):  wife



✅ Matched: 'wife'
📐 Semantic: 0.81, 🔤 Fuzzy: 100.0
👉 Committed for life — merged with Sneha Mishra in a lifelong partnership.



Ask AmiBot (type 'exit' to quit):  skills



✅ Matched: 'skills'
📐 Semantic: 0.55, 🔤 Fuzzy: 100.0
👉 Skilled in MERN Stack, React, Node.js, and advanced AI/ML systems, including Transformers.


In [1]:
# 1. Install & Import Packages
# !pip install -q sentence-transformers rapidfuzz nltk
# 2. Load the Transformer Model
# model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
# This model converts text into dense vector embeddings.

# Embeddings capture semantic meaning (not just exact words).

# It’s fast and lightweight.
# Internally:
# "father name" → [0.23, -0.14, ..., 0.01] (384-dim vector)
# "What’s your dad's name?" → similar vector
# 3. Read and Parse CSV
# Each row in the CSV maps Field(s) → Value (response).

# Comma-separated synonyms like:
# "father name, dad name, papa" are split and preprocessed.

# 4. Preprocess: Extract Field Variants
# for row in df:
#     variants = field_str.split(",")
#     for v in variants:
#         field_variants.append(v.lower().strip())
#         field_map[v.lower().strip()] = value
# field_variants = ["father name", "dad name", "papa", "your name", ...]
# field_map = {
#   "father name": "Anshul Sharma",
#   "papa": "Anshul Sharma",
#   ...
# }

# 5. Generate Embeddings
# field_embeddings = model.encode(field_variants, convert_to_tensor=True)
# Each field becomes a semantic vector:
# "father name" → tensor([0.12, -0.55, ..., 0.33])

# 6. User Input Flow
# def get_response(user_input, ...)
# Internally:

# a. Correct Typos
# correct_typos("Dad’s Name!") → "dads name"   (lowercases and strips punctuation only; no spelling correction)
# b. Expand with Synonyms
# expand_with_synonyms("dads name") → "dads name dad father"

# c. Encode Input
# query_embedding = model.encode(expanded_input)
# d. Computes similarity between query and each field:
# "What's your dad’s name?" vs ["father name", "your name", ...]
# → cosine scores like [0.87, 0.22, 0.04, ...]

# e. Fuzzy Score
# fuzz.token_set_ratio("what’s your dad’s name", "father name") → 80

# f. Threshold-Based Match
# if similarity >= 0.55 or fuzzy_score >= 55:
#     return correct response
# else:
#     return "Sorry, I’m not sure..."

# 7. Save Artifacts for Flask

# pickle.dump(df, field_variants, field_map)
# torch.save(field_embeddings)
# Avoid recomputing embeddings when deploying.

# Use this in your Flask app without repeating preprocessing.


In [2]:
# | 🔢 Step | 🧩 Component             | 📝 Description                         | 🧠 Internal Operation                                                      | 🧪 Example                                            |
# | ------- | ------------------------ | -------------------------------------- | -------------------------------------------------------------------------- | ----------------------------------------------------- |
# | 1️⃣     | **Install Packages**     | Installs required libraries            | Downloads and sets up `sentence-transformers`, `rapidfuzz`, and `nltk`     | `pip install -q sentence-transformers rapidfuzz nltk` |
# | 2️⃣     | **Import Modules**       | Loads Python packages                  | Imports for NLP, embedding, fuzzy logic, and preprocessing                 | `import pandas as pd`, `import torch`, etc.           |
# | 3️⃣     | **Download WordNet**     | Enables synonym expansion              | Downloads NLTK corpora: `wordnet` & `omw-1.4`                              | `nltk.download('wordnet')`                            |
# | 4️⃣     | **Load CSV File**        | Loads personal Q\&A data               | Reads `amibot.csv` into a DataFrame with 'Field' and 'Value' columns       | CSV sample: `father name, Anshul Sharma`              |
# | 5️⃣     | **Parse Field Variants** | Splits Field into multiple query forms | For each comma-separated variation in "Field", create mappings             | `"father name, dad name"` → 2 keys                    |
# | 6️⃣     | **Build Mapping Dicts**  | Store phrases and answers              | `field_variants = []` stores queries, `field_map = {}` maps to values      | `"dad name" → Anshul Sharma`                          |
# | 7️⃣     | **Generate Embeddings**  | Semantic vectors for field variants    | Converts all `field_variants` to dense vectors using `SentenceTransformer` | `"father name" → [0.23, -0.54, ..., 0.11]`            |
# | 8️⃣     | **Typo Correction**      | Pre-clean user input                   | Removes symbols and lowercases the input via regex                         | `"Dad’s name?" → "dads name"`                         |
# | 9️⃣     | **Synonym Expansion**    | Enhances semantic reach                | Adds 1–2 synonyms from WordNet to each word                                | `"dad"` → `"dad father papa"`                         |
# | 🔟      | **User Input Encoding**  | Transforms input to embedding          | Uses model to encode expanded user input                                   | `"Who is your dad?" → tensor`                         |
# | 1️⃣1️⃣  | **Cosine Similarity**    | Semantic comparison                    | Measures angle between input vector and all stored field vectors           | `cos_sim = 0.82 with "father name"`                   |
# | 1️⃣2️⃣  | **Fuzzy Matching**       | Textual string similarity              | Uses `fuzz.token_set_ratio` to score rough matches                         | `"Who is your dad?" vs "father name" → 76`            |
# | 1️⃣3️⃣  | **Response Selection**   | Final decision on best match           | Chooses highest score above thresholds: `cos_sim ≥ 0.55 or fuzzy ≥ 55`     | ✅ Match: `"father name" → "Anshul Sharma"`            |
# | 1️⃣4️⃣  | **Fallback Message**     | Handles low-match inputs               | Suggests closest match or asks user to rephrase                            | `"🤖 Sorry, I’m not sure..."`                         |
# | 1️⃣5️⃣  | **Save Artifacts**       | Save all required objects              | Dumps model outputs and mappings to `amibot_data/` folder for Flask        | `field_embeddings.pt`, `field_map.pkl`                |
# | 1️⃣6️⃣  | **Interactive Testing**  | Run in Jupyter loop                    | Continuously prompt for input, display match & answer                      | `input("Ask AmiBot: ")`                               |


In [3]:
# | 🔎 User Input             | 🎯 Cleaned Input        | 🧠 Expanded Input                   | 🔗 Best Match    | 📐 Cosine Sim | 🔤 Fuzzy Score | ✅ Final Response            |
# | ------------------------- | ----------------------- | ----------------------------------- | ---------------- | ------------- | -------------- | --------------------------- |
# | "What’s your dad’s name?" | `whats your dads name`  | `whats your dads name dad father`   | `father name`    | 0.87          | 78             | `Anshul Sharma`             |
# | "Tell me your birthday"   | `tell me your birthday` | `tell me your birthday natal birth` | `dob`            | 0.74          | 68             | `09 September 1996`         |
# | "Favourite dish?"         | `favourite dish`        | `favourite dish food meal`          | `favourite food` | 0.51          | 43             | `🤖 Sorry, please rephrase` |


In [4]:
# | 📁 File               | 📄 Format      | 🧠 Contents                           |
# | --------------------- | -------------- | ------------------------------------- |
# | `df.pkl`              | Pickle         | Original CSV DataFrame                |
# | `field_variants.pkl`  | Pickle         | All phrases extracted from 'Field'    |
# | `field_map.pkl`       | Pickle         | Maps each variant → Value             |
# | `field_embeddings.pt` | PyTorch Tensor | Vector representation of all variants |


In [5]:
# | Setting               | Description                                 | Recommendation                    |
# | --------------------- | ------------------------------------------- | --------------------------------- |
# | `threshold = 0.55`    | Minimum cosine similarity to consider match | Lower to 0.5 for broader matches  |
# | `fuzz_threshold = 55` | Minimum fuzzy ratio for textual match       | Keep above 50 to avoid false hits |


In [6]:
# ┌────────────────────────────────────────────┐
# │           START AMIBOT SYSTEM              │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔄 Load CSV File (amibot.csv)              │
# │ - Columns: 'Field', 'Value'                │
# │ - Example: "father name, dad name", "Anshul Sharma" │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔄 Parse and Preprocess Fields              │
# │ - Split comma-separated fields             │
# │ - Store in:                                │
# │     • field_variants (list of queries)     │
# │     • field_map (dict: query → answer)     │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ ⚙️ Encode All field_variants Using Model    │
# │ - SentenceTransformer(                     │
# │     "paraphrase-MiniLM-L3-v2")             │
# │ - Convert each query to semantic vector    │
# │ - Save as: field_embeddings (tensor list)  │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 💾 Save Artifacts to Disk                   │
# │ - df.pkl, field_map.pkl, field_variants.pkl│
# │ - field_embeddings.pt                      │
# └────────────────────────────────────────────┘

# ────────────────────────────────────────────────
# ▶️ SYSTEM IS READY — USER ENTERS A QUERY BELOW:
# ────────────────────────────────────────────────

#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🧍 USER INPUTS QUESTION (e.g., “Dad’s name?”) │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔧 Step 1: Preprocess Input                  │
# │ - Lowercase                                 │
# │ - Remove punctuation and extra whitespace   │
# │ → "Dad’s name?" → "dads name"               │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔧 Step 2: Synonym Expansion (WordNet)      │
# │ - For each word in input:                  │
# │     • Add top 1–2 synonyms                 │
# │ → "dads name" → "dads name father dad"     │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔧 Step 3: Encode Expanded Input            │
# │ - Use same SentenceTransformer model       │
# │ - Generate semantic embedding              │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔍 Step 4: Semantic Comparison              │
# │ - Cosine similarity between user input &   │
# │   each field_variant embedding             │
# │ → Get best_match, best_score               │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔤 Step 5: Fuzzy String Matching            │
# │ - Compare original input vs. best_match    │
# │ - Use RapidFuzz `token_set_ratio()`        │
# │ → Get fuzzy_score                          │
# └────────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔎 Step 6: Check Match Thresholds           │
# │ - If best_score ≥ 0.55  OR                 │
# │   fuzzy_score ≥ 55                         │
# │   → Proceed with best_match                │
# │ - Else → Go to fallback response           │
# └────────────────────────────────────────────┘
#       │                            │
#       ▼                            ▼
# ┌────────────────────┐    ┌────────────────────────────┐
# │ ✅ MATCH FOUND      │    │ ❌ NO CONFIDENT MATCH       │
# └────────────────────┘    └────────────────────────────┘
#       │                            │
#       ▼                            ▼
# ┌────────────────────────────────────────────┐
# │ 🔁 Step 7: Retrieve Answer from field_map   │
# │ - Lookup value using best_match            │
# │ - Example: "father name" → "Anshul Sharma" │
# └────────────────────────────────────────────┘
#       │                            │
#       ▼                            ▼
# ┌────────────────────────────┐   ┌────────────────────────────────────────┐
# │ 💬 Return Response:         │   │ 💬 Fallback:                           │
# │   ✅ Matched: ‘father name’ │   │   🤖 Sorry, I’m not sure what you meant│
# │   👉 Anshul Sharma          │   │   💡 Suggested closest: ‘father name’ │
# └────────────────────────────┘   └────────────────────────────────────────┘
#                   │
#                   ▼
# ┌────────────────────────────────────────────┐
# │ 🔁 Loop: Wait for Next User Query or Exit   │
# └────────────────────────────────────────────┘


In [1]:
# | 🔢 Index | 🧾 File               | 📦 Format         | 📌 Contents                                                                     | 🧠 Purpose                                                                                | 📂 Example                                                                                  |
# | -------- | --------------------- | ----------------- | ------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
# | 1️⃣      | `df.pkl`              | Pickled DataFrame | A full table mapping each **field** (e.g., `Name`) to its **response**          | Used to show all available knowledge, help with UI listing, manual lookup, or export      | `{"field": "Name", "value": "Amritanshu Mishra"}`                                           |
# | 2️⃣      | `field_map.pkl`       | Pickled dict      | Dictionary mapping every lowercased **variant** to its **response value**       | Enables the bot to map fuzzy or alternative inputs to a consistent, known response source | `{"father name": "Anshul Sharma", "papa": "Anshul Sharma"}` — both share one answer         |
# | 3️⃣      | `field_variants.pkl`  | Pickled list      | A list of **all accepted phrases** or variants asked by users                   | Used as the raw text input to create sentence embeddings or apply fuzzy matching          | `["your name", "what's your full name", "who are you", "tell me your name"]`                |
# | 4️⃣      | `field_embeddings.pt` | PyTorch tensor    | A tensor with **vectorized embeddings** (e.g., SentenceTransformer) of variants | Allows fast **cosine similarity search** when user input doesn’t exactly match a variant  | Embedding for "what's your full name" stored as a 384-dim vector to match to `"Name"` field |


In [2]:
from sentence_transformers import SentenceTransformer

# Export the same checkpoint that produced field_embeddings.pt. Queries must be
# encoded by the model that encoded the stored variants, otherwise the cosine
# scores between them are meaningless.
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
model.save('./local_model')


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'




In [1]:
from sentence_transformers import SentenceTransformer

# Fetch the checkpoint once and write a copy to disk.
model = SentenceTransformer(model_name_or_path='paraphrase-MiniLM-L3-v2')
model.save(path='./local_model')

# The saved copy can later be loaded locally with SentenceTransformer('./local_model').


KeyboardInterrupt: 