In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Labelled examples: four sample descriptions for each of the seven categories.
train_data = pd.DataFrame({
    "Description": [
        # Shopping
        "amazon order",
        "flipkart shopping",
        "bought clothes online",
        "online shopping on myntra",
        # Groceries
        "grocery store",
        "supermarket purchase",
        "bought vegetables",
        "dairy milk and eggs",
        # Food & Dining
        "mcdonalds meal",
        "starbucks coffee",
        "lunch at cafe",
        "dinner at restaurant",
        # Transportation
        "uber ride",
        "bus ticket",
        "flight travel",
        "train journey",
        # Fuel
        "fuel station",
        "petrol pump",
        "diesel topup",
        "filled fuel at shell",
        # Entertainment
        "netflix subscription",
        "movie streaming",
        "watched movie online",
        "disney+ subscription",
        # Utilities
        "electricity bill",
        "wifi recharge",
        "internet service",
        "phone bill payment",
    ],
    # One label per description, repeated so it lines up with the groups above.
    "Category": [
        label
        for label in (
            "Shopping",
            "Groceries",
            "Food & Dining",
            "Transportation",
            "Fuel",
            "Entertainment",
            "Utilities",
        )
        for _ in range(4)
    ],
})

# Labels are the category column; features are TF-IDF vectors of the descriptions.
y_train = train_data['Category']
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['Description'])

# Nearest-neighbour model over the TF-IDF vectors, compared by cosine distance.
# fit() returns the fitted estimator, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=3, metric='cosine').fit(X_train, y_train)

# Nearest-neighbour category lookup with a similarity cut-off
def predict_category(description, threshold=0.4):
    """Return the category of the closest training example, or "others".

    The description is vectorized with the module-level ``vectorizer`` and
    looked up in the module-level ``knn`` model. Only the first neighbour
    returned by ``kneighbors`` is considered; its similarity is taken as
    ``1 - distance``.

    Args:
        description: Text of the expense to categorize.
        threshold: Minimum similarity the first neighbour must reach for its
            category to be returned.

    Returns:
        The ``y_train`` label of the first neighbour, or the string
        ``"others"`` when its similarity is below ``threshold``.
    """
    query = vectorizer.transform([description])
    neighbor_distances, neighbor_rows = knn.kneighbors(query, return_distance=True)

    # Column 0 is treated as the best match (presumably the nearest neighbour;
    # verify against the kneighbors ordering guarantee).
    top_similarity = 1 - neighbor_distances[0][0]

    return (
        "others"
        if top_similarity < threshold
        else y_train.iloc[neighbor_rows[0][0]]
    )

# Sample descriptions to run through the classifier
test_descriptions = [
    "ordered something from amazon",
    "recharged my phone",
    "watched something on disney+",
    "refilled petrol ",
    "bought eggs",
    # The last two have no counterpart in the training data; predict_category
    # reports them as 'others'.
    "invested in stocks",
    "paid car insurance",
]

# Show each description next to its predicted category
for desc in test_descriptions:
    predicted = predict_category(desc)
    print(f"{desc} => {predicted}")


ordered something from amazon => Shopping
recharged my phone => Utilities
watched something on disney+ => Entertainment
refilled petrol  => Fuel
bought eggs => Groceries
invested in stocks => others
paid car insurance => others
