In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import joblib

# Load your data
df = pd.read_csv("expense.csv")

# Split into features and target
X = df["Expense"]
y = df["Category"]

# Create a pipeline: TF-IDF + Naive Bayes
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model
model.fit(X, y)

# Save model for later use
joblib.dump(model, "expense_categorizer.pkl")

['expense_categorizer.pkl']

In [2]:
# Load the saved model
model = joblib.load("expense_categorizer.pkl")

# New entries
# new_data = ["Zomato Pizza", "Uber School", "AutoRixa", "McDonald's", "Subway Sandwich"]
new_data = []
while True:
    data = input("Enter Title")
    if data == ' ':
        break
    new_data.append(data)
    
# Predict
predicted_categories = model.predict(new_data)

# View results
for title, category in zip(new_data, predicted_categories):
    print(f"{title} -> {category}")


Enter Title Auto Rixa
Enter Title 
Enter Title  


Auto Rixa -> Taxi
 -> Groceries


# Components of This Pipeline:
## TfidfVectorizer()

- Converts text (like "Zomato Pizza") into numerical features.
- It uses TF-IDF (Term Frequency-Inverse Document Frequency), which gives more importance to rare and meaningful words.

- Example:
"Zomato Pizza" becomes a vector like [0, 0.5, 0, 0.3, ...]

## MultinomialNB()

- This is the Naive Bayes classifier, a simple but effective algorithm for text classification.
- It learns which words are associated with which categories (e.g., "pizza" → "Food", "uber" → "Taxi").