In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import string
import re

# Download NLTK resources
nltk.download('stopwords')

# Load dataset
df = pd.read_csv('Food_Recipe.csv')  # Replace with your file path
df = df.dropna(subset=['instructions', 'prep_time (in mins)', 'cook_time (in mins)'])
df['prep_time (in mins)'] = pd.to_numeric(df['prep_time (in mins)'], errors='coerce')
df['cook_time (in mins)'] = pd.to_numeric(df['cook_time (in mins)'], errors='coerce')
# Create difficulty labels based on total cooking time

df['total_time'] = df["prep_time (in mins)"] + df["cook_time (in mins)"]

# Define difficulty thresholds (adjust based on your domain knowledge)
bins = [0, 30, 60, float('inf')]
labels = ['easy', 'medium', 'hard']
df['difficulty'] = pd.cut(df['total_time'], bins=bins, labels=labels)

# Text preprocessing
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    words = text.split()
    # Remove stopwords and stem
    words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing
df['cleaned_instructions'] = df['instructions'].apply(preprocess_text)

# Feature extraction with TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_instructions']).toarray()
y = df['difficulty']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train Logistic Regression classifier
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Example prediction function
def predict_difficulty(recipe_text):
    cleaned_text = preprocess_text(recipe_text)
    text_vector = tfidf.transform([cleaned_text]).toarray()
    return model.predict(text_vector)[0]

# Example usage
sample_recipe = """
Chop vegetables and sauté in pan. Add spices and cook for 10 minutes.
Mix with cooked rice and serve hot.
"""
print(f"\nPredicted Difficulty: {predict_difficulty(sample_recipe)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.60

Classification Report:
              precision    recall  f1-score   support

        easy       0.75      0.25      0.38        36
        hard       0.50      0.13      0.21        30
      medium       0.60      0.95      0.73        78

    accuracy                           0.60       144
   macro avg       0.62      0.44      0.44       144
weighted avg       0.61      0.60      0.53       144


Predicted Difficulty: medium


In [4]:
import joblib
from pathlib import Path

# Create directory to save model assets
save_dir = Path("tfidf_model")
save_dir.mkdir(exist_ok=True)

# Save the TF-IDF vectorizer
joblib.dump(tfidf, save_dir / "tfidf_vectorizer.joblib")

# Save the trained model
joblib.dump(model, save_dir / "logistic_regression_model.joblib")

# Save preprocessing components
preprocess_config = {
    "bins": bins,
    "labels": labels,
    "stop_words": list(stop_words),  # Convert set to list for serialization
    "max_features": 5000,
    "punctuation": string.punctuation,
    "regex_pattern": r'\d+'
}

joblib.dump(preprocess_config, save_dir / "preprocess_config.joblib")

print("Model and preprocessing assets saved to:", save_dir)

Model and preprocessing assets saved to: tfidf_model


In [5]:
import shutil
from google.colab import files
# Path to your folder
folder_path = 'tfidf_model'

# Output zip file
output_path = 'difficulty_classifier_tfidf.zip'

# Compress the folder
shutil.make_archive(output_path.replace('.zip', ''), 'zip', folder_path)

files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import joblib
import re
import string
from pathlib import Path
from nltk.stem import PorterStemmer

class TFIDFPredictor:
    def __init__(self, model_dir="difficulty_classifier_tfidf"):
        self.model_dir = Path(model_dir)

        # Load saved components
        self.tfidf = joblib.load(self.model_dir / "tfidf_vectorizer.joblib")
        self.model = joblib.load(self.model_dir / "logistic_regression_model.joblib")
        self.config = joblib.load(self.model_dir / "preprocess_config.joblib")

        # Initialize preprocessing tools
        self.stop_words = set(self.config["stop_words"])
        self.ps = PorterStemmer()
        self.punctuation = self.config["punctuation"]
        self.regex_pattern = self.config["regex_pattern"]

    def preprocess_text(self, text):
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', self.punctuation))
        # Remove numbers
        text = re.sub(self.regex_pattern, '', text)
        # Tokenize and process
        words = text.split()
        words = [self.ps.stem(word) for word in words if word not in self.stop_words]
        return ' '.join(words)

    def predict(self, recipe_text):
        # Preprocess input
        cleaned_text = self.preprocess_text(recipe_text)
        # Transform using saved vectorizer
        text_vector = self.tfidf.transform([cleaned_text]).toarray()
        # Make prediction
        return self.model.predict(text_vector)[0]

# Example usage
if __name__ == "__main__":
    predictor = TFIDFPredictor()

    sample_recipe = """
    Chop vegetables and sauté in pan. Add spices and cook for 10 minutes.
    Mix with cooked rice and serve hot.
    """
    print(f"Predicted Difficulty: {predictor.predict(sample_recipe)}")

    test_recipe = "Prepare dough and let it rise for 2 hours. Bake at 350°F for 45 minutes."
    print(f"Predicted Difficulty: {predictor.predict(test_recipe)}")

Predicted Difficulty: medium
Predicted Difficulty: hard
