In [1]:
# Data manipulation and analysis
import pandas as pd

# Vectorization and similarity calculation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Location of the cleaned recipe dataset, resolved relative to the notebook's
# working directory.
DATA_PATH = 'Cleaned_Indian_Food_Dataset.csv'

# Read the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

# Preview the first rows (rendered as the cell output).
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Cleaned_Indian_Food_Dataset.csv'

In [None]:
# Report how many missing values each column contains.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# A recipe is only usable here if it has both an ingredient list and a name;
# discard rows lacking either, then renumber the index from 0 so it is
# contiguous again.
required_columns = ['Cleaned-Ingredients', 'TranslatedRecipeName']
df = df.dropna(subset=required_columns).reset_index(drop=True)


In [None]:
# Horizontal bar chart of how many recipes belong to each cuisine, with the
# most common cuisine at the top.
# matplotlib (plt) and seaborn (sns) are imported once in the notebook's first
# cell, so they are not re-imported here.

# Set Seaborn style for better aesthetics
sns.set(style="whitegrid")

# Order the bars from most to least frequent cuisine
cuisine_order = df['Cuisine'].value_counts().index

# "Set2" defines only 8 colours. Ask for exactly one entry per cuisine so the
# palette length matches the number of bars (colours repeat in a cycle once the
# 8 are used up) instead of handing seaborn a palette that is too short.
palette = sns.color_palette("Set2", n_colors=len(cuisine_order))

# Plot the distribution of cuisines in the dataset
plt.figure(figsize=(12, 8))
ax = sns.countplot(y=df['Cuisine'], order=cuisine_order, palette=palette)

# Add titles and labels
plt.title('Cuisine Distribution in Indian Food Dataset', fontsize=18, fontweight='bold', color='#4f4e4e')
plt.xlabel('Number of Recipes', fontsize=14)
plt.ylabel('Cuisine', fontsize=14)

# Remove all four spines (top and right by default, plus left and bottom)
# for a frameless look
sns.despine(left=True, bottom=True)

# Display counts on the bars
for container in ax.containers:
    ax.bar_label(container, label_type='edge', fontsize=12)

# Adjust the tick parameters for better readability
ax.tick_params(axis='y', labelsize=12)
ax.tick_params(axis='x', labelsize=12)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
# Histogram (20 bins, with a KDE overlay) of the 'Ingredient-count' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Ingredient-count'], bins=20, kde=True, ax=ax)

# Label the figure so it can be read on its own.
ax.set_title('Ingredient Count Distribution')
ax.set_xlabel('Number of Ingredients')
ax.set_ylabel('Number of Recipes')

plt.show()


In [None]:
# Text to vectorize: one ingredient string per recipe.
ingredient_text = df['Cleaned-Ingredients']

# TF-IDF vectorizer that ignores common English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the recipe-by-term weight matrix in one pass.
tfidf_matrix = tfidf.fit_transform(ingredient_text)

# Report the matrix dimensions.
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


In [None]:
# Pairwise cosine similarity of every recipe against every other recipe.
# With a single argument, scikit-learn compares the rows of the matrix with
# themselves, which is the same as passing the matrix twice.
cosine_sim = cosine_similarity(tfidf_matrix)

# Report the dimensions of the similarity matrix.
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")


In [None]:
# Count how often each individual ingredient appears across all recipes and
# plot the 20 most frequent ones.

# Concatenate every recipe's ingredient string into one comma-separated string.
# The separator must be ',' (the same delimiter used inside each recipe):
# joining with a space would glue the last ingredient of one recipe onto the
# first ingredient of the next and count that pair as a single bogus ingredient.
ingredients = ','.join(df['Cleaned-Ingredients'].tolist())

# Split into individual ingredient names. Surrounding whitespace is trimmed so
# that "salt" and " salt" are tallied together, and empty entries produced by
# stray or trailing commas are skipped.
ingredient_list = [name.strip() for name in ingredients.split(',') if name.strip()]

# Frequency of each ingredient, most common first
ingredient_freq = pd.Series(ingredient_list).value_counts()

# Plot the top 20 most common ingredients
fig, ax = plt.subplots(figsize=(10, 6))
ingredient_freq.head(20).plot(kind='bar', ax=ax)
ax.set_title('Top 20 Most Common Ingredients')
ax.set_xlabel('Ingredient')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
