# EpiRecipes

## Importing Libraries

In [5]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Data Collection

In [6]:
# Relative location of the raw recipe CSV (note the space in the folder name).
file_path = "../raw file/epi_r.csv"

# Read the whole file into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

# Exploratory Data Analysis

### Data Overview

In [7]:
# Peek at the first two recipes to see the column layout.
df.head(n=2)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Structural summary of the raw frame: row count, column count, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(679), object(1)
memory usage: 104.0+ MB


In [9]:
# Summary statistics for the numeric columns. In the captured output the maxima of
# calories/protein/fat/sodium sit orders of magnitude above their 75th percentiles,
# and their counts are lower than the row count (missing values).
df.describe()

Unnamed: 0,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,30 days of groceries,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
count,20052.0,15935.0,15890.0,15869.0,15933.0,20052.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
mean,3.714467,6322.958,100.160793,346.8775,6225.975,0.000299,5e-05,0.000848,0.001346,0.000349,...,0.001247,0.026332,5e-05,0.000299,0.014861,0.00015,0.000349,0.001396,0.000948,0.022741
std,1.340829,359046.0,3840.318527,20456.11,333318.2,0.017296,0.007062,0.029105,0.036671,0.018681,...,0.035288,0.160123,0.007062,0.017296,0.121001,0.012231,0.018681,0.037343,0.030768,0.14908
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.75,198.0,3.0,7.0,80.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.375,331.0,8.0,17.0,294.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.375,586.0,27.0,33.0,711.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,30111220.0,236489.0,1722763.0,27675110.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Per-column dtypes: `title` is the only object column, everything else is float64.
df.dtypes

title          object
rating        float64
calories      float64
protein       float64
fat           float64
               ...   
cookbooks     float64
leftovers     float64
snack         float64
snack week    float64
turkey        float64
Length: 680, dtype: object

### Handle Missing & Duplicate Data

In [11]:
# Count missing values in each column.
# (To list only the heavily affected columns instead:
#  df.columns[df.isnull().sum()>=1000])
df.isna().sum()



title            0
rating           0
calories      4117
protein       4162
fat           4183
              ... 
cookbooks        0
leftovers        0
snack            0
snack week       0
turkey           0
Length: 680, dtype: int64

In [12]:
# Keep only the recipes with no missing value in any column.
df = df.dropna(axis=0, how="any")

In [13]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted)
df.duplicated(keep="first").sum()



1418

In [14]:

# Remove exact duplicate rows, keeping the first occurrence of each.
# Rebind `df` instead of mutating with inplace=True: the result is the same, but the
# cell reads as an explicit transformation and matches the `df = df.dropna()` /
# `df = df.drop(...)` style used by the other cleaning cells.
# The original index labels are kept, so the index has gaps after this step.
df = df.drop_duplicates()

In [15]:
# Re-check the frame after dropping missing and duplicate rows (row count and memory shrink).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14446 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(679), object(1)
memory usage: 75.1+ MB


In [16]:
# Confirm that no missing values remain after cleaning.
df.isna().sum()

title         0
rating        0
calories      0
protein       0
fat           0
             ..
cookbooks     0
leftovers     0
snack         0
snack week    0
turkey        0
Length: 680, dtype: int64

### Drop columns that contain only zeros

In [17]:
# Find the columns whose value is 0 in every remaining row; such a column
# carries no information for any recipe that survived cleaning.
all_zero_mask = (df == 0).all(axis=0)
zero_col = df.columns[all_zero_mask]
zero_col

Index(['camping', 'egypt', 'frankenrecipe', 'frittata', 'grand marnier',
       'iowa', 'jamaica', 'las vegas', 'mississippi', 'new hampshire', 'rosé',
       'waffle'],
      dtype='object')

In [18]:
# Discard the all-zero columns identified in `zero_col`.
df = df.drop(labels=zero_col, axis="columns")


### Saving the cleaned file

In [19]:
# Persist the cleaned frame. The directory was previously written as
# "cleaned file." (stray trailing dot), which is a different, most likely
# non-existent directory on POSIX filesystems and disagrees with the
# "../cleaned file/" directory used when saving the final recipe table.
# The row index is written as the first (unnamed) CSV column, as before.
df.to_csv("../cleaned file/cleaned_recipe.csv")

# Feature Engineering

In [20]:
# Build a compact table: one row per recipe holding its title, rating, nutrition
# values and a comma-separated string of every tag column set to 1.

# Step 1: Separate the descriptive columns from the tag columns.
# Tag columns are selected by name rather than by position: the earlier
# positional slice (iloc[:, 5:]) began at 'sodium', so a recipe whose sodium
# value was exactly 1 ended up with "sodium" listed among its features.
meta_cols = ['title', 'rating', 'calories', 'protein', 'fat', 'sodium']
titles = df['title']  # Recipe titles
ingredient_df = df[[col for col in df.columns if col not in meta_cols]]  # Tag/ingredient flags only

# Step 2: For each recipe, join the names of the tag columns whose value is 1
feature_group = ingredient_df.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)

# Step 3: Create the final DataFrame with titles, nutrition values and grouped tags
# (all pieces share df's index, so they align row by row)
final_df = pd.DataFrame({
    'Title': titles,
    'Rating': df['rating'],
    'Calories': df['calories'],
    'Protein': df['protein'],
    'Fat': df['fat'],
    'Sodium': df['sodium'],
    'features': feature_group
})

In [21]:
# Caption on stdout, then a rich preview of the first five engineered rows.
print("\nFinal DataFrame with Grouped Ingredients:")
final_df.head(n=5)


Final DataFrame with Grouped Ingredients:


Unnamed: 0,Title,Rating,Calories,Protein,Fat,Sodium,features
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,"apple, bean, cookie, fruit, kid-friendly, lent..."
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,"bake, bastille day, bon appétit, chill, dried ..."
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,"dairy, fennel, gourmet, new york, potato, soup..."
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,"bake, bon appétit, california, cheese, dairy, ..."
5,The Best Blts,4.375,948.0,19.0,79.0,1042.0,"bacon, basil, bon appétit, food processor, kid..."


In [22]:
# Prerequisite for the Excel export (`to_excel`) — run once if missing: %pip install openpyxl

In [23]:
# pip show openpyxl


### Save the Files

In [24]:
# Export the engineered table twice: as CSV (row index included as the first
# column) and as an Excel workbook (row index omitted).
final_df.to_csv(path_or_buf='../cleaned file/recipies.csv')

final_df.to_excel(excel_writer='../cleaned file/recipies.xlsx', index=False)