In [None]:
import utils
import cleaning
import pandas as pd
import ast
import re


In [None]:
# Run the project's cleaning step before loading anything.
# NOTE(review): presumably this (re)generates the processed CSV read on the
# next line -- confirm in cleaning.py.
cleaning.recipes_ratings_merged_cleaned()
# Load the processed recipes-with-ratings table. The path is relative, so it
# resolves against the notebook's current working directory.
df = pd.read_csv("../data/Processed/recipes with ratings.csv")

In [None]:
# Peek at five random rows. The seed is fixed so that a fresh
# "Restart & Run All" shows the same rows every time.
df.sample(5, random_state=42)

In [None]:
# Check the first element of each column to determine if they are strings or lists
# (after a CSV round trip, list columns come back as text such as "['a', 'b']").
first_tags = df['tags'].iloc[0]
first_ingredients = df['ingredients'].iloc[0]
first_nutrition = df['nutrition'].iloc[0]
first_steps = df['steps'].iloc[0]

# Record the type of the first element of each column
types = {
    'tags': type(first_tags),
    'ingredients': type(first_ingredients),
    'nutrition': type(first_nutrition),
    'steps': type(first_steps)
}

# Display the types
for column, typ in types.items():
    print(f'The first element of the {column} column is of type: {typ}')


def _parse_list_cell(value):
    """Turn a stringified Python list back into a real list.

    Non-string values (already-parsed lists, NaN, ...) are returned unchanged,
    which makes the conversion safe to re-run and safe on missing values.

    Strings are parsed with ``ast.literal_eval`` so that elements containing
    commas or apostrophes survive intact and ``"[]"`` becomes ``[]`` rather
    than ``['']``. Text that is not a list/tuple literal falls back to the
    previous strip-and-split behaviour.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # Fallback: same naive parsing the notebook used before
    return value.strip('[]').replace("'", '').split(', ')


# Convert the free-text list columns from string to list, cell by cell.
# No "is the first row a string?" guard is needed: the helper leaves
# non-string cells untouched.
for text_column in ('tags', 'ingredients', 'steps'):
    df[text_column] = df[text_column].apply(_parse_list_cell)

# Convert the 'nutrition' column from string to list.
# NOTE(review): this column is intentionally left on the original split, so
# its elements stay strings exactly as before; it is consumed by
# cleaning.create_nutrition_df, whose expectations are not visible here.
# Presumably the values are plain numbers (no embedded commas or quotes) --
# confirm, then consider parsing to floats.
if isinstance(df['nutrition'].iloc[0], str):
    df['nutrition'] = df['nutrition'].apply(lambda x: x.strip('[]').replace("'", '').split(', '))

In [None]:
# Function to bucket a duration (in minutes) into a coarse label
def categorize_time(minutes):
    """Return a human-readable time bucket for a duration in minutes.

    Anything above 90 minutes is reported as 'More than 1h 30min'.
    Otherwise the value is floored to a multiple of 15; a floor of zero is
    reported as 'About 15 min'.
    """
    if minutes > 90:
        return 'More than 1h 30min'

    bucket = (minutes // 15) * 15
    if bucket == 0:
        # Durations under 15 minutes share the smallest label
        return 'About 15 min'
    return f'About {bucket} min'

# Apply the function to create a new column.
# NOTE(review): the column name 'cookking_time' is misspelled; it is kept
# unchanged here because other cells or files may already refer to it.
df['cookking_time'] = df['minutes'].apply(categorize_time)
# Preview only the input and the derived column instead of echoing the whole DataFrame
df[['minutes', 'cookking_time']].head()


In [None]:
# Build the nutrition table by handing the 'nutrition' and 'recipe_id' column
# names to the project helper (its output layout is defined in cleaning.py).
# NOTE(review): the identical call is repeated in a later cell right before the
# result is saved, so one of the two computations looks redundant.
nutrition_df = cleaning.create_nutrition_df(df, 'nutrition', 'recipe_id')


In [None]:
# Function to bucket an ingredient count into a labelled range
def categorize_ingredients(n_ingredients):
    """Return the range label for a number of ingredients.

    Each upper bound is inclusive: values up to 5 map to '0-5', up to 10 to
    '5-10', up to 20 to '10-20', up to 30 to '20-30' and up to 40 to '30-40'.
    Everything that matches none of these bounds is labelled '> 40'.
    """
    upper_bounds = (
        (5, '0-5'),
        (10, '5-10'),
        (20, '10-20'),
        (30, '20-30'),
        (40, '30-40'),
    )
    # Bounds are ascending, so the first one that fits is the right bucket
    for limit, label in upper_bounds:
        if n_ingredients <= limit:
            return label
    return '> 40'

# Label every recipe with its ingredient-count bucket in a new column
df['ingredient_category'] = df['n_ingredients'].map(categorize_ingredients)

In [None]:
def aggregate_unique_lists(df, column_name):
    """
    Aggregates all the lists from a specified column into a single list of unique elements.

    Missing values in the column are ignored. String cells are parsed with
    ``ast.literal_eval``; cells that are not valid Python literals are skipped.
    A cell that is (or parses to) a list contributes each of its elements;
    any other value is added as a single element.

    NOTE(review): a later cell calls ``utils.aggregate_unique_lists`` rather
    than this local definition, so this copy may be a leftover duplicate.

    Parameters:
    df (pd.DataFrame): DataFrame to process.
    column_name (str): Name of the column containing lists or string representations of lists.

    Returns:
    list: A single list containing all unique elements from the lists in the
    specified column, in order of first appearance.

    Raises:
    TypeError: If an element is unhashable (for example a nested list).
    """

    # A dict serves as an insertion-ordered set, so the output order is
    # reproducible from run to run (a plain set of strings is not).
    unique_elements = {}
    for item in df[column_name].dropna():
        # Convert string representation of list to actual list if necessary
        if isinstance(item, str):
            try:
                item = ast.literal_eval(item)
            except (ValueError, SyntaxError):
                # literal_eval raises SyntaxError for text that does not parse
                # at all and ValueError for non-literal expressions.
                continue  # Skip items that cannot be converted

        # Check if the item is a list and add its elements to the collection
        if isinstance(item, list):
            unique_elements.update(dict.fromkeys(item))
        else:
            unique_elements.setdefault(item, None)

    return list(unique_elements)



In [None]:
# m: the median (0.5 quantile) of the per-recipe rating counts.
# The weighted-rating function defined below uses it as a default argument.
m = df['number_of_ratings'].quantile(q=0.50)

# C: the mean of the per-recipe average ratings over the whole DataFrame.
# It is the second default argument of the weighted-rating function below.
C = df['average_rating'].mean()

# Weighted rating: blend a recipe's own average with the dataset-wide mean
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating for ``x``.

    ``x`` must support lookup by 'number_of_ratings' and 'average_rating'.
    The result is ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the number of
    ratings and ``R`` the average rating read from ``x``. The defaults for
    ``m`` and ``C`` are captured from the module-level values at definition
    time.
    """
    votes = x['number_of_ratings']
    own_average = x['average_rating']
    total_weight = votes + m
    # Share contributed by the recipe's own ratings ...
    own_part = votes / total_weight * own_average
    # ... and the share contributed by the overall mean
    prior_part = m / total_weight * C
    return own_part + prior_part

# Compute the weighted rating for every recipe in df.
# weighted_rating only indexes its argument by column name and does
# arithmetic, so passing the whole DataFrame evaluates the formula on entire
# columns at once instead of calling the function once per row via
# df.apply(axis=1).
df['weighted_rating'] = weighted_rating(df)

In [None]:
# Call the ingredients function from utils in order to get all ingredients in a list.
ingredients_list = utils.aggregate_unique_lists(df, 'ingredients')
# Save the ingredients into a csv file for later usage.
# A plain Python list has no .to_csv(), so it is wrapped in a Series first;
# if the helper already returns a pandas object it is written as-is.
ingredients_csv_path = "../data/Processed/ingredients_list.csv"
if hasattr(ingredients_list, 'to_csv'):
    ingredients_list.to_csv(ingredients_csv_path)
else:
    pd.Series(ingredients_list, name='ingredient').to_csv(ingredients_csv_path)

# Create the nutrition table by passing the 'nutrition' and 'recipe_id' column
# names to the project helper, then persist it for later use.
# NOTE(review): an earlier cell already assigns nutrition_df with the same
# call, so this recomputation may be unnecessary.
nutrition_df = cleaning.create_nutrition_df(df, 'nutrition', 'recipe_id')
# NOTE(review): to_csv is called without index=False; assuming nutrition_df is
# a pandas DataFrame, its index is written as an extra first column -- confirm
# that is intended.
nutrition_df.to_csv("../data/Processed/nutrition_df.csv")


In [None]:
# Count how many recipes have 0, 1, 2, ... restrictions. The goal is to slim
# the DataFrame down later by keeping only recipes with at least 2 restrictions.

restriction_counts = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}

# Iterate over the dataframe and count the number of restrictions for each recipe
for restrictions in df['restrictions']:
    # df was loaded from CSV, so a cell may be the *text* of a list
    # (e.g. "['vegan', 'low-carb']"). Calling len() on that text would count
    # characters, so parse it into a real list first.
    if isinstance(restrictions, str):
        try:
            restrictions = ast.literal_eval(restrictions)
        except (ValueError, SyntaxError):
            # Not a Python literal: fall back to splitting the bracketed text
            restrictions = [
                part
                for part in restrictions.strip('[]').replace("'", '').split(', ')
                if part
            ]
        # A bare string literal counts as a single restriction (none if empty)
        if isinstance(restrictions, str):
            restrictions = [restrictions] if restrictions else []

    # Missing values (NaN/None) and other scalars are counted as "no restrictions"
    if isinstance(restrictions, (list, tuple, set)):
        num_restrictions = len(restrictions)
    else:
        num_restrictions = 0

    # .get() also covers counts above 5, which are not pre-seeded in the dict
    restriction_counts[num_restrictions] = restriction_counts.get(num_restrictions, 0) + 1

# Display the counts
restriction_counts