In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer


# load and look at data

In [2]:
data = pd.read_csv('data/preprocessed.csv')
data.head().T

  data = pd.read_csv('data/preprocessed.csv')


Unnamed: 0,0,1,2,3,4
code,00000000000000225,00000000000000225,00000000000026772226,00000000000026772226,0000000000100
food_groups_en_x,fruits and vegetables,vegetables,milk and dairy products,dairy desserts,fats and sauces
countries_en_x,france,france,france,france,france
product_name,jeunes pousses,jeunes pousses,Skyr,Skyr,moutarde au moût de raisin
countries_en_y,France,France,France,France,France
brands,endives,endives,Danone,Danone,courte paille
abbreviated_product_name,,,,,
generic_name,,,,,
quantity,,,480 g,480 g,100g
ecoscore_score,79.0,79.0,67.0,67.0,54.0


In [3]:
data.shape

(2540360, 23)

In [4]:
def non_na_rows(df, col):
    return df[~df[col].isna()][[col]].reset_index(drop=True)

In [5]:
non_na_rows(data, 'ecoscore_score')

Unnamed: 0,ecoscore_score
0,79.0
1,79.0
2,67.0
3,67.0
4,54.0
...,...
1685246,-16.0
1685247,-16.0
1685248,-16.0
1685249,-16.0


In [6]:
# Set the code as the index
data.set_index('code', inplace=True)

In [7]:
def clean_and_extract_column(df, column_name):
    # Clean the specified column
    df_cleaned = df[column_name].str.lower().str.strip().str.replace(r"\s+", " ").str.replace(r",\s+", ",").str.replace(r"[\*|\?|\.]", "")
    
    # Extract values into a new DataFrame with multi-index
    df_extracted = df_cleaned.str.extractall(r"(?P<{0}>[^,]+)".format(column_name))
    
    # Reset index and drop the automatically created column 'match'
    df_extracted.reset_index(inplace=True)
    df_extracted.drop('match', axis=1, inplace=True)
    
    # Remove duplicates
    df_extracted.drop_duplicates(inplace=True)
    
    # Set the original index of df as the index for df_extracted
    df_extracted.set_index('code', inplace=True)
    
    # Merge the extracted DataFrame with the original DataFrame
    df_merged = df_extracted.merge(df, right_index=True, left_index=True)
    
    return df_merged

In [8]:
# Example usage:
# Assuming 'data' is your DataFrame
df = clean_and_extract_column(data, 'countries_en')
df = clean_and_extract_column(df, 'food_groups_en')
df.head()

KeyError: 'countries_en'

# EDA 

In [None]:
# Adjust figure size
plt.figure(figsize=(14, 6))

# Group by 'food_groups_en_1' and calculate mean of 'nutriscore_score'
grouped_data = df.groupby(['food_groups_en_x'])['nutriscore_score'].mean().sort_values()

# Plot bar graph
bar_plot = grouped_data.plot(kind='bar')

# Add values above bars
for i in bar_plot.patches:
    bar_plot.text(i.get_x() + i.get_width() / 2, i.get_height() + 0.05, \
                  round(i.get_height(), 2), ha='center', va='bottom', rotation=60)

# Add labels and title
plt.xlabel('Food Groups')
plt.ylabel('Mean Nutrition Score')
plt.title('Mean Nutrition Score by Food Groups')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Show plot
plt.show()

In [None]:
# Adjust figure size
plt.figure(figsize=(14, 6))

# Group by 'food_groups_en_1' and calculate mean of 'nutriscore_score'
grouped_data = df.groupby(['food_groups_en_x'])['ecoscore_score'].mean().sort_values()

# Plot bar graph
bar_plot = grouped_data.plot(kind='bar')

# Add values above bars
for i in bar_plot.patches:
    bar_plot.text(i.get_x() + i.get_width() / 2, i.get_height() + 0.05, \
                  round(i.get_height(), 2), ha='center', va='bottom', rotation=60)

# Add labels and title
plt.xlabel('Food Groups')
plt.ylabel('Mean Eco Score')
plt.title('Mean Eco Score by Food Groups')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Show plot
plt.show()

# Make broader food groups

In [None]:
df.food_groups_en_x.unique()

In [None]:
# Define a dictionary to map specific food groups to broader categories
# (groups made to follow pnns_groups_1 from full data)
group_mapping = {
    'Unknown': ['unknown', 'na', ''],
    'Fruits and vegetables': ['fruits', 'vegetables', 'dried fruits','fruits and vegetables', 'legumes'],
    'Sugary snacks': ['sugary snacks', 'chocolate products', 'sweets', 'biscuits and cakes', 'pastries'],
    'Beverages': ['beverages', 'sweetened beverages', 'fruit juices', 'unsweetened beverages', 'waters and flavored waters', 'artificially sweetened beverages', 'fruit nectars', 'teas and herbal teas and coffees', 'alcoholic beverages'],
    'Composite foods': ['composite foods', 'sandwiches', 'pizza pies and quiches', 'one-dish meals', 'soups', 'appetizers'],
    'Fish meat eggs': ['fish and seafood', 'fatty fish', 'meat', 'meat other than poultry', 'processed meat', 'lean fish', 'eggs', 'offals', 'poultry', 'fish‚ meat‚ eggs', 'fish'],
    'Fat and sauces': ['fats and sauces', 'dressings and sauces', 'fats'],
    'Salty snacks': ['salty snacks', 'salty and fatty products', 'bread', 'nuts'],
    'Cereals and potatoes': ['cereals and potatoes', 'breakfast cereals', 'cereals', 'potatoes'],
    'Milk and dairy products': ['milk and dairy products', 'dairy desserts', 'milk and yogurt', 'cheese', 'ice cream', 'plant-based milk substitutes']
}

# Function to map food groups to their respective categories
def map_food_group(group):
    for category, food_groups in group_mapping.items():
        if group in food_groups:
            return category
        else: 
            with open("unknown_groups.txt", "w") as file:
                file.write(group + "\n")                
    return 'Unknown'  # If no match found, assign 'Unknown'

# Create the new column
df['food_group_category'] = df['food_groups_en_x'].apply(map_food_group)
df.food_group_category.unique()

In [None]:
df[df['food_group_category'] == 'Unknown']

In [None]:
# Adjust figure size
plt.figure(figsize=(14, 6))

# Group by 'food_groups_en_1' and calculate mean of 'nutriscore_score'
grouped_data_nutri = df.groupby(['food_group_category'])['nutriscore_score'].mean().sort_values()

# Plot bar graph
bar_plot = grouped_data_nutri.plot(kind='bar')

# Add values above bars
for i in bar_plot.patches:
    bar_plot.text(i.get_x() + i.get_width() / 2, i.get_height() + 0.05, \
                  round(i.get_height(), 2), ha='center', va='bottom', rotation=60)

# Add labels and title
plt.xlabel('Food Groups')
plt.ylabel('Mean Nutrition Score')
plt.title('Mean Nutrition Score by Broad Food Groups')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Show plot
plt.show()

In [None]:
# Adjust figure size
plt.figure(figsize=(14, 6))

# Group by 'food_groups_en_1' and calculate mean of 'nutriscore_score'
grouped_data_eco = df.groupby(['food_group_category'])['ecoscore_score'].mean().sort_values()

# Plot bar graph
bar_plot = grouped_data_eco.plot(kind='bar')

# Add values above bars
for i in bar_plot.patches:
    bar_plot.text(i.get_x() + i.get_width() / 2, i.get_height() + 0.05, \
                  round(i.get_height(), 2), ha='center', va='bottom', rotation=60)

# Add labels and title
plt.xlabel('Food Groups')
plt.ylabel('Mean Eco Score')
plt.title('Mean Eco Score by Broad Food Groups')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Show plot
plt.show()

# Spider plot

In [None]:
# Group by 'broad_food_group' and calculate mean of 'ecoscore_score' and 'nutriscore_score'
grouped_data_eco = df.groupby(['food_group_category'])['ecoscore_score'].mean().sort_values()
grouped_data_nutri = df.groupby(['food_group_category'])['nutriscore_score'].mean().sort_values()

# Combine the two series into one DataFrame
spider_df = pd.concat([grouped_data_eco, grouped_data_nutri], axis=1)
spider_df.columns = ['Mean Eco Score', 'Mean Nutrition Score']
spider_df

In [None]:
# Adjust figure size
plt.figure(figsize=(10, 6))

categories = spider_df.index.tolist()
values_eco = spider_df['Mean Eco Score'].values
values_nutri = spider_df['Mean Nutrition Score'].values

# Create a radar chart
ax = plt.subplot(111, polar=True)

# Calculate angles for each category
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()

# Plot eco scores
ax.plot(angles, values_eco, label='Eco Score', marker='o')
ax.fill(angles, values_eco, 'b', alpha=0.1)

# Plot nutrition scores
ax.plot(angles, values_nutri, label='Nutrition Score', marker='o')
ax.fill(angles, values_nutri, 'r', alpha=0.1)

# Add labels and title
plt.title('Mean Eco and Nutrition Scores by Broad Food Groups')
plt.legend(loc='upper right')
plt.xticks(angles, categories)

# Show plot
plt.show()

# Nutrition Score vs. Eco Score by Food Group Trends

In [None]:
# Plot scatterplot colored by food_group_category
plt.figure(figsize=(10, 6))
for category, group in df.groupby('food_group_category'):
    plt.scatter(group['nutriscore_score'], group['ecoscore_score'], label=category)

plt.xlabel('Nutriscore Score')
plt.ylabel('Ecoscore Score')
plt.title('Nutriscore Score vs Ecoscore Score by Food Group Category')
plt.legend(title='Food Group Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()

In [None]:
# add trendlines

df1 = df

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df1[['nutriscore_score', 'ecoscore_score']] = imputer.fit_transform(df1[['nutriscore_score', 'ecoscore_score']])


# Plot scatterplot colored by food_group_category
plt.figure(figsize=(10, 6))
for category, group in df1.groupby('food_group_category'):
    plt.scatter(group['nutriscore_score'], group['ecoscore_score'], label=category, alpha=0.2)
    


# Perform ridge regression and plot trend lines
for category, group in df1.groupby('food_group_category'):
    X = group[['nutriscore_score']].values
    y = group['ecoscore_score'].values
    
    # Perform ridge regression
    ridge = Ridge(alpha=0.1)  # Adjust alpha as needed
    ridge.fit(X, y)
    
    # Plot trend line
    x_values = np.linspace(min(group['nutriscore_score']), max(group['nutriscore_score']), 100)
    y_values = ridge.predict(x_values.reshape(-1, 1))
    plt.plot(x_values, y_values, label=f'{category} trendline')

plt.xlabel('Nutriscore Score')
plt.ylabel('Ecoscore Score')
plt.title('Nutriscore Score vs Ecoscore Score by Food Group Category')
plt.legend(title='Food Group Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()

In [None]:
#  Without points 

df1 = df

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df1[['nutriscore_score', 'ecoscore_score']] = imputer.fit_transform(df1[['nutriscore_score', 'ecoscore_score']])


# Plot scatterplot colored by food_group_category
plt.figure(figsize=(10, 6))
#for category, group in df1.groupby('food_group_category'):
 #   plt.scatter(group['nutriscore_score'], group['ecoscore_score'], label=category, alpha=0.2)
    


# Perform ridge regression and plot trend lines
for category, group in df1.groupby('food_group_category'):
    X = group[['nutriscore_score']].values
    y = group['ecoscore_score'].values
    
    # Perform ridge regression
    ridge = Ridge(alpha=0.1)  # Adjust alpha as needed
    ridge.fit(X, y)
    
    # Plot trend line
    x_values = np.linspace(min(group['nutriscore_score']), max(group['nutriscore_score']), 100)
    y_values = ridge.predict(x_values.reshape(-1, 1))
    plt.plot(x_values, y_values, label=f'{category}')

plt.xlabel('Nutriscore Score')
plt.ylabel('Ecoscore Score')
plt.title('Nutriscore Score vs Ecoscore Score by Food Group Category')
plt.legend(title='Food Group Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()

In [None]:
# Plot each food group category in its own subplot to make it more clear
fig, axs = plt.subplots(2, 5, figsize=(16, 12), sharex=True, sharey=True)
axs = axs.flatten()

for ax, (category, group) in zip(axs, df1.groupby('food_group_category')):
    ax.scatter(group['nutriscore_score'], group['ecoscore_score'], label=category, alpha=0.3)

    # Perform ridge regression and plot trend line
    X = group[['nutriscore_score']].values
    y = group['ecoscore_score'].values
    ridge = Ridge(alpha=0.1)  # Adjust alpha as needed
    ridge.fit(X, y)
    x_values = np.linspace(min(group['nutriscore_score']), max(group['nutriscore_score']), 100)
    y_values = ridge.predict(x_values.reshape(-1, 1))
    ax.plot(x_values, y_values, label=f'{category} trendline', color='red', linewidth=2)

    ax.set_title(category)
    ax.grid(True)

# Add x and y labels to the full figure
fig.text(0.5, -0.01, 'Nutriscore Score', ha='center', fontsize=14)
fig.text(-0.01, 0.5, 'Ecoscore Score', va='center', rotation='vertical', fontsize=14)

fig.suptitle('Nutriscore Score vs Ecoscore Score by Food Group Category', fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust subplot layout to prevent overlap
plt.show()

In [None]:
# Calculate median scores for each food category
median_ecoscore = df.groupby('food_group_category')['ecoscore_score'].median().sort_values(ascending=False).index
median_nutriscore = df.groupby('food_group_category')['nutriscore_score'].median().sort_values().index


# Plot boxplots for each food category based on ecoscore
plt.figure(figsize=(12, 6))
sns.boxplot(x='food_group_category', y='ecoscore_score', data=df, order=median_ecoscore)
plt.title('Boxplots of Ecoscore by Food Category')
plt.xticks(rotation=45)
plt.xlabel('Food Category')
plt.ylabel('Ecoscore Score')
plt.show()

# Plot boxplots for each food category based on nutriscore
plt.figure(figsize=(12, 6))
sns.boxplot(x='food_group_category', y='nutriscore_score', data=df, order=median_nutriscore)
plt.title('Boxplots of Nutriscore by Food Category')
plt.xticks(rotation=45)
plt.xlabel('Food Category')
plt.ylabel('Nutriscore Score')
plt.show()

In [None]:
# Note: in general, as nutriscore goes up, ecoscore goes down 
# with exception for beverages, composite foods, and fish/mean/eggs

In [None]:
# Looking at only grade a foods (only 284 in this subset that are both a in eco and nutri)

In [None]:
grade_a = df[(df['ecoscore_grade']=='a') & (df['nutriscore_grade']=='a')]
grade_a

In [None]:
# Plot each food group category in its own subplot to make it more clear
fig, axs = plt.subplots(2, 5, figsize=(16, 12), sharex=True, sharey=True)
axs = axs.flatten()

for ax, (category, group) in zip(axs, grade_a.groupby('food_group_category')):
    ax.scatter(group['nutriscore_score'], group['ecoscore_score'], label=category, alpha=0.3)

    # Perform ridge regression and plot trend line
    X = group[['nutriscore_score']].values
    y = group['ecoscore_score'].values
    ridge = Ridge(alpha=0.1)  # Adjust alpha as needed
    ridge.fit(X, y)
    x_values = np.linspace(min(group['nutriscore_score']), max(group['nutriscore_score']), 100)
    y_values = ridge.predict(x_values.reshape(-1, 1))
    ax.plot(x_values, y_values, label=f'{category} trendline', color='red', linewidth=2)

    ax.set_title(category)
    ax.grid(True)

# Add x and y labels to the full figure
fig.text(0.5, -0.01, 'Nutriscore Score', ha='center', fontsize=14)
fig.text(-0.01, 0.5, 'Ecoscore Score', va='center', rotation='vertical', fontsize=14)

fig.suptitle('Nutriscore Score vs Ecoscore Score by Food Group Category', fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust subplot layout to prevent overlap
plt.show()

In [None]:
# Calculate median scores for each food category
median_ecoscore = grade_a.groupby('food_group_category')['ecoscore_score'].median().sort_values(ascending=False).index
median_nutriscore = grade_a.groupby('food_group_category')['nutriscore_score'].median().sort_values().index

# Plot boxplots for each food category based on ecoscore
plt.figure(figsize=(12, 6))
sns.boxplot(x='food_group_category', y='ecoscore_score', data=grade_a, order=median_ecoscore)
plt.title('Boxplots of Ecoscore by Food Category')
plt.xticks(rotation=45)
plt.xlabel('Food Category')
plt.ylabel('Ecoscore Score')
plt.show()

# Plot boxplots for each food category based on nutriscore
plt.figure(figsize=(12, 6))
sns.boxplot(x='food_group_category', y='nutriscore_score', data=grade_a, order=median_nutriscore)
plt.title('Boxplots of Nutriscore by Food Category')
plt.xticks(rotation=45)
plt.xlabel('Food Category')
plt.ylabel('Nutriscore Score')
plt.show()

# Now look at top brands, products, countries

In [None]:
# Get the top occurencies function
def get_top_occurencies(df, cols, n=10):
    count = df.groupby(cols).size().sort_values(ascending=False)
    count_percent = round(count / df.shape[0] * 100, 2)
    return pd.DataFrame({'count': count, 'count %': count_percent}).head(n)

In [None]:
get_top_occurencies(df, 'countries_en_x', n=10)
### IMPORTANT TO NOTE: >90% of data is from france in this subset!!

In [None]:
get_top_occurencies(df, 'product_name', n=10)
# most of these are in french bc 90% of data is from france! Important to note

In [None]:
get_top_occurencies(df, 'brands', n=10)