In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load nyemil data
df_nyemil = pd.read_csv('nyemil.csv', usecols=['fat', 'sugar', 'sodium'])

In [3]:
# Clean nyemil data
df_nyemil['fat'] = df_nyemil['fat'].str.replace(' g', '')
df_nyemil['sugar'] = df_nyemil['sugar'].str.replace(' g', '')
df_nyemil['sodium'] = df_nyemil['sodium'].str.replace(' mg', '')
df_nyemil = df_nyemil.apply(lambda x: x.astype(float))
df_nyemil['sodium'] = df_nyemil['sodium'] / 1000

In [4]:
# Load fatsecret data
df_fatsecret = pd.read_csv('fatsecret.csv', usecols=['fat', 'sugar', 'sodium'])

In [5]:
# Clean fatsecret data
df_fatsecret['fat'] = df_fatsecret['fat'].str.replace('g', '').str.replace(',', '.')
df_fatsecret['sugar'] = df_fatsecret['sugar'].str.replace('g', '').str.replace(',', '.')
df_fatsecret['sodium'] = df_fatsecret['sodium'].str.replace('mg', '').str.replace(',', '.')
df_fatsecret = df_fatsecret.apply(lambda x: x.astype(float))
df_fatsecret['sodium'] = df_fatsecret['sodium'] / 1000


In [7]:
def calculate_grade(row):
    nutrient_limits = {
        'fat': 67,
        'sugar': 50,
        'sodium': 5
    }

    def calculate_nutrient_grade(value, limit):
        if value <= limit * 0.1:
            return 'A'
        elif value <= limit * 0.25:
            return 'B'
        elif value <= limit * 0.5:
            return 'C'
        else:
            return 'D'

    grades = []
    for nutrient, limit in nutrient_limits.items():
        grade = calculate_nutrient_grade(row[nutrient], limit)
        grades.append(grade)

    final_grade = min(grades, key=lambda x: ['D', 'C', 'B', 'A'].index(x))
    return final_grade

In [8]:
df_nyemil['grade'] = df_nyemil.apply(calculate_grade, axis=1)
df_fatsecret['grade'] = df_fatsecret.apply(calculate_grade, axis=1)

In [9]:
# Merge the two datasets
df = pd.concat([df_nyemil, df_fatsecret])

In [10]:
df.head()

Unnamed: 0,fat,sugar,sodium,grade
0,9.3,0.0,0.775,B
1,14.0,8.0,1.07,B
2,3.5,19.0,0.03,C
3,21.9,4.9,0.3886,C
4,43.5,9.7,0.87,D


In [11]:
print(df['grade'].value_counts())

grade
B    3301
A    3291
C    1840
D     295
Name: count, dtype: int64


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8727 entries, 0 to 8443
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   fat     8727 non-null   float64
 1   sugar   8727 non-null   float64
 2   sodium  8727 non-null   float64
 3   grade   8727 non-null   object 
dtypes: float64(3), object(1)
memory usage: 340.9+ KB


In [16]:
# Save the merged dataset to a single CSV file
df.to_csv('clean_data.csv', index=False, header=True, encoding='utf-8')