### Importing Libraries

In [33]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

### Importing Data File 

In [43]:
food_data = pd.read_csv('foodstruct_nutritional_facts.csv') 
food_data.head() 

Unnamed: 0,Food Name,Category Name,Calcium,Calories,Carbs,Cholesterol,Copper,Fats,Fiber,Folate,...,Vitamin D,Vitamin E,Vitamin K,Omega-3 - ALA,Omega-6 - Eicosadienoic acid,Omega-6 - Gamma-linoleic acid,Omega-3 - Eicosatrienoic acid,Omega-6 - Dihomo-gamma-linoleic acid,Omega-6 - Linoleic acid,Omega-6 - Arachidonic acid
0,Acerola,Fruits,12.0,32.0,77.0,0.0,9.0,3.0,11.0,14.0,...,,,,,,,,,,
1,Apple,Fruits,6.0,52.0,14.0,0.0,3.0,17.0,24.0,3.0,...,0.0,18.0,22.0,,,,,,,
2,Apricot,Fruits,13.0,48.0,11.0,0.0,8.0,39.0,2.0,9.0,...,0.0,89.0,33.0,,,,,,,
3,Dried fruit,Fruits,55.0,241.0,63.0,0.0,34.0,51.0,73.0,10.0,...,0.0,43.0,31.0,,,,,,,
4,Avocado,Fruits,12.0,160.0,85.0,0.0,19.0,15.0,67.0,81.0,...,0.0,21.0,21.0,11.0,0.0,2.0,,,,


### EDA Part I: Cleaning Data 

In [44]:
# Checking all column names 
food_data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171 entries, 0 to 1170
Data columns (total 59 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Food Name                             1171 non-null   object 
 1   Category Name                         1171 non-null   object 
 2   Calcium                               1146 non-null   float64
 3   Calories                              1171 non-null   float64
 4   Carbs                                 1171 non-null   float64
 5   Cholesterol                           1116 non-null   float64
 6   Copper                                1092 non-null   float64
 7   Fats                                  1171 non-null   float64
 8   Fiber                                 1073 non-null   float64
 9   Folate                                1069 non-null   float64
 10  Iron                                  1151 non-null   float64
 11  Magnesium        

In [45]:
# Checking how many unique food groups fall under the categories column 
food_groups_counts = food_data['Category Name'].value_counts()
food_groups_counts

Category Name
Baked Products                     106
Meat                               100
Sweets                              96
Meals, Entrees, and Side Dishes     90
Vegetables                          90
Fruits                              88
Beverages                           84
Seafood                             83
Soups                               67
Grains                              66
Greens                              60
Dairy                               58
Oils and Sauces                     54
Spices                              46
Fast Foods                          45
Nuts                                27
Mushrooms                            8
Baby Foods                           3
Name: count, dtype: int64

According to the National Institute of Health (NIH) there are 7 major food groups... 
 - Vegetables
 - Fruits
 - Grains
 - Protein Foods 
 - Dairy 
 - Oils and Solid Fats
 - Added Sugars
 - Beverages 

The food_data Categories are not sorted this way so we need to reorganize how things are classified to align them with the NIH food groups. Below is an outline of how the Categories above will be adjusted to suit the NIH classifications. 

| NIH Food Group       | Data Categories                |
|----------------------|---------------------------------|
| Vegetables           | Vegetables, Greens, Mushrooms   |
| Fruits               | Fruits                          |
| Grains               | Grains                          |
| Protein Foods        | Meat, Seafood, Nuts             |
| Dairy                | Dairy                           |
| Oils and Solid Fats  | Oils and Sauces                 |
| Added Sugars         | Baked Products, Sweets          |
| Beverages            | Beverages                       |


Categories that are being filtered out: Baby Foods, Fast Foods, Spices, Soups, Meals, Entrees, and Side Dishes

[Source](https://www.nia.nih.gov/health/healthy-eating-nutrition-and-diet/healthy-eating-you-age-know-your-food-groups#grains)

In [46]:
# Clean and standardize the 'Category Name' column
food_data['Category Name'] = food_data['Category Name'].str.strip().str.title()

# Define the mapping from old categories to NIH Food Groups
category_mapping = {
    'Vegetables': 'Vegetables',
    'Greens': 'Vegetables',
    'Mushrooms': 'Vegetables',
    'Fruits': 'Fruits',
    'Grains': 'Grains',
    'Meat': 'Protein Foods',
    'Seafood': 'Protein Foods',
    'Nuts': 'Protein Foods',
    'Dairy': 'Dairy',
    'Oils': 'Oils and Solid Fats',
    'Sauces': 'Oils and Solid Fats',
    'Baked Products': 'Added Sugars',
    'Sweets': 'Added Sugars',
    'Beverages': 'Beverages',
    # Add other old categories as needed
}

# Map the old categories to the new NIH Food Groups
food_data['NIH_Food_Group'] = food_data['Category Name'].map(category_mapping)

# Identify and handle unmapped categories
unmapped = food_data[food_data['NIH_Food_Group'].isnull()]['Category Name'].unique()
if len(unmapped) > 0:
    print('Categories not mapped:')
    print(unmapped)
    # Decide how to handle these categories
    # For example, assign to 'Other'
    food_data['NIH_Food_Group'] = food_data['NIH_Food_Group'].fillna('Other')

# Verify the mapping
print('Mapped NIH Food Groups:')
print(food_data['NIH_Food_Group'].value_counts())

# Save the updated dataset
food_data.to_csv('food_data_nih.csv', index=False)

Categories not mapped:
['Spices' 'Oils And Sauces' 'Soups' 'Fast Foods'
 'Meals, Entrees, And Side Dishes' 'Baby Foods']
Mapped NIH Food Groups:
NIH_Food_Group
Other            305
Protein Foods    210
Added Sugars     202
Vegetables       158
Fruits            88
Beverages         84
Grains            66
Dairy             58
Name: count, dtype: int64


In [53]:
nih_food_data = pd.read_csv('food_data_nih.csv')
nih_food_data.head()

Unnamed: 0,Food Name,Category Name,Calcium,Calories,Carbs,Cholesterol,Copper,Fats,Fiber,Folate,...,Vitamin E,Vitamin K,Omega-3 - ALA,Omega-6 - Eicosadienoic acid,Omega-6 - Gamma-linoleic acid,Omega-3 - Eicosatrienoic acid,Omega-6 - Dihomo-gamma-linoleic acid,Omega-6 - Linoleic acid,Omega-6 - Arachidonic acid,NIH_Food_Group
0,Acerola,Fruits,12.0,32.0,77.0,0.0,9.0,3.0,11.0,14.0,...,,,,,,,,,,Fruits
1,Apple,Fruits,6.0,52.0,14.0,0.0,3.0,17.0,24.0,3.0,...,18.0,22.0,,,,,,,,Fruits
2,Apricot,Fruits,13.0,48.0,11.0,0.0,8.0,39.0,2.0,9.0,...,89.0,33.0,,,,,,,,Fruits
3,Dried fruit,Fruits,55.0,241.0,63.0,0.0,34.0,51.0,73.0,10.0,...,43.0,31.0,,,,,,,,Fruits
4,Avocado,Fruits,12.0,160.0,85.0,0.0,19.0,15.0,67.0,81.0,...,21.0,21.0,11.0,0.0,2.0,,,,,Fruits


In [52]:
# We are going to move the NIH Food Groups column to the 2nd position to the left
# Get the list of columns
columns = nih_food_data.columns.tolist()

# Remove 'NIH_Food_Group' from its current position
columns.remove('NIH_Food_Group')

# Insert 'NIH_Food_Group' at position 1 (after the first column)
columns.insert(1, 'NIH_Food_Group')

# Reorder the DataFrame
nih_food_data = nih_food_data[columns]
nih_food_data.head()

Unnamed: 0,Food Name,NIH_Food_Group,Category Name,Calcium,Calories,Carbs,Cholesterol,Copper,Fats,Fiber,...,Vitamin D,Vitamin E,Vitamin K,Omega-3 - ALA,Omega-6 - Eicosadienoic acid,Omega-6 - Gamma-linoleic acid,Omega-3 - Eicosatrienoic acid,Omega-6 - Dihomo-gamma-linoleic acid,Omega-6 - Linoleic acid,Omega-6 - Arachidonic acid
0,Acerola,Fruits,Fruits,12.0,32.0,77.0,0.0,9.0,3.0,11.0,...,,,,,,,,,,
1,Apple,Fruits,Fruits,6.0,52.0,14.0,0.0,3.0,17.0,24.0,...,0.0,18.0,22.0,,,,,,,
2,Apricot,Fruits,Fruits,13.0,48.0,11.0,0.0,8.0,39.0,2.0,...,0.0,89.0,33.0,,,,,,,
3,Dried fruit,Fruits,Fruits,55.0,241.0,63.0,0.0,34.0,51.0,73.0,...,0.0,43.0,31.0,,,,,,,
4,Avocado,Fruits,Fruits,12.0,160.0,85.0,0.0,19.0,15.0,67.0,...,0.0,21.0,21.0,11.0,0.0,2.0,,,,
