### Importing all Necessary Libraries 

In [None]:
# Importing all libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
from sqlalchemy import exc
from sqlalchemy import inspect
from sqlalchemy import create_engine

### Extracting the Dataset from CSV to a Pandas Dataframe followed by Cleaning and Transformation

In [None]:
#Loads the dataset from the specified file path and returns it as a pandas DataFrame.
    
def load_dataset(nutrients_csv):
    """Read the nutrients CSV into a pandas DataFrame.

    Args:
        nutrients_csv: Source handed straight to ``pd.read_csv`` (the
            notebook passes a file path).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(nutrients_csv)

#Cleans the specified DataFrame by removing any rows with missing values and dropping any columns that are not needed for analysis.

# Columns removed by clean_dataset because the analysis does not use them.
# Spellings such as 'Copper_mg)' or 'Choline_Tot_ (mg)' are kept exactly as they
# were in the original list -- presumably they mirror the CSV header; verify
# against the file before "fixing" them.
# Kept as a list because DataFrame.drop interprets a tuple as one single label.
COLUMNS_TO_DROP = [
    'index', 'NDB_No', 'Ash_(g)', 'Sugar_Tot_(g)', 'Copper_mg)',
    'Manganese_(mg)', 'Folate_DFE_(µg)', 'Vit_K_(µg)', 'Choline_Tot_ (mg)',
    'Retinol_(µg)', 'Alpha_Carot_(µg)', 'Beta_Carot_(µg)', 'Beta_Crypt_(µg)',
    'Lycopene_(µg)', 'Lut+Zea_ (µg)', 'Vit_D_µg', 'Vit_D_IU', 'FA_Sat_(g)',
    'FA_Mono_(g)', 'FA_Poly_(g)', 'Refuse_Pct', 'GmWt_1', 'GmWt_Desc1',
    'GmWt_2', 'GmWt_Desc2', 'Thiamin_(mg)', 'Selenium_(µg)', 'Niacin_(mg)',
    'Riboflavin_(mg)', 'Folic_Acid_(µg)', 'Vit_E_(mg)', 'Vit_A_RAE',
    'Food_Folate_(µg)', 'Vit_B12_(µg)', 'Folate_Tot_(µg)', 'Panto_Acid_mg)',
]

# Raw CSV column name -> short name used by the rest of the notebook.
COLUMN_RENAMES = {
    'Shrt_Desc': 'products',
    'Water_(g)': 'water',
    'Energ_Kcal': 'calories',
    'Protein_(g)': 'protein',
    'Lipid_Tot_(g)': 'fat',
    'Carbohydrt_(g)': 'carbohydrates',
    'Fiber_TD_(g)': 'fiber',
    'Calcium_(mg)': 'calcium',
    'Iron_(mg)': 'iron',
    'Magnesium_(mg)': 'magnesium',
    'Phosphorus_(mg)': 'phosphorus',
    'Potassium_(mg)': 'potassium',
    'Sodium_(mg)': 'sodium',
    'Vit_C_(mg)': 'vit_c',
    'Zinc_(mg)': 'zinc',
    'Vit_B6_(mg)': 'vitamin_b',
    'Vit_A_IU': 'vitamin_a',
    'Cholestrl_(mg)': 'cholestrl',
}

# Renamed columns that are added together, in this order, to form 'tot_minerals'.
MINERAL_COLUMNS = (
    'calcium', 'iron', 'magnesium', 'phosphorus', 'potassium', 'sodium', 'zinc',
)


def clean_dataset(df):
    """Return a cleaned copy of the raw nutrients DataFrame.

    The input frame is left untouched; every step works on a new object.

    Steps, in order:
      1. Drop every row that has a missing value in any column. This runs
         before the column drop, so a gap in a column that is discarded in
         step 2 still removes the row.
      2. Drop the columns listed in ``COLUMNS_TO_DROP``.
      3. Rename the remaining columns according to ``COLUMN_RENAMES``.
      4. Add ``tot_minerals``, the row-wise sum of ``MINERAL_COLUMNS``.

    Args:
        df: Raw nutrients DataFrame as loaded from the CSV.

    Returns:
        A new DataFrame with the cleaned rows and columns.

    Raises:
        KeyError: If a column in ``COLUMNS_TO_DROP`` or a mineral column
            needed for ``tot_minerals`` is not present.
    """
    cleaned = (
        df.dropna()
        .drop(columns=COLUMNS_TO_DROP)
        .rename(columns=COLUMN_RENAMES)
    )

    # Accumulate left to right so the result matches a plain a + b + c + ... chain.
    total = cleaned[MINERAL_COLUMNS[0]]
    for mineral in MINERAL_COLUMNS[1:]:
        total = total + cleaned[mineral]
    cleaned['tot_minerals'] = total
    return cleaned

#Saves the specified DataFrame to the specified file path.

def save_dataset(df, cleaned_dataset_csv):
    """Write ``df`` as CSV to ``cleaned_dataset_csv``, leaving out the index.

    Args:
        df: DataFrame to serialise.
        cleaned_dataset_csv: Destination handed to ``DataFrame.to_csv`` (the
            notebook passes a file path).

    Returns:
        None.
    """
    df.to_csv(path_or_buf=cleaned_dataset_csv, index=False)


# Run the CSV pipeline: read "nutrients.csv", clean it, and write the result to
# "cleaned_dataset.csv". Both paths are relative, so they resolve against the
# current working directory.

# Load the dataset
nutrients_df = load_dataset("nutrients.csv")

# Clean the dataset; nutrients_cleaned_df is the frame the following cells build on
nutrients_cleaned_df = clean_dataset(nutrients_df)

# Save the cleaned dataset to a new file
save_dataset(nutrients_cleaned_df, "cleaned_dataset.csv")


In [None]:
# Columns kept for the database load. Names and order match the columns declared
# in the CREATE TABLE statement of create_nutrients_table.
selected_col = ['products','calories', 'protein', 'fat', 'carbohydrates', 'fiber', 
                    'calcium', 'iron', 'vit_c', 'vitamin_a', 'potassium', 'tot_minerals']

# Create a new DataFrame with just the selected columns; .copy() makes it
# independent of nutrients_cleaned_df
nutrients_updated_df = nutrients_cleaned_df[selected_col].copy()

In [None]:
# Preview the first rows of the selected-columns DataFrame.
nutrients_updated_df.head()

In [None]:
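# NOTE: the database cells below use `conn`, `psycopg2`, `establish_postgres_connection`
# and the `postgres*` settings, none of which are defined in this notebook. Presumably
# they come from db_connections.ipynb; re-enable the line below if so.
#%run ./db_connections.ipynb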

### Loading the Cleaned Dataset into PostgreSQL

In [None]:
# Function to create a table in PostgreSQL database for nutrients data.

def create_nutrients_table(conn, table_name):
    """Create the nutrients table if it does not already exist.

    On success the statement is committed and a confirmation is printed. On a
    ``psycopg2.Error`` the error is printed and the transaction is rolled
    back; the exception is not re-raised.

    Args:
        conn: Open database connection providing ``cursor()``, ``commit()``
            and ``rollback()``.
        table_name: Name of the table to create. It is interpolated directly
            into the SQL text, so it must come from trusted configuration and
            never from user input.

    Returns:
        None.
    """
    # NOTE(review): psycopg2 is not imported in this notebook's import cell;
    # presumably it is brought into scope by db_connections.ipynb -- confirm.
    cur = None  # stays None if conn.cursor() itself fails, see the finally block
    try:
        cur = conn.cursor()
        # SQL query to create the table.
        cur.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (
                products VARCHAR(255),
                calories INT,
                protein float,
                fat float,
                carbohydrates float,
                fiber float,
                calcium float,
                iron float,
                vit_c float,
                vitamin_a float,
                potassium float,
                tot_minerals float
            )
        ''')
        conn.commit()
        print("Table created successfully!")
    except psycopg2.Error as e:
        print("Error creating table:", e)
        conn.rollback()
    finally:
        # Only close a cursor that was actually opened; otherwise the cleanup
        # would raise and hide the original error.
        if cur is not None:
            cur.close()
        
# Name of the target table. postgresNutrientstablename is not defined in this
# notebook -- presumably it comes from db_connections.ipynb; TODO confirm.
table_name = postgresNutrientstablename
# Create the table (if it is missing) on the existing connection `conn`, which is
# likewise expected to be defined outside this notebook.
create_nutrients_table(conn, table_name)

In [None]:
# Function to load a DataFrame into a PostgreSQL table.

def loadnutrients_data_to_postgresql(conn, df, table_name):
    """Replace the contents of ``table_name`` with the rows of ``df``.

    A fresh engine is obtained from ``establish_postgres_connection`` using the
    module-level ``postgres*`` settings. If the table exists, all of its rows
    are deleted through ``conn`` and that delete is committed before the
    insert runs through the engine, so a failed insert leaves the table empty.

    Only ``exc.SQLAlchemyError`` is caught (printed, not re-raised); any other
    exception propagates. The engine is disposed on every path.

    Args:
        conn: Open database connection used for the DELETE statement.
        df: DataFrame whose rows are appended to the table.
        table_name: Target table name. It is interpolated directly into the
            DELETE statement, so it must come from trusted configuration.

    Returns:
        None.
    """
    engine = None  # stays None if the engine could not be created
    try:
        engine = establish_postgres_connection(postgresUsername, postgresPassword, postgresHost, postgresPort, postgresDbname)
        inspector = inspect(engine)
        # Check if the table already exists, if so, delete all the rows.
        if table_name in inspector.get_table_names():
            with conn.cursor() as cursor:
                cursor.execute(f"DELETE FROM {table_name}")
                conn.commit()
        # Insert the frame passed by the caller, not a notebook-level global.
        df.to_sql(table_name, engine, if_exists='append', index=False)
        print("Data loaded to PostgreSQL successfully!")
    except exc.SQLAlchemyError as e:
        print("Error loading data to PostgreSQL:", e)
    finally:
        if engine is not None:
            engine.dispose()
        
# Load the selected-columns DataFrame into the nutrients table.
loadnutrients_data_to_postgresql(conn, nutrients_updated_df, table_name)

### Extracting the Cleaned Data from PostgreSQL to a DataFrame

In [None]:
# Function to extract data from PostgreSQL table and load it into a DataFrame

def extract_nutrients_data_from_postgresql(engine, table_name):
    """Read every row of ``table_name`` into a DataFrame.

    Args:
        engine: Connection object handed to ``pd.read_sql`` (the notebook
            passes the result of ``establish_postgres_connection``).
        table_name: Table to read; interpolated directly into the query text.

    Returns:
        The DataFrame with the table contents, or ``None`` when anything goes
        wrong (the error is printed, not raised).
    """
    try:
        select_all = f'SELECT * FROM {table_name}'
        frame = pd.read_sql(select_all, engine)
    except Exception as e:
        print("Error loading data from PostgreSQL to DataFrame:", e)
        return None
    else:
        print("Data loaded from PostgreSQL to DataFrame successfully!")
        return frame

# Obtain an engine from the postgres* settings and read the table back.
# establish_postgres_connection and the settings are not defined in this
# notebook -- presumably they come from db_connections.ipynb; TODO confirm.
engine = establish_postgres_connection(postgresUsername, postgresPassword, postgresHost, postgresPort, postgresDbname)
# nutrients_postgres_df is None if the extraction failed: the function prints
# the error instead of raising.
nutrients_postgres_df = extract_nutrients_data_from_postgresql(engine, table_name)

In [None]:
# Preview the first rows of the data read back from the database.
nutrients_postgres_df.head()

### Visualization 

In [None]:
# Question: how is calorie content distributed across the foods?
# Histogram of the 'calories' column, split into 20 bins.
plt.hist(x=nutrients_postgres_df['calories'], bins=20)
plt.title("Distribution of calorie content among different foods")
plt.xlabel("Calorie content")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Question: how do protein and fat relate across the foods?
# One point per row: protein on the x axis, fat on the y axis.
plt.scatter(x=nutrients_postgres_df['protein'], y=nutrients_postgres_df['fat'])
plt.title("Nutrient values of different foods")
plt.xlabel("Protein")
plt.ylabel("Fat")
plt.show()

In [None]:
# Which foods are the highest in fiber (top 10 values)?
# Only the highest-fiber foods are plotted; nothing in this cell looks at the lowest.
# NOTE(review): fiber and food_names hold the first 10 rows in table order (not
# sorted by fiber) and are not used by the chart below or by any other visible
# cell -- candidates for removal once confirmed unused.
fiber = nutrients_postgres_df['fiber'].head(10)
food_names = nutrients_postgres_df['products'].head(10)

# Sort the data by fiber content in descending order and keep the first 10 rows
sort_data = nutrients_postgres_df.sort_values(by='fiber', ascending=False).head(10)

# Create a horizontal bar chart: one bar per product, bar length = fiber value
plt.barh(sort_data['products'], sort_data['fiber'])

# Set the axis labels and title
plt.xlabel("Fiber")
plt.ylabel("Products")
plt.title("Top 10 Foods Highest in Fiber")

plt.show()

In [None]:
# Heat map of the pairwise correlation between the fiber, calcium and iron columns.
# Each cell is one column pair computed over all rows, not one food item.

nutri = ['fiber', 'calcium', 'iron']
subset_df = nutrients_postgres_df[nutri]

# Create a heatmap of the correlation matrix using seaborn; annot=True writes the
# correlation value inside each cell
sns.heatmap(subset_df.corr(), annot=True)

In [None]:
# Vitamin A vs vitamin C shares for the 10 foods with the highest vitamin_a value.
# Select those 10 foods by sorting on vitamin_a in descending order (no grouping
# takes place despite the variable name).
grouped = nutrients_postgres_df.sort_values('vitamin_a', ascending=False).head(10)

# Each food's vit_c as a percentage of the summed vit_c of these 10 foods
grouped['vit_c %'] = grouped['vit_c'] / grouped['vit_c'].sum() * 100

# Each food's vitamin_a as a percentage of the summed vitamin_a of these 10 foods
grouped['vitamin_a %'] = grouped['vitamin_a'] / grouped['vitamin_a'].sum() * 100

# Create one pie chart per food with two slices: its vit_c share and its vitamin_a share.
# NOTE(review): the two slices are percentages of different totals, so each pie
# compares how the food ranks within the top 10 on each vitamin, not the
# composition of the food itself -- confirm this is the intended reading.
for index, row in grouped.iterrows():
    labels = ['vit_c', 'vitamin_a']
    sizes = [row['vit_c %'], row['vitamin_a %']]
    colors = ['cornflowerblue', 'lightcoral']
    # autopct prints each slice's fraction of the pie with one decimal place
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    plt.title(row['products'])
    # Equal axis scaling keeps the pie circular
    plt.axis('equal')
    plt.show()