Run the Database Connection File

In [None]:
# Execute db_connections.ipynb inside this kernel. The cells below use names that are never
# defined in this notebook (connect_to_mongodb, establish_postgres_connection, conn and the
# mongo*/postgres* settings); presumably they are created by that notebook — confirm.
%run ./db_connections.ipynb

Import the Necessary Packages and Libraries

In [None]:
import pymongo
import pandas as pd
import json
import matplotlib.pyplot as plt
from pymongo.errors import PyMongoError
from sqlalchemy import create_engine
import requests
import numpy as np
import psycopg2
import seaborn as sns
import sqlalchemy
from sqlalchemy import exc
from sqlalchemy import inspect

Load the Data from the JSON File into the MongoDB Collection

In [None]:
# Reload a MongoDB collection from a JSON file. If the collection already holds documents it is dropped first, so the data is always loaded into an empty collection and no duplicates are created.

def check_and_load_json_data(json_file, collection):
    """Replace the contents of a MongoDB collection with the records of a JSON file.

    Existing documents are dropped first so that re-running the cell does not
    create duplicates. Progress and error messages are printed; nothing is returned.

    Args:
        json_file: Path of the JSON file to load.
        collection: Collection object; ``count_documents``, ``drop`` and
            ``insert_many`` are called on it.
    """
    if collection.count_documents({}) > 0:
        collection.drop()
        print("Existing Collection dropped successfully.")
    print("Loading data from JSON file...")

    # load_json_data prints its own error message, so success is only
    # announced when the insert really happened.
    if load_json_data(json_file, collection):
        print("Data loaded successfully from JSON file.")


# This function is responsible for loading JSON data into a MongoDB collection
def load_json_data(json_file, collection):
    """Insert the records stored in ``json_file`` into ``collection``.

    Args:
        json_file: Path of a JSON file holding a list of documents (a single
            JSON object is accepted and treated as one document).
        collection: Collection object; ``insert_many`` is called on it.

    Returns:
        True when the records were inserted, False when the file could not be
        read, held no records, or the insert failed (a message is printed).
    """
    try:
        # Close the file before talking to the database.
        with open(json_file) as f:
            data = json.load(f)
        if isinstance(data, dict):
            data = [data]
        # insert_many rejects anything that is not a non-empty list with a
        # TypeError, which the handler below would not catch.
        if not isinstance(data, list) or not data:
            print(f"Error occurred while loading JSON data into MongoDB: {json_file} contains no records")
            return False
        collection.insert_many(data)
        print(f"Loaded {len(data)} records into MongoDB")
        return True
    except (FileNotFoundError, ValueError, PyMongoError) as e:
        print(f"Error occurred while loading JSON data into MongoDB: {e}")
        return False
        
# Connect to MongoDB and keep the returned client, database and collection handles.
# connect_to_mongodb and the mongo* settings are not defined in this notebook; presumably
# they come from db_connections.ipynb (run in the first cell) — confirm.
client, db, collection = connect_to_mongodb(mongoConnectionstring, mongoDatabasename, mongoCollectionnameFF)

# Replace the collection's contents with the records in 'Food_Service_Inspections.json'.
# The path is relative, so it is resolved against the kernel's working directory.
check_and_load_json_data('Food_Service_Inspections.json', collection)

Load the Data from the MongoDB Collection to a Pandas Dataframe

In [None]:
def load_FoodInspectiondata_to_dataframe(collection):
    """Read every document of a MongoDB collection into a pandas DataFrame.

    Args:
        collection: Collection object; ``count_documents`` and ``find`` are called on it.

    Returns:
        The DataFrame, or None when the collection is empty or any error
        occurs (a message is printed in both cases).
    """
    try:
        # Guard clause: nothing to load from an empty collection.
        if not collection.count_documents({}) > 0:
            print("No data available in MongoDB collection.")
            return None
        documents = list(collection.find())
        frame = pd.DataFrame(documents)
        print("Data loaded into a DataFrame.")
        return frame
    except Exception as e:
        print(f"Error occurred while loading data from MongoDB: {e}")
        return None

# The loader returns None when the collection is empty or an error occurs; in that case the
# following cells, which call DataFrame methods on FoodInspection_df, will fail.
FoodInspection_df = load_FoodInspectiondata_to_dataframe(collection)

In [None]:
FoodInspection_df.head()

Data Cleaning and Pre-Processing

In [None]:
# Drop the columns that are unnecessary for generating visual representations
def remove_columns(df, columns):
    """Return a new DataFrame equal to ``df`` without the listed columns; ``df`` is left untouched."""
    trimmed = df.drop(columns=columns)
    return trimmed
# Fields that none of the charts below use.
# NOTE: FoodInspection_df is overwritten here, so running this cell a second time without
# reloading the data fails — the columns are already gone and DataFrame.drop raises KeyError
# for missing labels by default.
columns_to_drop = ['_id','STAGE', 'OPNDATE', 'CLSDATE', 'PDIR', 'PSTREET', 'PSTREETTYPE', 'PSTREETSUF', 'PSUITE', 'PCODE', 'ITYPE', 'TIMEIN', 'TIMEOUT', 'TIMESP', 'COMM', 'INSPNO', 'CODE', 'REPEAT', 'CORRECTED']
FoodInspection_df = remove_columns(FoodInspection_df, columns_to_drop)

# Replace empty cells with None
def replace_empty_strings_with_none(dataframe):
    """Replace empty-string cells with None, modifying ``dataframe`` in place.

    Args:
        dataframe: DataFrame to clean; it is mutated.

    Returns:
        The same DataFrame object, so the call can also be used in an assignment.
    """
    # The mapping form is deliberate: with a bare ``replace('', None)`` older pandas
    # releases read ``value=None`` as "no value given" and forward-filled the empty
    # cells from the row above instead of blanking them.
    dataframe.replace({'': None}, inplace=True)
    return dataframe
# The helper edits FoodInspection_df in place, so its result is not reassigned here.
replace_empty_strings_with_none(FoodInspection_df)

# Remove missing values
def drop_rows_with_missing_values(dataframe):
    """Return a new DataFrame holding only the rows of ``dataframe`` that have no missing value."""
    complete_rows = dataframe.dropna()
    return complete_rows
FoodInspection_df = drop_rows_with_missing_values(FoodInspection_df)

# Remove duplicates 
def drop_duplicates(dataframe):
    """Return ``dataframe`` without repeated rows and without repeated column labels."""
    unique_rows = dataframe.drop_duplicates()
    # For a label that occurs more than once, only its first column is kept.
    first_of_each_label = ~unique_rows.columns.duplicated()
    return unique_rows.loc[:, first_of_each_label]
# Report the frame's shape before and after de-duplication so the effect of the step is visible.
print('Number of rows before removing duplicates:', FoodInspection_df.shape[0])
print('Number of columns before removing duplicates:', FoodInspection_df.shape[1])
FoodInspection_df = drop_duplicates(FoodInspection_df) # the helper returns a new frame, so reassign it
print('Number of rows after removing duplicates:', FoodInspection_df.shape[0])
print('Number of columns after removing duplicates:', FoodInspection_df.shape[1])

# Count the number of duplicates in the dataset
def count_duplicate_rows(dataframe):
    """Return how many rows of ``dataframe`` are flagged by ``DataFrame.duplicated()``."""
    repeated = dataframe.duplicated()
    return repeated.sum()
num_duplicates = count_duplicate_rows(FoodInspection_df)
print("Number of duplicates in the dataset: ", num_duplicates)

# Count the number of missing values in each column of the DataFrame
def count_missing_values(dataframe):
    """Return the total number of missing cells, summed over all columns."""
    missing_per_column = dataframe.isnull().sum()
    total_missing = missing_per_column.sum()
    return total_missing
num_missing = count_missing_values(FoodInspection_df)
print(f"Total number of missing values: {num_missing}")

# Count the num of invalid values in the dataset
def count_invalid_values(dataframe):
    """Return the number of null cells in ``dataframe``.

    Only nulls are counted as "invalid"; no other validity rule is applied.
    """
    nulls_per_column = dataframe.isnull().sum()
    return nulls_per_column.sum()
num_invalid_values = count_invalid_values(FoodInspection_df)
print("There are {} invalid values in the dataset.".format(num_invalid_values))

In [None]:
# Rename column names for easy understanding of data
def rename_columns(df):
    """Give the remaining source fields readable lower-case names.

    The columns of ``df`` are renamed in place and the same object is returned.
    Columns that are not listed keep their name.
    """
    source_names = ('UNID', 'NAME', 'STATUS', 'TYPE', 'RANKING',
                    'PBUILD', 'PCITY', 'PSTATE', 'EDATE', 'CriticalTier')
    readable_names = ('identification', 'name', 'status', 'type', 'ranking',
                      'address', 'city', 'state', 'date', 'critical')
    df.rename(columns=dict(zip(source_names, readable_names)), inplace=True)
    return df
rename_columns(FoodInspection_df)

In [None]:
FoodInspection_df.head()

In [None]:
print(FoodInspection_df)

Write the cleaned DataFrame to a CSV file

In [None]:
FoodInspection_df.to_csv('cleaned_food_inspection_data.csv', index=False)

Load the Saved CSV File

In [None]:
# Reload the cleaned data from disk. read_csv infers column types afresh, so values may come
# back with different dtypes than they had in FoodInspection_df (no date parsing is requested,
# so 'date' is read as text).
FoodInspectionCleaned = pd.read_csv("cleaned_food_inspection_data.csv")

Table Creation and Data Loading into PostgreSQL

In [None]:
# Function that generates a table within a PostgreSQL database to store data related to food inspections.

def create_foodInspection_table(conn, table_name):
    """Create the food-inspection table in PostgreSQL if it does not exist yet.

    Args:
        conn: Open database connection; ``cursor``, ``commit`` and ``rollback`` are used.
        table_name: Name of the table to create. It is interpolated directly into the
            SQL text, so it must come from trusted configuration, never from user input.

    A ``psycopg2.Error`` is printed and the transaction rolled back; nothing is
    raised to the caller in that case.
    """
    cur = None  # stays None when conn.cursor() itself fails, so ``finally`` can tell
    try:
        cur = conn.cursor() # Generate a cursor instance that can be used to execute SQL queries.
        # SQL query used for constructing a table that specifies the column names and their respective data types.
        cur.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (identification VARCHAR(255),
                name VARCHAR(255),
                status VARCHAR(255),
                type VARCHAR(255),
                ranking VARCHAR(255),
                address VARCHAR(255),
                city VARCHAR(255),
                state VARCHAR(255),
                date DATE,
                critical VARCHAR(255)
            )
        ''')
        conn.commit() # save changes made to the database by a transaction.
        print("Table created successfully!")
    except psycopg2.Error as e:
        print("Error creating table:", e)
        conn.rollback() # undo the changes made during the failed transaction.
    finally:
        # Without this guard a failing conn.cursor() would raise UnboundLocalError
        # here and hide the error that was just handled.
        if cur is not None:
            cur.close() # release the database cursor.
        
# postgresFoodinspectiontablename and conn are not defined in this notebook; presumably they
# are provided by db_connections.ipynb — confirm. table_name is reused by the load and extract
# cells below.
table_name = postgresFoodinspectiontablename
create_foodInspection_table(conn, table_name)

In [None]:
#  Function designed to transfer data from a DataFrame to a table in PostgreSQL

def loadFoodinspection_data_to_postgresql(conn, df, table_name):
    """Replace the rows of a PostgreSQL table with the contents of ``df``.

    Args:
        conn: Open database connection used to empty the table.
        df: DataFrame to write; its ``to_sql`` method is called.
        table_name: Target table. It is interpolated directly into the DELETE
            statement, so it must come from trusted configuration.

    Database errors are printed and the open transaction on ``conn`` is rolled
    back; nothing is raised to the caller in that case. The DELETE is committed
    before the insert starts, so a failed insert leaves the table empty.
    """
    engine = None
    try:
        engine = establish_postgres_connection(postgresUsername, postgresPassword, postgresHost, postgresPort, postgresDbname)
        # Empty an existing table first so that re-running the cell does not duplicate rows.
        if table_name in inspect(engine).get_table_names():
            with conn.cursor() as cursor:
                cursor.execute(f"DELETE FROM {table_name}")
            conn.commit()  # Commit the DELETE statement
        # Write the frame that was passed in, not whatever a global happens to hold.
        df.to_sql(table_name, engine, if_exists='append', index=False)
        print("Data loaded to PostgreSQL successfully!")
    except (exc.SQLAlchemyError, psycopg2.Error) as e:
        # The DELETE goes through the psycopg2 connection, so its errors must be caught as well.
        print("Error loading data to PostgreSQL:", e)
        conn.rollback()
    finally:
        # Release the engine's connections on failure too.
        if engine is not None:
            engine.dispose()
        
loadFoodinspection_data_to_postgresql(conn, FoodInspectionCleaned, table_name)

Extract the Pre-processed and Structured Data that was loaded into Postgres into a Pandas Dataframe

In [None]:
# Function that retrieves data from a PostgreSQL table and transfers it into a DataFrame.

def extract_foodinspection_data_from_postgresql(engine, table_name):
    """Read every row of ``table_name`` into a pandas DataFrame.

    Args:
        engine: Connectable handed to ``pandas.read_sql``.
        table_name: Table to read; it is interpolated directly into the query text.

    Returns:
        The DataFrame, or None when anything goes wrong (the error is printed).
    """
    try:
        frame = pd.read_sql(f'SELECT * FROM {table_name}', engine)
        print("Data loaded from PostgreSQL to DataFrame successfully!")
    except Exception as e:
        print("Error loading data from PostgreSQL to DataFrame:", e)
        return None
    return frame

# A fresh engine is created for reading (the loader above disposes its own). The result replaces
# the FoodInspection_df built during cleaning, so the charts below use the data as stored in
# PostgreSQL. If extraction fails the helper returns None and the chart cells will raise.
engine = establish_postgres_connection(postgresUsername, postgresPassword, postgresHost, postgresPort, postgresDbname)
FoodInspection_df = extract_foodinspection_data_from_postgresql(engine, table_name)

In [None]:
#conn.close() #Close the connection

VISUALIZATION

1) Bar chart showing the number of inspections conducted for each establishment type.

In [None]:
# Number of non-null inspection ids per establishment type, as a two-column frame.
insp_by_type = (
    FoodInspection_df
    .groupby('type')['identification']
    .count()
    .reset_index(name='Count')
)

fig, ax = plt.subplots()
ax.bar(insp_by_type['type'], insp_by_type['Count'])
ax.tick_params(axis='x', labelrotation=90)  # type names are long; rotate them to avoid overlap
ax.set_xlabel('Establishment Type')
ax.set_ylabel('Number of Inspections')
ax.set_title('Number of Inspections by Establishment Type')
plt.show()

2) Horizontal bar chart showing the number of inspection records for each license status (active, inactive, or other).

In [None]:
# Count the rows for each licence status. The index and the counts are named explicitly so the
# column names do not depend on the pandas version: value_counts().reset_index() yields
# ('index', 'status') before pandas 2.0 but ('status', 'count') from 2.0 on, which made the
# previous rename miss and the 'Status' lookup raise KeyError.
status_count = (
    FoodInspection_df['status']
    .value_counts()
    .rename_axis('Status')
    .reset_index(name='Count')
)
plt.barh(status_count['Status'], status_count['Count'])
plt.xlabel('Count')
plt.ylabel('Status')
plt.title('Inspection Status Counts')
plt.show()

3) Violation distribution by type and ranking.

In [None]:
# Cross-tabulate the rows: one bar per 'type', stacked by 'ranking' (absent pairs count as 0).
# NOTE(review): chart 1 labels the 'type' column "Establishment Type", while the labels here
# call it a violation type — confirm which wording is intended.
type_ranking_counts = (
    FoodInspection_df
    .groupby(['type', 'ranking'])
    .size()
    .unstack(fill_value=0)
)
ax = type_ranking_counts.plot.bar(stacked=True)
ax.set_xlabel('Type of Violation')
ax.set_ylabel('Number of Violations')
ax.set_title('Violation Distribution by Type and Ranking')
plt.show()

4) Most common violations by type and ranking.

In [None]:
# Row count for every (type, ranking) pair.
violation_counts = (
    FoodInspection_df
    .groupby(['type', 'ranking'])
    .size()
    .reset_index(name='count')
)
# Sorting by count first means head(1) keeps, for each type, its most frequent ranking.
ranked = violation_counts.sort_values('count', ascending=False)
most_common = ranked.groupby('type').head(1)

# Only the count is drawn; the chart does not show which ranking it belongs to.
ax = most_common.plot.bar(x='type', y='count', color='blue')
ax.set_title('Most Common Violations by Type and Ranking')
ax.set_xlabel('Violation Type')
ax.set_ylabel('Violation Count')
plt.show()