### IMPORT PACKAGES AND LIBRARIES

In [None]:
#!pip install pymongo
#!pip install psycopg2
#!pip install wordcloud
import pymongo
import requests
import pandas as pd
import numpy as np
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import exc
from sqlalchemy import inspect

### DATA CLEANING AND PRE-PROCESSING

In [None]:
# Read the raw outbreak records from the CSV file into a DataFrame
diseaseOutbreak = pd.read_csv(filepath_or_buffer="outbreaks.csv")

# Define a function to drop unrelated columns and create new ones
def drop_and_create_columns(diseaseOutbreak):
    """Return a new frame without the serotype column and with a combined location/state column.

    The input frame is left untouched, so the function can be called again on
    the same raw data (for example when a notebook cell is re-run) without
    failing on the already-dropped column.

    Args:
        diseaseOutbreak: DataFrame containing at least the "Serotype/Genotype",
            "Location" and "State" columns.

    Returns:
        A new DataFrame in which "Serotype/Genotype" is removed and
        "Location_State" holds "<Location>, <State>" for every row.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    columns_to_drop = ["Serotype/Genotype"]
    return (
        diseaseOutbreak
        .drop(columns=columns_to_drop)
        # The new column is appended after the existing ones.
        .assign(Location_State=lambda frame: frame["Location"] + ", " + frame["State"])
    )

# Define a function to remove missing values
def remove_missing_values(diseaseOutbreak):
    """Return a new frame containing only the rows that have no missing values.

    The input frame is not modified; rows keep their original index labels.

    Args:
        diseaseOutbreak: DataFrame to filter.

    Returns:
        A new DataFrame without any row that has a missing value in any column.
    """
    return diseaseOutbreak.dropna()

# Rename column names for easy understanding of data
def rename_diseaseoutbreakcolumns(df):
    """Rename the outbreak columns to their lower-case names, in place.

    Columns that are not listed below keep their current name.

    Args:
        df: DataFrame whose columns are renamed; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (current name, new name) pairs
    renames = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('State', 'state'),
        ('Location', 'location'),
        ('Food', 'food'),
        ('Ingredient', 'ingredient'),
        ('Species', 'species'),
        ('Status', 'status'),
        ('Illnesses', 'illnesses'),
        ('Hospitalizations', 'hospitalizations'),
        ('Fatalities', 'fatalities'),
        ('Location_State', 'location_state'),
    )
    df.rename(columns=dict(renames), inplace=True)
    return df

# Run the cleaning steps in order: drop/create columns, remove rows with
# missing values, then rename the columns
diseaseOutbreak = (
    diseaseOutbreak
    .pipe(drop_and_create_columns)
    .pipe(remove_missing_values)
    .pipe(rename_diseaseoutbreakcolumns)
)

# Save the cleaned data without the index column
diseaseOutbreak.to_csv(path_or_buf='outbreak_Clean.csv', index=False)

### RUN DATABASE CONNECTION FILE

In [None]:
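# NOTE(review): the %run below is commented out, yet later cells use `conn`,
# `postgresOutbreaktablename`, `establish_postgres_connection` and the postgres*
# settings, none of which are defined in the visible cells of this notebook.
# Presumably db_connections.ipynb provides them; confirm, and run it (or
# re-enable this line) before executing the cells below.
#%run ./db_connections.ipynb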

### TABLE CREATION AND DATA LOADING INTO POSTGRESQL

In [None]:
# Function to create a table in PostgreSQL database for Disease_Outbreak data.

def create_disease_outbreak_table(conn, table_name):
    """Create the outbreak table if it does not exist yet.

    On success the transaction is committed and a confirmation is printed.
    A psycopg2 error is printed and the transaction is rolled back; it is not
    re-raised.

    Args:
        conn: Open database connection providing cursor(), commit() and
            rollback().
        table_name: Name of the table to create. It is interpolated into the
            SQL text as-is, so it must come from trusted configuration.

    Returns:
        None.
    """
    cur = None  # stays None when conn.cursor() itself fails, see finally below
    try:
        cur = conn.cursor() # Create a cursor object for executing SQL queries.
        # SQL query to create the table with column names and data types.
        cur.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (
                 year TEXT,
                 month TEXT,
                 state TEXT,
                 location TEXT,
                 food TEXT,
                 ingredient TEXT,
                 species TEXT,
                 status TEXT,
                 illnesses INT,
                 hospitalizations INT,
                 fatalities INT,
                 location_state TEXT
            )
        ''')
        conn.commit() # Commit the transaction.
        print("Table created successfully!")
    except psycopg2.Error as e:
        print("Error creating table:", e)
        conn.rollback() # Rollback the transaction in case of an error.
    finally:
        # Only close a cursor that was actually opened; otherwise the cleanup
        # would raise and hide the error handled above.
        if cur is not None:
            cur.close() # Close the cursor.
        
# Keep the target table name in a module-level variable; the load and extract
# cells further down reuse it.
table_name = postgresOutbreaktablename
create_disease_outbreak_table(conn=conn, table_name=table_name)

In [None]:
# Preview the first five rows of the cleaned frame
diseaseOutbreak.head(n=5)

In [None]:
# Function to load data from a DataFrame to a PostgreSQL table.

def load_disease_outbreak_to_postgresql(conn, diseaseOutbreak, table_name):
    """Replace the contents of ``table_name`` with the rows of ``diseaseOutbreak``.

    An engine is obtained from ``establish_postgres_connection`` using the
    module-level postgres* settings (both defined outside this block). If the
    table already exists, its rows are deleted through ``conn`` and that
    deletion is committed before the frame is appended through the engine, so
    the two steps are not one atomic transaction.

    Args:
        conn: Open database connection used for the DELETE statement.
        diseaseOutbreak: DataFrame whose rows are written to the table.
        table_name: Name of the target table. It is interpolated into the
            DELETE statement as-is, so it must come from trusted configuration.

    Returns:
        None. SQLAlchemy errors are printed and not re-raised.

    Raises:
        psycopg2.Error: if the DELETE on ``conn`` fails. The transaction is
            rolled back first so the shared connection stays usable.
    """
    engine = None  # stays None when the engine could not be created
    try:
        engine = establish_postgres_connection(postgresUsername, postgresPassword, postgresHost, postgresPort, postgresDbname)
        inspector = inspect(engine)
        # Check if the table already contains data, if so, delete all the rows.
        if table_name in inspector.get_table_names():
            with conn.cursor() as cursor:
                cursor.execute(f"DELETE FROM {table_name}")
                conn.commit()  # Commit the DELETE statement
        diseaseOutbreak.to_sql(table_name, engine, if_exists='append', index=False)
        print("Data loaded to PostgreSQL successfully!")
    except exc.SQLAlchemyError as e:
        print("Error loading data to PostgreSQL:", e)
    except psycopg2.Error:
        # A failed statement leaves the connection's transaction aborted;
        # roll it back and let the error propagate as it did before.
        conn.rollback()
        raise
    finally:
        # Dispose the engine on every path, not only after a successful load.
        if engine is not None:
            engine.dispose()

# Load the cleaned frame into the PostgreSQL table
load_disease_outbreak_to_postgresql(conn=conn, diseaseOutbreak=diseaseOutbreak, table_name=table_name)

### EXTRACT THE PRE-PROCESSED AND STRUCTURED DATA WHICH IS LOADED INTO POSTGRES INTO A PANDAS DATAFRAME

In [None]:
# Function to extract data from PostgreSQL table and load it into a DataFrame

def extract_disease_data_from_postgresql(engine, table_name):
    """Read every row of ``table_name`` into a DataFrame.

    Args:
        engine: Connection or engine object accepted by ``pd.read_sql``.
        table_name: Name of the table to read; interpolated into the query
            text as-is.

    Returns:
        The table contents as a DataFrame, or None when reading fails (the
        error is printed instead of being raised).
    """
    try:
        frame = pd.read_sql(sql=f'SELECT * FROM {table_name}', con=engine)
        print("Data loaded from PostgreSQL to DataFrame successfully!")
    except Exception as e:
        print("Error loading data from PostgreSQL to DataFrame:", e)
        frame = None
    return frame

# Build the engine from the PostgreSQL settings, then read the table back
engine = establish_postgres_connection(postgresUsername, postgresPassword, postgresHost, postgresPort, postgresDbname)
disease_df_cleaned = extract_disease_data_from_postgresql(engine=engine, table_name=table_name)

In [None]:
# Preview the last five rows read back from PostgreSQL
disease_df_cleaned.tail(n=5)

### CLOSE THE CONNECTION

In [None]:
#conn.close() #Close the connection

###                                       VISUALIZATION

#### VISUALIZING FOR FOOD COLUMN IN BAR CHART PLOT

In [None]:
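# Which 10 food items have the highest combined hospitalization and fatality rates?
# NOTE(review): the following cell ranks foods by how many records mention them
# (value_counts), not by hospitalizations or fatalities.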

In [1]:
# Column whose most frequent values are plotted
col = "food"

# The ten most frequent values of that column, with their counts
value_frequencies = disease_df_cleaned[col].value_counts()
top_10 = value_frequencies.nlargest(10)

# Bar chart of the counts
ax = top_10.plot.bar()
ax.set(xlabel=col, ylabel="Count", title=f"Top 10 {col} (Bar Plot)")
plt.show()

NameError: name 'disease_df_cleaned' is not defined

#### VISUALIZING FOR ILLNESS AND HOSPITALIZATIONS COLUMNS USING SCATTER PLOT GRAPHS

In [None]:
#What is the relationship between the number of reported illnesses and hospitalizations in disease outbreaks, and how does this relationship vary across different types of outbreaks and demographic groups?

In [None]:
# Define a function to create a scatter plot
def create_scatter_plot(disease_df_cleaned, x_col, y_col):
    """Draw and show a scatter plot of ``y_col`` against ``x_col``.

    Args:
        disease_df_cleaned: DataFrame holding both columns.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    axes = disease_df_cleaned.plot.scatter(x=x_col, y=y_col)
    # Label both axes with the column names and title the chart
    axes.set(xlabel=x_col, ylabel=y_col, title=f"{y_col} vs {x_col}")

    plt.show()

# Scatter plot with illnesses on the x axis and hospitalizations on the y axis
create_scatter_plot(disease_df_cleaned=disease_df_cleaned, x_col="illnesses", y_col="hospitalizations")

#### VISUALIZATION FOR YEAR AND ILLNESS USING LINE CHART

In [None]:
#Are there any trends in the number of foodborne illnesses over time in the dataset?

In [None]:
# Define a function to create a line chart
def create_line_chart(disease_df_cleaned, x_col, y_col):
    """Draw and show a line chart of the summed ``y_col`` per ``x_col`` value.

    Args:
        disease_df_cleaned: DataFrame holding both columns.
        x_col: Name of the column to group by; its values form the x axis.
        y_col: Name of the column that is summed within each group.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    # One total of y_col for every distinct x_col value
    totals = disease_df_cleaned.groupby(x_col)[y_col].sum()

    axes = totals.plot.line()
    axes.set(xlabel=x_col, ylabel=y_col, title=f"{y_col} over {x_col}")

    plt.show()

# Line chart of total illnesses per year
create_line_chart(disease_df_cleaned=disease_df_cleaned, x_col="year", y_col="illnesses")

#### VISUALIZATIONS FOR ILLNESS USING HISTOGRAM

In [None]:
#What are the most effective strategies for preventing or mitigating the spread of illnesses based on the patterns and trends identified in the dataset?

In [None]:
# Define a function to create a histogram
def create_histogram(disease_df_cleaned, col):
    """Draw and show a histogram of one column.

    Args:
        disease_df_cleaned: DataFrame holding the column.
        col: Name of the column whose distribution is plotted.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    axes = disease_df_cleaned[col].plot.hist()
    axes.set(xlabel=col, ylabel="Frequency", title=f"Distribution of {col}")

    plt.show()

# Histogram of the illnesses column
create_histogram(disease_df_cleaned=disease_df_cleaned, col="illnesses")

In [None]:
#What are the top 10 species that are most commonly associated with reported cases of illnesses and hospitalizations?

In [1]:
# Grouping column and the two outcome columns that are stacked
x_col = "species"
y_cols = ["illnesses", "hospitalizations"]

# Rank species by illnesses + hospitalizations and keep the names of the top 10
species_totals = disease_df_cleaned.groupby(x_col)[y_cols].sum()
top_species = species_totals.sum(axis=1).nlargest(10).index.tolist()

# Keep only the rows that belong to one of those species
is_top_species = disease_df_cleaned[x_col].isin(top_species)
df_top_species = disease_df_cleaned[is_top_species]

# Stacked bar chart of both outcome totals per species
ax = df_top_species.groupby(x_col)[y_cols].sum().plot.bar(stacked=True)
ax.set(xlabel=x_col, ylabel="Count", title="Top 10 Species and Illness Outcomes (Stacked Bar Chart)")
plt.show()

NameError: name 'disease_df_cleaned' is not defined

In [None]:

# What is the total number of reported illnesses for each of the top 10 species?

In [None]:
# Grouping column and the column that is summed
x_col = "species"
y_col = "illnesses"

# Total illnesses per species, largest first, limited to the first 10 entries
species_counts = (
    disease_df_cleaned
    .groupby(x_col)[y_col]
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)

# Pie chart with each slice labelled by species and a one-decimal percentage
plt.pie(species_counts.values, labels=species_counts.index, autopct='%1.1f%%')
plt.title(f"{y_col} by {x_col} (Pie Chart)")
plt.show()