In [None]:
import math
import os

import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns

In [None]:
# Define PostgreSQL connection parameters
postgres_params = {
    "postgres_conn_id": "postgres_default",
    "host": "localhost",
    "port": 5432,
    "database": "postgres",
    "username": "airflow",
    "password": "airflow",
}

# Connect to the postgres databse
connection = psycopg2.connect(**postgres_params)
print(connection.get_dsn_parameters(),"\n")

cursor = connection.cursor()

In [None]:
# Return: correlation coefficient
# For a given month in a given region, queries the database to calcualte the correlation coefficient
def query_without_threshold(cursor, region, month):
    
    cursor.execute('''SELECT SUM(temperature) FROM deaths_and_temperature WHERE month = ? AND region = ?;''', (month, region))
    x_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(total_deaths) FROM deaths_and_temperature WHERE month = ? AND region = ?;''', (month, region))
    y_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(temperature * temperature) FROM deaths_and_temperature WHERE month = ? AND region = ?;''', (month, region))
    x_squared_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(total_deaths * total_deaths) FROM deaths_and_temperature WHERE month = ? AND region = ?;''', (month, region))
    y_squared_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT COUNT(*) FROM deaths_and_temperature WHERE month = ? AND region = ?;''', (month, region))
    count_x = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(temperature * total_deaths) FROM deaths_and_temperature WHERE month = ? AND region = ?;''', (month, region))
    sum_xy_product = cursor.fetchone()[0]
   
    # math function for the correlation
    try:
        correlation_coefficient = (count_x * sum_xy_product - (x_sum * y_sum)) / math.sqrt((count_x * x_squared_sum - x_sum * x_sum) * (count_x * y_squared_sum - y_sum * y_sum))
    # if denominator is zero, error occurs
    except ZeroDivisionError:
        print("Error: Division by zero. This could be due to the dataset being too small or no data points meeting the threshold.")
        correlation_coefficient = None
    
    return correlation_coefficient

In [None]:
# Return: correlation coefficient
# For a given month in a given region, queries the database to calcualte the correlation coefficient for a spesific threshold for the temperature
def query_with_threshold(cursor, region, month, threshold):
    
    cursor.execute('''SELECT SUM(temperature) FROM deaths_and_temperature WHERE month = ? AND region = ? AND temperature >= ?;''', (month, region, threshold))
    x_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(total_deaths) FROM deaths_and_temperature WHERE month = ? AND region = ? AND temperature >= ?;''', (month, region, threshold))
    y_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(temperature * temperature) FROM deaths_and_temperature WHERE month = ? AND region = ? AND temperature >= ?;''', (month, region, threshold))
    x_squared_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(total_deaths * total_deaths) FROM deaths_and_temperature WHERE month = ? AND region = ? AND temperature >= ?;''', (month, region, threshold))
    y_squared_sum = cursor.fetchone()[0]

    cursor.execute('''SELECT COUNT(*) FROM deaths_and_temperature WHERE month = ? AND region = ? AND temperature >= ?;''', (month, region, threshold))
    count_x = cursor.fetchone()[0]

    cursor.execute('''SELECT SUM(temperature * total_deaths) FROM deaths_and_temperature WHERE month = ? AND region = ? AND temperature >= ?;''', (month, region, threshold))
    sum_xy_product = cursor.fetchone()[0]
   
    # math function for the correlation
    try:
        correlation_coefficient = (count_x * sum_xy_product - (x_sum * y_sum)) / math.sqrt((count_x * x_squared_sum - x_sum * x_sum) * (count_x * y_squared_sum - y_sum * y_sum))
    # if denominator is zero, error occurs
    except ZeroDivisionError:
        print("Error: Division by zero. This could be due to the dataset being too small or no data points meeting the threshold.")
        correlation_coefficient = None

    return correlation_coefficient

In [None]:
cursor.execute('''SELECT DISTINCT region FROM deaths_and_temperature;''')
regions = cursor.fetchall()

without_threshold = []
with_threshold = []

for region in regions:
    region = region[0]

    for month in range(1, 13):
    
        without_threshold.append({'Month': month, 'Region': region, 'Correlation coefficient': query_without_threshold(cursor, region, month), 'Threshold': None})
        
        with_threshold.append({'Month': month, 'Region': region, 'Correlation coefficient': query_with_threshold(cursor, region, month, 32), 'Treshold': 32})

# Commit and close the connection
connection.commit()
cursor.close()
connection.close()

In [None]:
# Visualize: heatmap with x-axis: Month and y-axis: correlation coefficient
# Creates a heatmap of the correlation coefficients over the months of the year in different regions
def visualization_question_one():
    # Create a DataFrame from the data
    df = pd.DataFrame(without_threshold, columns=['Month', 'Region', 'Correlation coefficient', 'Threshold'])

    # Drop rows with null 'Correlation coefficient' values
    df = df.dropna(subset=['Correlation coefficient'])

    # Visualize relationship between total deaths and temperature across all regions
    sns.lmplot(x='Month', y='Correlation coefficient', hue='Region', data=df)

    plt.title('Heatmap')
    plt.show()
    
visualization_question_one()

In [None]:
# Visualize: heatmap with x-axis: Month and y-axis: correlation coefficient
# Creates a heatmap of the correlation coefficients over the months of the year in different regions with threshold
def visualization_question_two():
    # Create a DataFrame from the data
    df = pd.DataFrame(with_threshold, columns=['Month', 'Region', 'Correlation coefficient', 'Threshold'])

    # Drop rows with null 'Correlation coefficient' values
    df = df.dropna(subset=['Correlation coefficient'])

    # Visualize relationship between total deaths and temperature across all regions
    sns.lmplot(x='Month', y='Correlation coefficient', hue='Region', data=df)

    plt.title('Heatmap')
    plt.show()
    
visualization_question_two()