## Notes


### Imports

In [None]:
import logging
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath("../.."))

from scripts import get_session
from scripts import load_config, setup_logging
from orm_models import ACS2017CountyData, MortalityRate2014


### Config and Logging Setup

In [None]:
# Read the project config, hand its log path to setup_logging, and record the
# start of the run. Any failure is logged and re-raised so the notebook stops.
try:
    config = load_config()
    log_path = config['paths']['log_path']
    setup_logging(log_path)
    logging.info("Starting the data analysis project.")
except Exception as exc:
    logging.error(f"Failed to load config or setup logging: {exc}")
    raise

### Database Session

In [None]:
# Obtain a session from the project's get_session helper; it is reused by the
# query cell below. A failure is logged and re-raised unchanged.
try:
    session = get_session()
    logging.info("Database session created successfully.")
except Exception as exc:
    logging.error(f"Failed to create database session: {exc}")
    raise

### Query Poverty and Mortality Data

In [None]:
# Select the poverty and mortality_rate_2014_max columns from the two tables,
# pairing rows whose fips_code values match, and load the result into `df`.
try:
    data = (
        session.query(
            ACS2017CountyData.poverty,
            MortalityRate2014.mortality_rate_2014_max,
        )
        .filter(ACS2017CountyData.fips_code == MortalityRate2014.fips_code)
        .all()
    )

    # Name the columns explicitly: later cells look them up by name, and an
    # empty result would otherwise produce a frame with no columns at all.
    df = pd.DataFrame(data, columns=['poverty', 'mortality_rate_2014_max'])
    print(df.head())
except Exception as e:
    # Message now names the data actually queried in this cell.
    logging.error(f"Failed to query poverty and mortality data: {e}")
    raise


### Visualize Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of the 'poverty' column (x) against 'mortality_rate_2014_max' (y).
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.scatterplot(data=df, x='poverty', y='mortality_rate_2014_max', ax=ax)
ax.set_title('Poverty vs Mortality Rate')
ax.set_xlabel('Poverty Rate')
ax.set_ylabel('Mortality Rate 2014 Max')
plt.show()

### Preprocess Data and Elbow Method

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# The two columns used as clustering features. Selecting them explicitly keeps
# the 'cluster' label column (added to df by the K-means cell) out of the
# scaled matrix when this cell is re-run after clustering.
feature_columns = ['poverty', 'mortality_rate_2014_max']

# Discard rows that contain missing values before scaling.
df = df.dropna()

# Standardize the features; df_scaled is reused by the K-means cell.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[feature_columns])

# Elbow method to determine the optimal number of clusters:
# fit K-means for k = 1..10 and record each model's inertia (WCSS).
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df_scaled)
    wcss.append(kmeans.inertia_)

# Plotting the elbow plot
plt.figure(figsize=(10, 6), dpi=300)
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()



### K Means Clustering

In [None]:
# Cluster count used for the final model (noted as the optimal number of clusters).
k = 5

# Fit K-means on the scaled features and store each row's label on df.
kmeans = KMeans(n_clusters=k, random_state=42).fit(df_scaled)
df['cluster'] = kmeans.labels_

### Plot Clusters

In [None]:
# Same poverty/mortality scatter as before, with points colored by the
# 'cluster' column.
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
sns.scatterplot(
    data=df,
    x='poverty',
    y='mortality_rate_2014_max',
    hue='cluster',
    s=30,
    palette="colorblind",
    ax=ax,
)
ax.set_title('Poverty vs Mortality Rate - Clustered')
ax.set_xlabel('Poverty Rate')
ax.set_ylabel('Mortality Rate')
ax.legend(title='Cluster')
plt.show()