# ADS-508-01-SP23 Team 8: Final Project

# Setup Database and Athena Tables

Much of the code is modified from `Fregly, C., & Barth, A. (2021). Data science on AWS: Implementing end-to-end, continuous AI and machine learning pipelines. O’Reilly.`

## Install missing dependencies

[PyAthena](https://pypi.org/project/PyAthena/) is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.

In [None]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
!pip install missingno

## Globally import libraries

In [None]:
import boto3
from botocore.client import ClientError
from IPython.core.display import display, HTML
import pandas as pd
from pyathena import connect
import matplotlib.pyplot as plt
import missingno as msno
import sagemaker
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold

# Set Seaborn parameters.
# The style is passed to `sns.set()` instead of being assigned to
# `sns.set_style`: that assignment replaced seaborn's `set_style` function
# with a string and never applied a style. The `rc` overrides below are
# applied on top of the chosen style.
sns.set(
    style="whitegrid",
    rc={
        "font.style": "normal",
        "axes.facecolor": "white",
        "grid.color": ".8",
        "grid.linestyle": "-",
        "figure.facecolor": "white",
        "figure.titlesize": 20,
        "text.color": "black",
        "xtick.color": "black",
        "ytick.color": "black",
        "axes.labelcolor": "black",
        "axes.grid": True,
        "axes.labelsize": 10,
        "xtick.labelsize": 10,
        "font.size": 10,
        "ytick.labelsize": 10,
    },
)

## Instantiate AWS SageMaker session

In [None]:
# A single boto3 session supplies both the region and the S3 client
# (the original built a second, redundant Session just for the client).
session = boto3.session.Session()
region = session.region_name

# SageMaker session, its default bucket, and the notebook's execution role.
sagemaker_session = sagemaker.Session()
def_bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Team bucket; the Athena staging directory is built under it in a later cell.
bucket = 'sagemaker-us-east-ads508-sp23-t8'

s3 = session.client(service_name="s3", region_name=region)

In [None]:
# Show which buckets this notebook works with: the SageMaker default bucket
# and the team bucket named in the session-setup cell.
print(f"Default bucket: {def_bucket}")
print(f"Public T8 bucket: {bucket}")

# Citation: OpenAI
# Resource-level S3 handle and a Bucket object for the team bucket.
s3_conn = boto3.resource('s3')
bucket_conn = s3_conn.Bucket(bucket)

# List bucket contents: prints the key of every object in the bucket.
# NOTE(review): this iterates all objects, so output grows with bucket size.
for obj in bucket_conn.objects.all():
    print(obj.key)

In [None]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries.
# It lives under the team bucket and is passed to the PyAthena `connect()` call.
s3_staging_dir = f"s3://{bucket}/athena/staging"
print(s3_staging_dir)

In [None]:
# Open the PyAthena connection used by every `pd.read_sql` call in this notebook,
# in the session's region and with the staging directory defined above.
conn = connect(region_name=region,
               s3_staging_dir=s3_staging_dir)

In [None]:
# Athena database name; interpolated into the FROM clause of each SQL statement.
database_name = "ads508_t8"

## Explore DB tables

### `census`

In [None]:
# Athena table name for the census data; combined with `database_name` in the queries.
cen_tsv_tbl_name = 'census'

#### Explore via SQL SELECT statements

In [None]:
# Pull a small sample of census rows for a single borough.
# The borough value is compared against lower(borough), so keep it lowercase.
cen_bourough01 = "bronx"

cen_select_borough_stmnt01 = f"""
    SELECT * FROM {database_name}.{cen_tsv_tbl_name}
    WHERE lower(borough) = '{cen_bourough01}'
    LIMIT 17
    """

# Echo the rendered SQL so the executed statement is visible in the output
print(cen_select_borough_stmnt01)

# Execute the statement through the Athena connection into a DataFrame
cen_df01_s01 = pd.read_sql(sql=cen_select_borough_stmnt01, con=conn)

# Show the sampled rows
cen_df01_s01.head(17)

#### Perform aggregated summaries

In [None]:
# Count the census records whose `hispanic` value is NULL.
# `DISTINCT` was dropped because GROUP BY already yields one row per group,
# and the count is aliased so the result column has a readable name.
cen_select_hispanic_stmnt01 = f"""
    SELECT
        hispanic,
        count(*) AS null_count
    FROM {database_name}.{cen_tsv_tbl_name}
    WHERE hispanic IS NULL
    GROUP BY hispanic
    LIMIT 10
    """

# Display SQL statement
print(cen_select_hispanic_stmnt01)

# Run SQL statement against Athena table
cen_df01_s03 = pd.read_sql(cen_select_hispanic_stmnt01,
                           conn)
# Display results
cen_df01_s03.head(17)

In [None]:
# Borough-level summary: one row per borough with the number of table rows
# (`ctract_count`), the summed `totalpop`, and totalpop-weighted values for
# each demographic measure plus a totalpop-weighted mean of `income`.
# Each `*_perc` column is SUM(round(totalpop * x / 100)) / SUM(totalpop), i.e.
# a fraction of the summed population rather than a 0-100 percentage.
# assumes the source columns (hispanic, white, ...) are stored as 0-100 percentages — TODO confirm
# NOTE(review): SQL SUM skips NULLs, so rows where a measure is NULL add to the
# denominator but not the numerator; confirm whether that bias is acceptable.
# NOTE(review): child_poverty_perc is weighted by total population, not by the
# child population — confirm this is intended.
# NOTE(review): LIMIT without ORDER BY returns rows in no guaranteed order.
cen_summ_borough_stmnt01 = f"""
    SELECT
        borough,
        COUNT(*) AS ctract_count,
        SUM(totalpop) AS bor_pop,
        SUM(round(totalpop*hispanic/100,0))/SUM(totalpop) AS hispanic_perc,
        SUM(round(totalpop*white/100,0))/SUM(totalpop) AS white_perc,
        SUM(round(totalpop*black/100,0))/SUM(totalpop) AS black_perc,
        SUM(round(totalpop*native/100,0))/SUM(totalpop) AS native_perc,
        SUM(round(totalpop*asian/100,0))/SUM(totalpop) AS asian_perc,
        SUM(round(totalpop*childpoverty/100,0))/SUM(totalpop) AS child_poverty_perc,
        SUM(round(totalpop*income,0))/SUM(totalpop) AS income_avg
    FROM {database_name}.{cen_tsv_tbl_name}
    GROUP BY borough
    LIMIT 100
    """

# Display SQL statement
print(cen_summ_borough_stmnt01)

# Run SQL statement against Athena table
cen_df01_s02 = pd.read_sql(cen_summ_borough_stmnt01,
                           conn)
# Display results (only the first 17 rows are shown)
cen_df01_s02.head(17)

In [None]:
# Census-tract-level summary: the same totalpop-weighted aggregates as the
# borough summary, grouped by `censustract` instead.
# This cell previously reused the borough cell's variable names and silently
# overwrote the borough statement and DataFrame; it now has its own names, and
# the population alias reflects the tract grouping.
# NOTE(review): LIMIT 100 without ORDER BY returns an arbitrary 100 tracts.
cen_summ_tract_stmnt01 = f"""
    SELECT
        censustract,
        COUNT(*) AS ctract_count,
        SUM(totalpop) AS tract_pop,
        SUM(round(totalpop*hispanic/100,0))/SUM(totalpop) AS hispanic_perc,
        SUM(round(totalpop*white/100,0))/SUM(totalpop) AS white_perc,
        SUM(round(totalpop*black/100,0))/SUM(totalpop) AS black_perc,
        SUM(round(totalpop*native/100,0))/SUM(totalpop) AS native_perc,
        SUM(round(totalpop*asian/100,0))/SUM(totalpop) AS asian_perc,
        SUM(round(totalpop*childpoverty/100,0))/SUM(totalpop) AS child_poverty_perc,
        SUM(round(totalpop*income,0))/SUM(totalpop) AS income_avg
    FROM {database_name}.{cen_tsv_tbl_name}
    GROUP BY censustract
    LIMIT 100
    """

# Display SQL statement
print(cen_summ_tract_stmnt01)

# Run SQL statement against Athena table
cen_df01_s05 = pd.read_sql(cen_summ_tract_stmnt01,
                           conn)
# Display results (only the first 17 rows are shown)
cen_df01_s05.head(17)

#### Load potential predictors and target for further exploration using pandas

In [None]:
# Load borough plus the numeric census columns for rows that have a
# childpoverty value, capped at 5000 rows, for exploration in pandas.
cen_box_stmnt01 = f"""
    SELECT
        borough,
        totalpop,
        men,
        women,
        hispanic,
        white,
        black,
        native,
        asian,
        citizen,
        income,
        poverty,
        childpoverty,
        professional,
        service,
        office,
        construction,
        production,
        drive,
        carpool,
        transit,
        walk,
        othertransp,
        workathome,
        meancommute,
        employed,
        privatework,
        publicwork,
        selfemployed,
        familywork,
        unemployment
    FROM {database_name}.{cen_tsv_tbl_name}
    WHERE childpoverty IS NOT NULL
    LIMIT 5000
    """

# Echo the rendered SQL
print(cen_box_stmnt01)

# Execute through the Athena connection into a DataFrame
cen_df01_s04 = pd.read_sql(sql=cen_box_stmnt01, con=conn)

# Preview the first rows
cen_df01_s04.head(17)

#### Display boxplots for select features

In [None]:
%matplotlib inline
sns.boxplot(x='borough', y='totalpop', data=cen_df01_s04).set(title='Population of Each Census Tract by Borough')

In [None]:
%matplotlib inline
sns.boxplot(x='borough', y='childpoverty', data=cen_df01_s04).set(title='Child Poverty (%) of Each Census Tract by Borough')

#### Create subsets of columns for various purposes

In [None]:
# Every non-borough column selected by the predictor query
cen_df01_s04_num_lst01 = [
    'totalpop',
    'men',
    'women',
    'hispanic',
    'white',
    'black',
    'native',
    'asian',
    'citizen',
    'income',
    'poverty',
    'childpoverty',
    'professional',
    'service',
    'office',
    'construction',
    'production',
    'drive',
    'carpool',
    'transit',
    'walk',
    'othertransp',
    'workathome',
    'meancommute',
    'employed',
    'privatework',
    'publicwork',
    'selfemployed',
    'familywork',
    'unemployment',
]

# Reduced column set (this is the one passed to the pair plot)
cen_df01_s04_num_lst02 = [
    'totalpop',
    'women',
    'hispanic',
    'black',
    'native',
    'asian',
    'citizen',
    'income',
    'poverty',
    'childpoverty',
    'professional',
    'service',
    'office',
    'construction',
    'meancommute',
    'employed',
    'unemployment',
]

# Column-subset frames: full list and reduced list
cen_df02_s01 = cen_df01_s04[cen_df01_s04_num_lst01]
cen_df03_s01 = cen_df01_s04[cen_df01_s04_num_lst02]

# Preview the full-list frame
display(cen_df02_s01.head(17))

In [None]:
# Pair scatter plots for selected features.
# `cen_df03_s01` is the reduced column subset (cen_df01_s04_num_lst02); the
# commented-out line below is an alternative that plots only the first two
# columns of the full subset.
#sns.pairplot(cen_df02_s01.iloc[:, 0:2])
sns.pairplot(cen_df03_s01)

In [None]:
# Visualize missing values in each column of the frame loaded from the
# census table (borough plus all numeric columns).
msno.matrix(cen_df01_s04)

In [None]:
# Identify features whose null count exceeds a fraction of the total row count.
# The earlier comment said 15% while the code applied 5%; the cutoff is now a
# single named value that matches what the code has always computed.
# NOTE(review): confirm whether 5% or 15% is the intended cutoff.
cen_null_frac_threshold = 0.05

# Null count per column
cen_df01_s04_null_summ01 = pd.DataFrame(cen_df01_s04.isnull().sum(), columns=['null_count'])

# Keep only columns that have nulls, largest count first
cen_df01_s04_null_summ02 = cen_df01_s04_null_summ01.loc[
    cen_df01_s04_null_summ01['null_count'] != 0
].sort_values('null_count', ascending=False)
# reset_index() moves the column names into a column called 'index'
cen_df01_s04_null_summ03 = cen_df01_s04_null_summ02.reset_index()
print(cen_df01_s04_null_summ03)

# Columns whose null count is above the threshold share of rows
cen_null_count_cutoff = len(cen_df01_s04) * cen_null_frac_threshold
cen_df01_s04_null_summ04 = cen_df01_s04_null_summ03.loc[
    cen_df01_s04_null_summ03['null_count'] > cen_null_count_cutoff
]
print('\n', cen_df01_s04_null_summ04)

# Names of the columns flagged for removal
cen_df01_s04_null_summ04_remove_lst01 = list(cen_df01_s04_null_summ04['index'])
print('\n', cen_df01_s04_null_summ04_remove_lst01)

# Left disabled: the drop step below refers to a test frame that is not
# defined in the visible cells of this notebook.
#train_x03_tx_df01 = cen_df01_s04.drop(cen_df01_s04_null_summ04_remove_lst01, axis=1)
#test_x03_tx_df01 = test_x02_tx_df01_eda1.drop(cen_df01_s04_null_summ04_remove_lst01, axis=1)

#print(f'\n{train_x03_tx_df01.shape}')
#print(f'\n{test_x03_tx_df01.shape}')

In [None]:
# Review zero-variance features for possible removal.
# The threshold is spelled out: 0.0 is VarianceThreshold's default, which drops
# only features with zero variance. Raise it to filter near-zero-variance
# features as well.
cen_df01_s04_nzv_fit = VarianceThreshold(threshold=0.0).fit(cen_df02_s01)
cen_df01_s04_nzv_vc01 = cen_df01_s04_nzv_fit.transform(cen_df02_s01)
print(cen_df01_s04_nzv_vc01)
print(cen_df01_s04_nzv_vc01.shape)

# Get the names of the selected features (boolean mask over the input columns)
selected_features = cen_df02_s01.columns[cen_df01_s04_nzv_fit.get_support()]

# Rebuild a labelled DataFrame from the transformed array
cen_df01_s04_nzv_df01 = pd.DataFrame(cen_df01_s04_nzv_vc01,
                                     columns=selected_features)

print(cen_df01_s04_nzv_df01.head(5))
print(f'X NZV transformed matrix dimensions = {cen_df01_s04_nzv_df01.shape}')

# The count is computed first and printed from a single-line f-string; the old
# backslash continuation inside the string embedded a long run of spaces in
# the printed message.
cen_df01_s04_nzv_removed_count = cen_df02_s01.shape[1] - cen_df01_s04_nzv_df01.shape[1]
print(f'\n{cen_df01_s04_nzv_removed_count} zero-variance features were eliminated')


## Release Resources

In [None]:
%%html

<!-- Renders a notice plus a hidden button bound to the "kernelmenu:shutdown" command; the script below clicks it. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the first element with class "sm-command-button" (presumably the hidden button above — verify no earlier output uses that class).
// Any error (e.g. no such element) is deliberately swallowed.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

// Save a checkpoint and then delete the notebook's session through the global `Jupyter` object.
// NOTE(review): presumably targets front ends that expose `Jupyter.notebook` — confirm.
// Any error (including `Jupyter` being undefined) is deliberately swallowed.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}