# ADS-508-01-SP23 Team 8: Final Project

# Setup Database and Athena Tables

Much of the code is modified from `Fregly, C., & Barth, A. (2021). Data science on AWS: Implementing end-to-end, continuous AI and machine learning pipelines. O’Reilly.`

## Install missing dependencies

[PyAthena](https://pypi.org/project/PyAthena/) is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.

In [None]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
!pip install missingno

## Globally import libraries

In [None]:
import boto3
from botocore.client import ClientError
from IPython.core.display import display, HTML
import pandas as pd
from pyathena import connect
import matplotlib.pyplot as plt
import missingno as msno
import sagemaker
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold

# Set Seaborn parameters
# The style is passed to sns.set(): assigning a string to `sns.set_style`
# would replace the seaborn function with that string instead of selecting a
# style. The rc entries below are applied on top of the chosen style.
sns.set(
    style="whitegrid",
    rc={
        "font.style": "normal",
        "axes.facecolor": "white",
        "grid.color": ".8",
        "grid.linestyle": "-",
        "figure.facecolor": "white",
        "figure.titlesize": 20,
        "text.color": "black",
        "xtick.color": "black",
        "ytick.color": "black",
        "axes.labelcolor": "black",
        "axes.grid": True,
        "axes.labelsize": 10,
        "xtick.labelsize": 10,
        "font.size": 10,
        "ytick.labelsize": 10,
    },
)

## Instantiate AWS SageMaker session

In [None]:
# One boto3 session supplies both the region name and the S3 client
session = boto3.session.Session()
region = session.region_name
s3 = session.client(service_name="s3", region_name=region)

# SageMaker session, its default bucket, and the execution role
sagemaker_session = sagemaker.Session()
def_bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Team bucket; the Athena staging directory is placed under it
bucket = 'sagemaker-us-east-ads508-sp23-t8'

In [None]:
# Show which buckets this notebook works with
print(f"Default bucket: {def_bucket}")
print(f"Public T8 bucket: {bucket}")

# Citation: OpenAI
s3_conn = boto3.resource('s3')
bucket_conn = s3_conn.Bucket(bucket)

# List bucket contents (prints the key of every object in the bucket)
for obj in bucket_conn.objects.all():
    print(obj.key)

In [None]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries.
# It lives under the team bucket, not the SageMaker default bucket.
s3_staging_dir = f"s3://{bucket}/athena/staging"
print(s3_staging_dir)

In [None]:
# Open the PyAthena connection; query output is staged under s3_staging_dir
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

In [None]:
# Athena database name interpolated into every SQL statement in this notebook
database_name = "ads508_t8"

## Explore DB tables

### `census`

In [None]:
# Table name interpolated into the census queries of this section
cen_tsv_tbl_name = 'census'

#### Explore via SQL SELECT statements

In [None]:
# Run query to review a sample of records: up to 11 census rows whose
# lower-cased borough equals the value below
cen_bourough01 = "bronx"

cen_select_borough_stmnt01 = f"""
    SELECT * FROM {database_name}.{cen_tsv_tbl_name}
    WHERE lower(borough) = '{cen_bourough01}'
    LIMIT 11
    """

# Display SQL statement
print(cen_select_borough_stmnt01)

# Run SQL statement against Athena table
cen_df01_s01 = pd.read_sql(cen_select_borough_stmnt01,
                           conn)
# Display results
cen_df01_s01.head(11)

#### Perform aggregated summaries

In [None]:
# Count the census rows whose `hispanic` value is NULL.
# The WHERE clause keeps only NULLs, so the result has at most one group;
# the count column has no alias in the SQL.
cen_select_hispanic_stmnt01 = f"""
    SELECT DISTINCT
        hispanic,
        count(*)
    FROM {database_name}.{cen_tsv_tbl_name}
    WHERE hispanic IS NULL
    GROUP BY hispanic
    LIMIT 10
    """

# Display SQL statement
print(cen_select_hispanic_stmnt01)

# Run SQL statement against Athena table
cen_df01_s02 = pd.read_sql(cen_select_hispanic_stmnt01,
                           conn)
# Display results
cen_df01_s02.head(11)

In [None]:
# Per-borough summary: row count, summed population, and population-weighted
# ratios. Each *_perc column is SUM(round(totalpop * col / 100)) / SUM(totalpop),
# so it is a ratio of sums rather than a value scaled to 0-100.
# NOTE(review): presumably the source columns are percentages of totalpop -- confirm.
cen_summ_borough_stmnt01 = f"""
    SELECT
        borough,
        COUNT(*) AS ctract_count,
        SUM(totalpop) AS bor_pop,
        SUM(round(totalpop*hispanic/100,0))/SUM(totalpop) AS hispanic_perc,
        SUM(round(totalpop*white/100,0))/SUM(totalpop) AS white_perc,
        SUM(round(totalpop*black/100,0))/SUM(totalpop) AS black_perc,
        SUM(round(totalpop*native/100,0))/SUM(totalpop) AS native_perc,
        SUM(round(totalpop*asian/100,0))/SUM(totalpop) AS asian_perc,
        SUM(round(totalpop*childpoverty/100,0))/SUM(totalpop) AS child_poverty_perc,
        SUM(round(totalpop*income,0))/SUM(totalpop) AS income_avg
    FROM {database_name}.{cen_tsv_tbl_name}
    GROUP BY borough
    LIMIT 100
    """

# Display SQL statement
print(cen_summ_borough_stmnt01)

# Run SQL statement against Athena table
cen_df01_s03 = pd.read_sql(cen_summ_borough_stmnt01,
                           conn)
# Display results
cen_df01_s03.head(11)

In [None]:
# Same summary as the per-borough query, but grouped by census tract.
# NOTE: this rebinds `cen_summ_borough_stmnt01`, and the aliases
# `ctract_count` / `bor_pop` are carried over even though the grouping key
# is `censustract`.
# NOTE(review): a tract whose SUM(totalpop) is 0 makes every ratio divide by
# zero -- confirm how the column types handle that.
cen_summ_borough_stmnt01 = f"""
    SELECT
        censustract,
        COUNT(*) AS ctract_count,
        SUM(totalpop) AS bor_pop,
        SUM(round(totalpop*hispanic/100,0))/SUM(totalpop) AS hispanic_perc,
        SUM(round(totalpop*white/100,0))/SUM(totalpop) AS white_perc,
        SUM(round(totalpop*black/100,0))/SUM(totalpop) AS black_perc,
        SUM(round(totalpop*native/100,0))/SUM(totalpop) AS native_perc,
        SUM(round(totalpop*asian/100,0))/SUM(totalpop) AS asian_perc,
        SUM(round(totalpop*childpoverty/100,0))/SUM(totalpop) AS child_poverty_perc,
        SUM(round(totalpop*income,0))/SUM(totalpop) AS income_avg
    FROM {database_name}.{cen_tsv_tbl_name}
    GROUP BY censustract
    LIMIT 100
    """

# Display SQL statement
print(cen_summ_borough_stmnt01)

# Run SQL statement against Athena table
cen_df01_s04 = pd.read_sql(cen_summ_borough_stmnt01,
                           conn)
# Display results
cen_df01_s04.head(11)

#### Load potential predictors and target for further exploration using pandas

In [None]:
# Load up to 5000 census rows (borough plus the numeric columns listed below)
# into pandas, keeping only rows where childpoverty is not NULL.
cen_box_stmnt01 = f"""
    SELECT
        borough,
        totalpop,
        men,
        women,
        hispanic,
        white,
        black,
        native,
        asian,
        citizen,
        income,
        poverty,
        childpoverty,
        professional,
        service,
        office,
        construction,
        production,
        drive,
        carpool,
        transit,
        walk,
        othertransp,
        workathome,
        meancommute,
        employed,
        privatework,
        publicwork,
        selfemployed,
        familywork,
        unemployment
    FROM {database_name}.{cen_tsv_tbl_name}
    WHERE childpoverty IS NOT NULL
    LIMIT 5000
    """

# Display SQL statement
print(cen_box_stmnt01)

# Run SQL statement against Athena table
cen_df01_s05 = pd.read_sql(cen_box_stmnt01,
                           conn)
# Display results
cen_df01_s05.head(11)

#### Display boxplots for select features

In [None]:
%matplotlib inline
sns.boxplot(x='borough', y='totalpop', data=cen_df01_s05).set(title='Population of Each Census Tract by Borough')

In [None]:
%matplotlib inline
sns.boxplot(x='borough', y='childpoverty', data=cen_df01_s05).set(title='Child Poverty (%) of Each Census Tract by Borough')

#### Create subsets of columns for various purposes

In [None]:
# Full list of numeric columns (everything loaded except `borough`)
cen_df01_s05_num_lst01 = [
    'totalpop', 'men', 'women',
    'hispanic', 'white', 'black', 'native', 'asian',
    'citizen', 'income', 'poverty', 'childpoverty',
    'professional', 'service', 'office', 'construction', 'production',
    'drive', 'carpool', 'transit', 'walk', 'othertransp', 'workathome',
    'meancommute', 'employed',
    'privatework', 'publicwork', 'selfemployed', 'familywork',
    'unemployment',
]

# Reduced list used for the pair plots
cen_df01_s05_num_lst02 = [
    'totalpop', 'women',
    'hispanic', 'black', 'native', 'asian',
    'citizen', 'income', 'poverty', 'childpoverty',
    'professional', 'service', 'office', 'construction',
    'meancommute', 'employed',
    'unemployment',
]

# Column subsets of the loaded census frame
cen_df02_s01 = cen_df01_s05.loc[:, cen_df01_s05_num_lst01]
cen_df03_s01 = cen_df01_s05.loc[:, cen_df01_s05_num_lst02]

display(cen_df02_s01.head())

#### Scatterplots of numerical features

In [None]:
# Pairwise plots for the reduced census feature list
sns.pairplot(data=cen_df03_s01)

#### Examine features with missing values

In [None]:
# Visualize missing values in each column of the loaded census frame
msno.matrix(cen_df01_s05)

In [None]:
# Identify features whose null count exceeds a cutoff of 5% of the rows
# (the `.05` factor below). Nothing is dropped here; the cell only builds and
# prints the list of candidate columns.
# NOTE(review): this cell was previously labelled "15% of total N" -- confirm
# which cutoff is intended.
cen_df01_s05_null_summ01 = cen_df01_s05.isnull().sum().to_frame(name='null_count')

# Columns with at least one null, largest count first
cen_df01_s05_null_summ02 = cen_df01_s05_null_summ01[
    cen_df01_s05_null_summ01['null_count'] != 0
].sort_values(by='null_count', ascending=False)
cen_df01_s05_null_summ03 = cen_df01_s05_null_summ02.reset_index()
print(cen_df01_s05_null_summ03)

# Columns above the cutoff
cen_df01_s05_null_summ04 = cen_df01_s05_null_summ03[
    cen_df01_s05_null_summ03['null_count'] > (len(cen_df01_s05)*.05)
]
print('\n', cen_df01_s05_null_summ04)

# Names of the columns above the cutoff (candidates for removal)
cen_df01_s05_null_summ04_remove_lst01 = cen_df01_s05_null_summ04['index'].tolist()
print('\n', cen_df01_s05_null_summ04_remove_lst01)

#### Examine features with near zero variances

In [None]:
# Screen the numeric census features with scikit-learn's VarianceThreshold.
# NOTE(review): no threshold argument is passed, so the library default
# applies -- confirm that it matches the "near-zero variance" intent.
cen_df02_s01_nzv_fit = VarianceThreshold()
cen_df02_s01_nzv_vc01 = cen_df02_s01_nzv_fit.fit_transform(cen_df02_s01)

# Names of the features that the selector kept
cen_df02_s01_nzv_fit_select_features = cen_df02_s01.columns[
    cen_df02_s01_nzv_fit.get_support()
]

# Rebuild a labelled frame from the transformed array
cen_df02_s01_nzv_df01 = pd.DataFrame(
    data=cen_df02_s01_nzv_vc01,
    columns=cen_df02_s01_nzv_fit_select_features,
)

display(cen_df02_s01_nzv_df01.head())
print(f'NZV transformed matrix dimensions = {cen_df02_s01_nzv_df01.shape}')

print(f'\n{cen_df02_s01.shape[1] - cen_df02_s01_nzv_df01.shape[1]} near zero variance features were eliminated')

### `crime_pqt`

In [None]:
# Table name interpolated into the crime queries of this section
cri_pqt_tbl_name = 'crime_pqt'

#### Explore via SQL SELECT statements

In [None]:
# Run query to review a sample of records: up to 11 crime rows matching the
# law category and borough below (both compared in lower case).
# `cri_law_cat_cd01` is reused by the later crime queries.
cri_law_cat_cd01 = "misdemeanor"
cri_bourough01 = "bronx"

cri_select_borough_stmnt01 = f"""
    SELECT * FROM {database_name}.{cri_pqt_tbl_name}
    WHERE lower(law_cat_cd) = '{cri_law_cat_cd01}' AND lower(borough) = '{cri_bourough01}'
    LIMIT 11
    """

# Display SQL statement
print(cri_select_borough_stmnt01)

# Run SQL statement against Athena table
cri_df01_s01 = pd.read_sql(cri_select_borough_stmnt01,
                           conn)
# Display results (only the first 5 of the fetched rows)
cri_df01_s01.head(5)

#### Perform aggregated summaries

In [None]:
# Count misdemeanor rows per offense description, most frequent first
cri_select_ofns_desc_stmnt01 = f"""
    SELECT DISTINCT
        ofns_desc,
        count(*) AS misdemeanor_offense_count
    FROM {database_name}.{cri_pqt_tbl_name}
    WHERE lower(law_cat_cd) = '{cri_law_cat_cd01}' 
    GROUP BY ofns_desc
    ORDER BY misdemeanor_offense_count DESC
    LIMIT 1000
    """

# Display SQL statement
print(cri_select_ofns_desc_stmnt01)

# Run SQL statement against Athena table
cri_df01_s02 = pd.read_sql(cri_select_ofns_desc_stmnt01,
                           conn)
# Display results (top 31 offense descriptions)
cri_df01_s02.head(31)

In [None]:
# Count crime rows per (law category, borough) pair, largest count first
cri_summ_borough_stmnt01 = f"""
    SELECT
        law_cat_cd,
        borough,
        COUNT(*) AS crime_count
    FROM {database_name}.{cri_pqt_tbl_name}
    GROUP BY law_cat_cd, borough
    ORDER BY crime_count DESC
    LIMIT 100
    """

# Display SQL statement
print(cri_summ_borough_stmnt01)

# Run SQL statement against Athena table
cri_df01_s03 = pd.read_sql(cri_summ_borough_stmnt01,
                           conn)
# Display results
cri_df01_s03.head(47)

In [None]:
# Daily misdemeanor counts, returned in chronological order.
# Rows with an empty complaint date are excluded before parsing.
# ORDER BY uses the parsed date (`cmplnt_fr_date`): sorting the raw
# 'MM/DD/YYYY' text would order rows by month first, not by calendar date,
# and the LIMIT would then keep an arbitrary slice of the timeline.
cri_date_stmnt01 = f"""
    SELECT
        cmplnt_fr_dt,
        date_parse(cmplnt_fr_dt, '%m/%d/%Y') AS cmplnt_fr_date,
        count(*) AS daily_misdemeanor_counts
    FROM {database_name}.{cri_pqt_tbl_name}
    WHERE lower(law_cat_cd) = '{cri_law_cat_cd01}'
        AND cmplnt_fr_dt <> ''
    GROUP BY cmplnt_fr_dt
    ORDER BY cmplnt_fr_date
    LIMIT 10000
    """

# Display SQL statement
print(cri_date_stmnt01)

# Run SQL statement against Athena table
cri_df01_s04 = pd.read_sql(cri_date_stmnt01,
                           conn)
# Display results
print(cri_df01_s04.shape)
display(cri_df01_s04.head(11))

#### Load potential predictors and target for further exploration using pandas

In [None]:
# Load the misdemeanor rows (selected columns) into pandas.
# The daily-count query filters out rows whose cmplnt_fr_dt is '', so such
# rows exist; date_parse is wrapped in try() so an unparseable or empty date
# becomes NULL instead of failing the whole query.
# NOTE(review): unlike the other "load predictors" queries there is no LIMIT,
# so every misdemeanor row is pulled into memory -- confirm this is intended.
cri_box_stmnt01 = f"""
    SELECT
        try(date_parse(cmplnt_fr_dt, '%m/%d/%Y')) AS cmplnt_fr_date,
        ky_cd,
        ofns_desc,
        pd_cd,
        pd_desc,
        crm_atpt_cptd_cd,
        loc_of_occur_desc,
        prem_typ_desc,
        jurisdiction_code,
        parks_nm,
        susp_age_group,
        susp_race,
        susp_sex,
        transit_district,
        latitude,
        longitude,
        vic_age_group,
        vic_race,
        vic_sex,
        law_cat_cd,
        borough
    FROM {database_name}.{cri_pqt_tbl_name}
    WHERE lower(law_cat_cd) = '{cri_law_cat_cd01}'
    """

# Display SQL statement
print(cri_box_stmnt01)

# Run SQL statement against Athena table
cri_df01_s05 = pd.read_sql(cri_box_stmnt01,
                           conn)
# Display results
print(cri_df01_s05.shape)
display(cri_df01_s05.head(11))

#### Create subsets of columns for various purposes

In [None]:
# Column lists for the crime frame. Both are empty, so the two subsets below
# are frames with no columns.
cri_df01_s05_num_lst01 = []

cri_df01_s05_num_lst02 = []

cri_df02_s01 = cri_df01_s05[cri_df01_s05_num_lst01]
cri_df03_s01 = cri_df01_s05[cri_df01_s05_num_lst02]

display(cri_df02_s01.head(11))

#### Examine features with missing values

In [None]:
# Visualize missing values in each column of the loaded crime frame
msno.matrix(cri_df01_s05)

In [None]:
# Identify features whose null count exceeds a cutoff of 5% of the rows
# (the `.05` factor below). Nothing is dropped here; the cell only builds and
# prints the list of candidate columns.
# NOTE(review): this cell was previously labelled "15% of total N" -- confirm
# which cutoff is intended.
cri_df01_s05_null_summ01 = cri_df01_s05.isnull().sum().to_frame(name='null_count')

# Columns with at least one null, largest count first
cri_df01_s05_null_summ02 = cri_df01_s05_null_summ01[
    cri_df01_s05_null_summ01['null_count'] != 0
].sort_values(by='null_count', ascending=False)
cri_df01_s05_null_summ03 = cri_df01_s05_null_summ02.reset_index()
print(cri_df01_s05_null_summ03)

# Columns above the cutoff
cri_df01_s05_null_summ04 = cri_df01_s05_null_summ03[
    cri_df01_s05_null_summ03['null_count'] > (len(cri_df01_s05)*.05)
]
print('\n', cri_df01_s05_null_summ04)

# Names of the columns above the cutoff (candidates for removal)
cri_df01_s05_null_summ04_remove_lst01 = cri_df01_s05_null_summ04['index'].tolist()
print('\n', cri_df01_s05_null_summ04_remove_lst01)

#### Display time plots for select features

In [None]:
%matplotlib inline
fig = plt.gcf()
fig.set_size_inches(12, 5)

fig.suptitle("Misdemeanor Event Counts Over Time")

ax = plt.gca()
# ax = plt.gca().set_xticks(df['year'])
ax.locator_params(integer=True)
ax.set_xticks(cri_df01_s04["cmplnt_fr_date"].unique())

cri_df01_s04.plot(kind="line",
        x="cmplnt_fr_date",
        y='daily_misdemeanor_counts',
        color="red",
        ax=ax)

# plt.xticks(range(1995, 2016, 1))
# plt.yticks(range(0,6,1))
plt.xlabel("Dates")
plt.ylabel("Crime Counts")
plt.xticks(rotation=45)

# fig.savefig('average-rating.png', dpi=300)
plt.show()

### `evictions`

In [None]:
# Table name interpolated into the eviction queries of this section
evi_tsv_tbl_name = 'evictions'

#### Explore via SQL SELECT statements

In [None]:
# Run query to review a sample of records: up to 11 eviction rows whose
# lower-cased borough equals the value below
evi_bourough01 = "bronx"

evi_select_borough_stmnt01 = f"""
    SELECT * FROM {database_name}.{evi_tsv_tbl_name}
    WHERE lower(borough) = '{evi_bourough01}'
    LIMIT 11
    """

# Display SQL statement
print(evi_select_borough_stmnt01)

# Run SQL statement against Athena table
evi_df01_s01 = pd.read_sql(evi_select_borough_stmnt01,
                           conn)
# Display results
evi_df01_s01.head(11)

#### Perform aggregated summaries

In [None]:
# Count eviction rows per postcode (the count column has no alias in the SQL)
evi_select_eviction_postcode_stmnt01 = f"""
    SELECT DISTINCT
        eviction_postcode,
        count(*)
    FROM {database_name}.{evi_tsv_tbl_name}
    GROUP BY eviction_postcode
    LIMIT 1000
    """

# Display SQL statement
print(evi_select_eviction_postcode_stmnt01)

# Run SQL statement against Athena table
evi_df01_s02 = pd.read_sql(evi_select_eviction_postcode_stmnt01,
                           conn)
# Display results
evi_df01_s02.head(11)

In [None]:
# Count eviction rows per borough
evi_summ_borough_stmnt01 = f"""
    SELECT
        borough,
        COUNT(*) AS evictions_count
    FROM {database_name}.{evi_tsv_tbl_name}
    GROUP BY borough
    LIMIT 100
    """

# Display SQL statement
print(evi_summ_borough_stmnt01)

# Run SQL statement against Athena table
evi_df01_s03 = pd.read_sql(evi_summ_borough_stmnt01,
                           conn)
# Display results
evi_df01_s03.head(11)

In [None]:
# Count eviction rows per (borough, census tract) pair.
# NOTE: this rebinds `evi_summ_borough_stmnt01` from the per-borough query.
evi_summ_borough_stmnt01 = f"""
    SELECT
        borough,
        census_tract,
        COUNT(*) AS ctract_count
    FROM {database_name}.{evi_tsv_tbl_name}
    GROUP BY borough, census_tract
    LIMIT 100
    """

# Display SQL statement
print(evi_summ_borough_stmnt01)

# Run SQL statement against Athena table
evi_df01_s04 = pd.read_sql(evi_summ_borough_stmnt01,
                           conn)
# Display results
evi_df01_s04.head(11)

In [None]:
# Daily eviction counts, returned in chronological order.
# - Reads the evictions table (the crime table has no part in this query).
# - The raw text column is exposed as `executed_date_raw` so that the parsed
#   date is the only result column named `executed_date`; two columns with
#   the same name would make the pandas column lookup ambiguous.
# - GROUP BY refers to the raw table column; ORDER BY 2 sorts by the parsed
#   date (second select item) rather than by the 'MM/DD/YYYY' text.
evi_date_stmnt01 = f"""
    SELECT
        executed_date AS executed_date_raw,
        date_parse(executed_date, '%m/%d/%Y') AS executed_date,
        count(*) AS daily_eviction_counts
    FROM {database_name}.{evi_tsv_tbl_name}
    WHERE executed_date <> ''
    GROUP BY executed_date
    ORDER BY 2
    LIMIT 10000
    """

# Display SQL statement
print(evi_date_stmnt01)

# Run SQL statement against Athena table
evi_df01_s06 = pd.read_sql(evi_date_stmnt01,
                           conn)
# Display results
print(evi_df01_s06.shape)
display(evi_df01_s06.head(11))

#### Load potential predictors and target for further exploration using pandas

In [None]:
# Load up to 5000 eviction rows (selected columns) into pandas
evi_box_stmnt01 = f"""
    SELECT
        court_index_number,
        docket_number,
        eviction_address,
        eviction_apartment_number,
        executed_date,
        marshal_first_name,
        marshal_last_name,
        residential_or_commercial,
        borough,
        eviction_postcode,
        ejectment,
        eviction_or_legal_possession,
        latitude,
        longitude,
        census_tract
    FROM {database_name}.{evi_tsv_tbl_name}
    LIMIT 5000
    """

# Display SQL statement
print(evi_box_stmnt01)

# Run SQL statement against Athena table
evi_df01_s05 = pd.read_sql(evi_box_stmnt01,
                           conn)
# Display results
evi_df01_s05.head(11)

#### Create subsets of columns for various purposes

In [None]:
# Column lists for the evictions frame. Both are empty, so the two subsets
# below are frames with no columns.
evi_df01_s05_num_lst01 = []

evi_df01_s05_num_lst02 = []

evi_df02_s01 = evi_df01_s05[evi_df01_s05_num_lst01]
evi_df03_s01 = evi_df01_s05[evi_df01_s05_num_lst02]

display(evi_df02_s01.head(5))

#### Examine features with missing values

In [None]:
# Visualize missing values in each column of the loaded evictions frame
msno.matrix(evi_df01_s05)

In [None]:
# Identify features whose null count exceeds a cutoff of 5% of the rows
# (the `.05` factor below). Nothing is dropped here; the cell only builds and
# prints the list of candidate columns.
# NOTE(review): this cell was previously labelled "15% of total N" -- confirm
# which cutoff is intended.
evi_df01_s05_null_summ01 = evi_df01_s05.isnull().sum().to_frame(name='null_count')

# Columns with at least one null, largest count first
evi_df01_s05_null_summ02 = evi_df01_s05_null_summ01[
    evi_df01_s05_null_summ01['null_count'] != 0
].sort_values(by='null_count', ascending=False)
evi_df01_s05_null_summ03 = evi_df01_s05_null_summ02.reset_index()
print(evi_df01_s05_null_summ03)

# Columns above the cutoff
evi_df01_s05_null_summ04 = evi_df01_s05_null_summ03[
    evi_df01_s05_null_summ03['null_count'] > (len(evi_df01_s05)*.05)
]
print('\n', evi_df01_s05_null_summ04)

# Names of the columns above the cutoff (candidates for removal)
evi_df01_s05_null_summ04_remove_lst01 = evi_df01_s05_null_summ04['index'].tolist()
print('\n', evi_df01_s05_null_summ04_remove_lst01)

#### Display time plots for select features

In [None]:
%matplotlib inline
fig = plt.gcf()
fig.set_size_inches(12, 5)

fig.suptitle("Eviction Counts Over Time")

ax = plt.gca()
# ax = plt.gca().set_xticks(df['year'])
ax.locator_params(integer=True)
ax.set_xticks(evi_df01_s06["executed_date"].unique())

evi_df01_s06.plot(kind="line",
        x="executed_date",
        y='daily_eviction_counts',
        color="red",
        ax=ax)

# plt.xticks(range(1995, 2016, 1))
# plt.yticks(range(0,6,1))
plt.xlabel("Dates")
plt.ylabel("Eviction Counts")
plt.xticks(rotation=45)

# fig.savefig('average-rating.png', dpi=300)
plt.show()

### `grad_outcomes`

In [None]:
# Table name interpolated into the graduation-outcome queries of this section
grd_tsv_tbl_name = 'grad_outcomes'

#### Explore via SQL SELECT statements

In [None]:
# Run query to review a sample of records: up to 1000 rows whose
# total_grads_n differs from the marker value below.
# NOTE(review): presumably "s" marks a suppressed value -- confirm against the
# dataset documentation.
grd_total_grads_n01 = "s"

grd_select_borough_stmnt01 = f"""
    SELECT * FROM {database_name}.{grd_tsv_tbl_name}
    WHERE total_grads_n <> '{grd_total_grads_n01}'
    LIMIT 1000
    """

# Display SQL statement
print(grd_select_borough_stmnt01)

# Run SQL statement against Athena table
grd_df01_s01 = pd.read_sql(grd_select_borough_stmnt01,
                           conn)
# Display results
grd_df01_s01.head(11)

#### Perform aggregated summaries

In [None]:
# Count rows per `demographic` value (the count column has no alias in the SQL).
# The variable name says "hispanic", but the query groups by `demographic`.
grd_select_hispanic_stmnt01 = f"""
    SELECT DISTINCT
        demographic,
        count(*)
    FROM {database_name}.{grd_tsv_tbl_name}
    GROUP BY demographic
    LIMIT 100
    """

# Display SQL statement
print(grd_select_hispanic_stmnt01)

# Run SQL statement against Athena table
grd_df01_s02 = pd.read_sql(grd_select_hispanic_stmnt01,
                           conn)
# Display results
grd_df01_s02.head(11)

#### Load potential predictors and target for further exploration using pandas

In [None]:
# Load up to 50000 graduation-outcome rows into pandas, casting the count
# columns to DOUBLE. Rows whose total_grads_n equals the marker value in
# `grd_total_grads_n01` are excluded before casting.
# The select list must not end with a comma: a trailing comma before FROM is
# a SQL syntax error.
# NOTE(review): CAST fails the query on any other non-numeric text in these
# columns -- confirm the filter covers every such row.
grd_box_stmnt01 = f"""
    SELECT
        demographic,
        dbn,
        CAST(cohort AS DOUBLE) AS cohort,
        CAST(total_cohort AS DOUBLE) AS total_cohort,
        CAST(total_grads_n AS DOUBLE) AS total_grads_n,
        CAST(total_regents_n AS DOUBLE) AS total_regents_n,
        CAST(advanced_regents_n AS DOUBLE) AS advanced_regents_n,
        CAST(regents_wo_advanced_n AS DOUBLE) AS regents_wo_advanced_n,
        CAST(local_n AS DOUBLE) AS local_n,
        CAST(still_enrolled_n AS DOUBLE) AS still_enrolled_n,
        CAST(dropped_out_n AS DOUBLE) AS dropped_out_n
    FROM {database_name}.{grd_tsv_tbl_name}
    WHERE total_grads_n <> '{grd_total_grads_n01}'
    LIMIT 50000
    """

# Display SQL statement
print(grd_box_stmnt01)

# Run SQL statement against Athena table
grd_df01_s05 = pd.read_sql(grd_box_stmnt01,
                           conn)
# Display results
grd_df01_s05.head(11)

#### Display boxplots for select features

In [None]:
%matplotlib inline
sns.boxplot(x='cohort', y='total_cohort', data=grd_df01_s05).set(title='Total Cohorts by Cohort Year')

In [None]:
%matplotlib inline
sns.boxplot(x='cohort', y='total_grads_n', data=grd_df01_s05).set(title='Total Grads by Cohort Year')

In [None]:
%matplotlib inline
sns.boxplot(x='cohort', y='dropped_out_n', data=grd_df01_s05).set(title='Total Dropped Out by Cohort Year')

#### Create subsets of columns for various purposes

In [None]:
# Numeric columns of the graduation-outcome frame
grd_df01_s05_num_lst01 = [
    'cohort', 'total_cohort', 'total_grads_n',
    'total_regents_n', 'advanced_regents_n', 'regents_wo_advanced_n',
    'local_n', 'still_enrolled_n', 'dropped_out_n',
]

# The second list holds the same columns (kept as a separate list object)
grd_df01_s05_num_lst02 = list(grd_df01_s05_num_lst01)

# Column subsets of the loaded frame
grd_df02_s01 = grd_df01_s05.loc[:, grd_df01_s05_num_lst01]
grd_df03_s01 = grd_df01_s05.loc[:, grd_df01_s05_num_lst02]

display(grd_df02_s01.head())

#### Scatterplots of numerical features

In [None]:
# Pairwise plots for the graduation-outcome numeric columns
sns.pairplot(data=grd_df03_s01)

#### Examine features with missing values

In [None]:
# Visualize missing values in each column of the loaded graduation-outcome frame
msno.matrix(grd_df01_s05)

In [None]:
# Identify features whose null count exceeds a cutoff of 5% of the rows
# (the `.05` factor below). Nothing is dropped here; the cell only builds and
# prints the list of candidate columns.
# NOTE(review): this cell was previously labelled "15% of total N" -- confirm
# which cutoff is intended.
grd_df01_s05_null_summ01 = grd_df01_s05.isnull().sum().to_frame(name='null_count')

# Columns with at least one null, largest count first
grd_df01_s05_null_summ02 = grd_df01_s05_null_summ01[
    grd_df01_s05_null_summ01['null_count'] != 0
].sort_values(by='null_count', ascending=False)
grd_df01_s05_null_summ03 = grd_df01_s05_null_summ02.reset_index()
print(grd_df01_s05_null_summ03)

# Columns above the cutoff
grd_df01_s05_null_summ04 = grd_df01_s05_null_summ03[
    grd_df01_s05_null_summ03['null_count'] > (len(grd_df01_s05)*.05)
]
print('\n', grd_df01_s05_null_summ04)

# Names of the columns above the cutoff (candidates for removal)
grd_df01_s05_null_summ04_remove_lst01 = grd_df01_s05_null_summ04['index'].tolist()
print('\n', grd_df01_s05_null_summ04_remove_lst01)

#### Examine features with near zero variances

In [None]:
# Screen the numeric graduation-outcome features with scikit-learn's
# VarianceThreshold.
# NOTE(review): no threshold argument is passed, so the library default
# applies -- confirm that it matches the "near-zero variance" intent.
grd_df02_s01_nzv_fit = VarianceThreshold()
grd_df02_s01_nzv_vc01 = grd_df02_s01_nzv_fit.fit_transform(grd_df02_s01)

# Names of the features that the selector kept
grd_df02_s01_nzv_fit_select_features = grd_df02_s01.columns[
    grd_df02_s01_nzv_fit.get_support()
]

# Rebuild a labelled frame from the transformed array
grd_df02_s01_nzv_df01 = pd.DataFrame(
    data=grd_df02_s01_nzv_vc01,
    columns=grd_df02_s01_nzv_fit_select_features,
)

display(grd_df02_s01_nzv_df01.head())
print(f'NZV transformed matrix dimensions = {grd_df02_s01_nzv_df01.shape}')

print(f'\n{grd_df02_s01.shape[1] - grd_df02_s01_nzv_df01.shape[1]} near zero variance features were eliminated')

### `jobs`

In [None]:
# Table name interpolated into the jobs queries of this section
job_tsv_tbl_name = 'jobs'

#### Explore via SQL SELECT statements

In [None]:
# Run query to review a sample of records: up to 100 job rows whose
# lower-cased agency contains the search term below.
job_agency01 = "housing"

# The LIKE pattern is built from `job_agency01`, the search term defined
# above; no other search variable exists in this cell.
job_select_borough_stmnt01 = f"""
    SELECT * FROM {database_name}.{job_tsv_tbl_name}
    WHERE lower(agency) LIKE '%{job_agency01}%'
    LIMIT 100
    """

# Display SQL statement
print(job_select_borough_stmnt01)

# Run SQL statement against Athena table
job_df01_s01 = pd.read_sql(job_select_borough_stmnt01,
                           conn)
# Display results
job_df01_s01.head(11)

#### Perform aggregated summaries

In [2]:
# Count the job rows whose `job_category` is NULL.
# The WHERE clause keeps only NULLs, so the result has at most one group;
# the count column has no alias in the SQL.
job_select_job_category_stmnt01 = f"""
    SELECT DISTINCT
        job_category,
        count(*)
    FROM {database_name}.{job_tsv_tbl_name}
    WHERE job_category IS NULL
    GROUP BY job_category
    LIMIT 100
    """

# Display SQL statement
print(job_select_job_category_stmnt01)

# Run SQL statement against Athena table
job_df01_s02 = pd.read_sql(job_select_job_category_stmnt01,
                           conn)
# Display results
print(job_df01_s02.shape)
display(job_df01_s02.head(11))

SyntaxError: invalid syntax (<ipython-input-2-abcaf7b1e819>, line 19)

In [None]:
# Count job rows per job_id (the variable name says "borough", but the
# grouping key is `job_id`)
job_summ_borough_stmnt01 = f"""
    SELECT
        job_id,
        COUNT(*) AS jobs_count
    FROM {database_name}.{job_tsv_tbl_name}
    GROUP BY job_id
    LIMIT 100
    """

# Display SQL statement
print(job_summ_borough_stmnt01)

# Run SQL statement against Athena table
job_df01_s03 = pd.read_sql(job_summ_borough_stmnt01,
                           conn)
# Display results
job_df01_s03.head(11)

#### Load potential predictors and target for further exploration using pandas

In [None]:
# Load up to 5000 job rows (selected columns) into pandas; the two salary
# range columns are cast to INT.
# NOTE(review): CAST ... AS INT fails the query if a salary value is not an
# integer-formatted string -- confirm the column contents.
job_box_stmnt01 = f"""
    SELECT
        agency,
        posting_type,
        num_of_positions,
        business_title,
        civil_service_title,
        title_classification,
        title_code_no,
        level,
        job_category,
        fulltime_or_parttime_indicator,
        career_level,
        CAST(salary_range_from AS INT) AS salary_range_from,
        CAST(salary_range_to AS INT) AS salary_range_to,
        salary_frequency,
        work_location,
        division_or_work_unit,
        job_description,
        minimum_qual_requirements,
        preferred_skills,
        additional_information,
        to_apply,
        hours_or_shift,
        work_location_1,
        recruitment_contact,
        residency_requirement,
        posting_date,
        post_until,
        posting_updated,
        process_date
    FROM {database_name}.{job_tsv_tbl_name}
    LIMIT 5000
    """

# Display SQL statement
print(job_box_stmnt01)

# Run SQL statement against Athena table
job_df01_s05 = pd.read_sql(job_box_stmnt01,
                           conn)
# Display results
job_df01_s05.head(11)

#### Display boxplots for select features

In [None]:
%matplotlib inline
sns.boxplot(x='job_category', y='salary_range_from', data=job_df01_s05).set(title='Range Floor by Job Category')

In [None]:
%matplotlib inline
sns.boxplot(x='job_category', y='salary_range_to', data=job_df01_s05).set(title='Range Ceiling by Job Category')

#### Create subsets of columns for various purposes

In [None]:
# Numeric columns of the jobs frame
job_df01_s05_num_lst01 = ['salary_range_from', 'salary_range_to']

# The second list holds the same columns (kept as a separate list object)
job_df01_s05_num_lst02 = list(job_df01_s05_num_lst01)

# Column subsets of the loaded frame
job_df02_s01 = job_df01_s05.loc[:, job_df01_s05_num_lst01]
job_df03_s01 = job_df01_s05.loc[:, job_df01_s05_num_lst02]

display(job_df02_s01.head())

#### Scatterplots of numerical features

In [None]:
# Pairwise plots for the two salary range columns
sns.pairplot(data=job_df03_s01)

#### Examine features with missing values

In [None]:
# Visualize missing values in each column of the loaded jobs frame
msno.matrix(job_df01_s05)

In [None]:
# Identify features whose null count exceeds a cutoff of 5% of the rows
# (the `.05` factor below). Nothing is dropped here; the cell only builds and
# prints the list of candidate columns.
# NOTE(review): this cell was previously labelled "15% of total N" -- confirm
# which cutoff is intended.
job_df01_s05_null_summ01 = job_df01_s05.isnull().sum().to_frame(name='null_count')

# Columns with at least one null, largest count first
job_df01_s05_null_summ02 = job_df01_s05_null_summ01[
    job_df01_s05_null_summ01['null_count'] != 0
].sort_values(by='null_count', ascending=False)
job_df01_s05_null_summ03 = job_df01_s05_null_summ02.reset_index()
print(job_df01_s05_null_summ03)

# Columns above the cutoff
job_df01_s05_null_summ04 = job_df01_s05_null_summ03[
    job_df01_s05_null_summ03['null_count'] > (len(job_df01_s05)*.05)
]
print('\n', job_df01_s05_null_summ04)

# Names of the columns above the cutoff (candidates for removal)
job_df01_s05_null_summ04_remove_lst01 = job_df01_s05_null_summ04['index'].tolist()
print('\n', job_df01_s05_null_summ04_remove_lst01)

#### Examine features with near zero variances

In [None]:
# Screen the two salary range columns with scikit-learn's VarianceThreshold.
# NOTE(review): no threshold argument is passed, so the library default
# applies -- confirm that it matches the "near-zero variance" intent.
job_df02_s01_nzv_fit = VarianceThreshold()
job_df02_s01_nzv_vc01 = job_df02_s01_nzv_fit.fit_transform(job_df02_s01)

# Names of the features that the selector kept
job_df02_s01_nzv_fit_select_features = job_df02_s01.columns[
    job_df02_s01_nzv_fit.get_support()
]

# Rebuild a labelled frame from the transformed array
job_df02_s01_nzv_df01 = pd.DataFrame(
    data=job_df02_s01_nzv_vc01,
    columns=job_df02_s01_nzv_fit_select_features,
)

display(job_df02_s01_nzv_df01.head())
print(f'NZV transformed matrix dimensions = {job_df02_s01_nzv_df01.shape}')

print(f'\n{job_df02_s01.shape[1] - job_df02_s01_nzv_df01.shape[1]} near zero variance features were eliminated')

## Release Resources

%%html

<!-- Shows a notice, then programmatically clicks a hidden button wired to the
     "kernelmenu:shutdown" command. Any error (for example, no such button in
     the page) is caught and ignored. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

%%javascript

// Save a checkpoint and then delete the notebook session through the
// `Jupyter.notebook` object. If that object is unavailable, the error is
// caught and ignored.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}