# Register CSV Data With Athena


In [1]:
import boto3
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [2]:
ingest_create_athena_table_csv_passed = False


In [3]:
%store

Stored variables and their in-db values:
dataset_path                                          -> '/root/aai-540-homework/homework-2-1/data/AAI-540'
ingest_create_athena_db_mod2_passed                   -> True
ingest_create_athena_db_passed                        -> True
ingest_create_athena_table_csv_passed                 -> True
ingest_create_athena_table_parquet_passed             -> True
ingest_create_athena_table_tsv_passed                 -> True
s3_private_path_csv                                   -> 's3://sagemaker-us-east-1-904981812149/module2_cle
s3_private_path_tsv                                   -> 's3://sagemaker-us-east-1-904981812149/amazon-revi
s3_public_path_tsv                                    -> 's3://dsoaws/amazon-reviews-pds/tsv'
setup_dependencies_mod2_passed                        -> True
setup_dependencies_passed                             -> True
setup_s3_bucket_passed                                -> True


In [4]:
%store -r ingest_create_athena_db_mod2_passed


In [5]:
try:
    ingest_create_athena_db_mod2_passed
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

In [6]:
print(ingest_create_athena_db_mod2_passed)


True


In [7]:
if not ingest_create_athena_db_mod2_passed:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


In [8]:
%store -r s3_private_path_csv


In [9]:
try:
    s3_private_path_csv
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

In [10]:
print(s3_private_path_csv)


s3://sagemaker-us-east-1-904981812149/module2_cleaned_data/csv


In [11]:
import pandas as pd
data = pd.read_csv("s3://sagemaker-us-east-1-904981812149/module2_cleaned_data/csv/cleaned_dataset.csv")
data.head()

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,track_id,artists,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


# Import PyAthena


In [12]:
from pyathena import connect

In [13]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [14]:
# Set Athena parameters
database_name = "module2_aws"
table_name_csv = "music_csv2"

In [15]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)


In [16]:
# Create Statement
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
            track_id STRING,
            artists STRING,
            track_name STRING,
            popularity INT,
            duration_ms INT,
            explicit BOOLEAN,
            danceability FLOAT,
            energy FLOAT,
            key INT,
            loudness FLOAT,
            mode INT,
            speechiness FLOAT,
            acousticness FLOAT,
            instrumentalness FLOAT,
            liveness FLOAT,
            valence FLOAT,
            tempo FLOAT,
            time_signature INT,
            track_genre STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_private_path_csv
)

print(statement)


CREATE EXTERNAL TABLE IF NOT EXISTS module2_aws.music_csv2(
            track_id STRING,
            artists STRING,
            track_name STRING,
            popularity INT,
            duration_ms INT,
            explicit BOOLEAN,
            danceability FLOAT,
            energy FLOAT,
            key INT,
            loudness FLOAT,
            mode INT,
            speechiness FLOAT,
            acousticness FLOAT,
            instrumentalness FLOAT,
            liveness FLOAT,
            valence FLOAT,
            tempo FLOAT,
            time_signature INT,
            track_genre STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-904981812149/module2_cleaned_data/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [17]:
import pandas as pd

pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


### Verify The Table Has Been Created Succesfully


In [18]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,music_csv
1,music_csv1
2,music_csv2


In [19]:
if table_name_csv in df_show.values:
    ingest_create_athena_table_csv_passed = True
print(ingest_create_athena_table_csv_passed)

True


In [20]:
%store ingest_create_athena_table_csv_passed


Stored 'ingest_create_athena_table_csv_passed' (bool)


### Run A Sample Query


In [21]:
artists = "Gen Hoshino"

statement = """SELECT * FROM {}.{}
    WHERE artists = '{}' LIMIT 10""".format(
    database_name, table_name_csv, artists
)

print(statement)

SELECT * FROM module2_aws.music_csv2
    WHERE artists = 'Gen Hoshino' LIMIT 10


In [22]:
df = pd.read_sql(statement, conn)
df.head(5)

  df = pd.read_sql(statement, conn)


Unnamed: 0,track_id,artists,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4nmjL1mUKOAfAbo9QG9tSE,Gen Hoshino,Koi,53,251266,False,0.598,0.819,9,-2.619,1,0.0911,0.00081,0.0,0.218,0.878,158.115,4,acoustic
2,12qmPGMrOCogibc7qyxT9s,Gen Hoshino,I Wanna Be Your Ghost (feat. Ghosts),50,225533,False,0.829,0.751,11,-4.937,0,0.0645,0.0142,0.014,0.0414,0.96,129.993,4,acoustic
3,3dPpQeLTWjCjEbSevDMQfW,Gen Hoshino,FUSHIGI,49,290000,False,0.753,0.574,3,-4.57,1,0.0483,0.0512,4e-06,0.141,0.527,89.024,4,acoustic
4,2pcuXnZhTirLXsfXGVFTv2,Gen Hoshino,Doraemon,41,239933,False,0.64,0.547,4,-4.129,1,0.038,0.00676,4.9e-05,0.0585,0.95,159.992,4,acoustic


In [23]:
if not df.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


### Review the New Athena Table in the Glue Catalog


In [24]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={}#">AWS Glue Catalog</a></b>'.format(
            region
        )
    )
)

  from IPython.core.display import display, HTML


### Store Variables for the Next Notebooks


In [25]:
%store


Stored variables and their in-db values:
dataset_path                                          -> '/root/aai-540-homework/homework-2-1/data/AAI-540'
ingest_create_athena_db_mod2_passed                   -> True
ingest_create_athena_db_passed                        -> True
ingest_create_athena_table_csv_passed                 -> True
ingest_create_athena_table_parquet_passed             -> True
ingest_create_athena_table_tsv_passed                 -> True
s3_private_path_csv                                   -> 's3://sagemaker-us-east-1-904981812149/module2_cle
s3_private_path_tsv                                   -> 's3://sagemaker-us-east-1-904981812149/amazon-revi
s3_public_path_tsv                                    -> 's3://dsoaws/amazon-reviews-pds/tsv'
setup_dependencies_mod2_passed                        -> True
setup_dependencies_passed                             -> True
setup_s3_bucket_passed                                -> True


### Release Resources

In [26]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>