# Data Ingestion

## Copy CSV files to S3 (possibly not needed, since we are allowed to upload the files to S3 manually)

In [None]:
# Restore the prerequisite flags from IPython's %store (presumably written by
# earlier setup notebooks — confirm). The flags are only restored here, never
# checked, so this notebook assumes every prerequisite step already succeeded.
%store -r setup_instance_check_passed
%store -r setup_dependencies_passed
%store -r setup_s3_bucket_passed
%store -r setup_iam_roles_passed

In [1]:
# Load Libraries
import boto3
import sagemaker
import pandas as pd

# SageMaker-side handles: session, its default bucket, and the execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session serves both the region lookup and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name

# Account id of the caller, as reported by STS.
sts = boto3.client("sts")
account_id = sts.get_caller_identity().get("Account")

# Low-level SageMaker client bound to the session's region.
sm = boto_session.client(service_name="sagemaker", region_name=region)

In [None]:
# Copy Datasets 

In [None]:
# End of Copy CSV files to S3
# A bare %store (no arguments) lists the variables currently held in IPython's store.
%store

## Create Database and Tables
### Database

In [None]:
# Setup and check pre-requisites to create Database
ingest_create_athena_db_passed = False
%store -r s3_public_path_csv
%store -r s3_private_path_csv

!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

In [None]:
# Create Database

In [None]:
# End of Create Database
# Persist the database-creation flag so it can be restored later with `%store -r`.
# NOTE(review): the flag is initialised to False in the setup cell above and no
# visible cell sets it to True before it is stored here — confirm where it is updated.
%store ingest_create_athena_db_passed
# A bare %store lists everything currently in the store.
%store

### Tables

In [None]:
# Setup and check pre-requisites to create Tables
# The table-creation flag starts out False; it is persisted with %store at the
# end of the Create Database and Tables section.
ingest_create_athena_table_passed = False
# Restore the database-creation flag. It is only restored here, not checked.
%store -r ingest_create_athena_db_passed

In [None]:
# Create Twitter tables
#
# Both hashtag tables share one schema, so the DDL is written once as a
# template and rendered for each table name.
#
# Placeholders, in order: database name, table name, S3 location.
# Notes on the DDL:
#   * Athena DDL has no DATETIME type; date-time columns are declared TIMESTAMP.
#   * Without a ROW FORMAT clause the default field delimiter is not a comma,
#     so the clause is spelled out for the CSV files.
#   * NOTE(review): if the free-text columns (tweet, user_description, ...)
#     contain quoted commas or newlines, OpenCSVSerde is needed instead of a
#     plain delimited row format — confirm against the data.
#   * NOTE(review): tweet_id FLOAT / user_id INT may be too narrow for
#     Twitter identifiers — confirm against the data before widening.
TWEET_TABLE_DDL = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
  created_at TIMESTAMP,
  tweet_id FLOAT,
  tweet VARCHAR(250),
  likes INT,
  retweet_count INT,
  source VARCHAR(45),
  user_id INT,
  user_name VARCHAR(250),
  user_screen_name VARCHAR(45),
  user_description VARCHAR(250),
  user_join_date TIMESTAMP,
  user_followers_count INT,
  user_location VARCHAR(45),
  lat FLOAT,
  long FLOAT,
  city VARCHAR(45),
  country VARCHAR(45),
  continent VARCHAR(45),
  state VARCHAR(45),
  state_code VARCHAR(45),
  collected_at VARCHAR(45)
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')"""

# Table names are kept exactly as before (including the "hastag" spelling),
# because other code may already refer to them.
tweet_table_names = ["hastag_donaldtrump", "hastag_joebiden"]

# NOTE(review): `database_name` is not defined in any visible cell; it must be
# set (e.g. in the Create Database cell) before this cell runs.
# NOTE(review): both tables are pointed at the same S3 location; Athena reads
# every file under a table's LOCATION, so each table presumably needs its own
# prefix — confirm the bucket layout.
tweet_table_statements = {
    name: TWEET_TABLE_DDL.format(database_name, name, s3_private_path_csv)
    for name in tweet_table_names
}

# Keep the two names this cell has always exposed, now consistent with each
# other: `statement` is the DDL for the table named by `table_name_csv`.
# NOTE(review): nothing in this cell executes the statements.
table_name_csv = "hastag_joebiden"
statement = tweet_table_statements[table_name_csv]

In [None]:
# Create NYT tables
#
# One DDL template per table; placeholders, in order: database name, table
# name, S3 location.
# Notes on the DDL:
#   * Athena only accepts CREATE EXTERNAL TABLE for tables over existing S3
#     data; IF NOT EXISTS makes the cell safe to re-run.
#   * Column-level NULL modifiers are not part of this DDL and were removed.
#   * TEXT and DATETIME are not Athena DDL types; STRING and TIMESTAMP are used.
#   * Without a ROW FORMAT clause the default field delimiter is not a comma,
#     so the clause is spelled out for the CSV files.
#   * NOTE(review): if free-text columns (headline, abstract, commentBody, ...)
#     contain quoted commas or newlines, OpenCSVSerde is needed instead of a
#     plain delimited row format — confirm against the data.
#   * NOTE(review): TIMESTAMP columns assume the files hold date-time text in
#     a format Athena can parse — confirm against the data.
NYT_ARTICLE_DDL = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
  newsdesk VARCHAR(25),
  section VARCHAR(25),
  subsection VARCHAR(25),
  material VARCHAR(25),
  headline VARCHAR(150),
  abstract VARCHAR(1500),
  keywords VARCHAR(250),
  word_count INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')"""

NYT_COMMENT_DDL = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
  commentID INT,
  status VARCHAR(20),
  commentSequence INT,
  userID INT,
  userDisplayName VARCHAR(45),
  userLocation VARCHAR(45),
  userTitle VARCHAR(10),
  commentBody VARCHAR(500),
  createDate TIMESTAMP,
  updateDate TIMESTAMP,
  approveDate TIMESTAMP,
  recommendation INT,
  replyCount INT,
  editorsSelection STRING,
  parentID INT,
  parentUserDisplayName VARCHAR(45),
  depth INT,
  commentType STRING,
  trusted STRING,
  recommendedFlag STRING,
  permID INT,
  isAnonymous STRING,
  articleID VARCHAR(150)
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')"""

# Render both statements and keep them side by side, so the article DDL is no
# longer lost when the comment DDL is built.
# NOTE(review): `database_name` is not defined in any visible cell; it must be
# set (e.g. in the Create Database cell) before this cell runs.
# NOTE(review): both tables are pointed at the same S3 location; Athena reads
# every file under a table's LOCATION, so each table presumably needs its own
# prefix — confirm the bucket layout.
nyt_table_templates = {
    "nyt_article": NYT_ARTICLE_DDL,
    "nyt_comment": NYT_COMMENT_DDL,
}
nyt_table_statements = {
    name: template.format(database_name, name, s3_private_path_csv)
    for name, template in nyt_table_templates.items()
}

# Preserve the names this cell has always left behind: the last table and its DDL.
# NOTE(review): nothing in this cell executes the statements.
table_name_csv = "nyt_comment"
statement = nyt_table_statements[table_name_csv]

In [None]:
# End of Create Database and Tables
# Persist the table-creation flag so it can be restored later with `%store -r`.
# NOTE(review): the flag is initialised to False in the Tables setup cell and no
# visible cell sets it to True before it is stored here — confirm where it is updated.
%store ingest_create_athena_table_passed
# A bare %store lists everything currently in the store.
%store

## Query Data

In [None]:
# Setup and check pre-requisites to create Database
%store -r ingest_create_athena_table_passed
!pip install --disable-pip-version-check -q awswrangler==2.3.0
import awswrangler as wr

In [2]:
# Read in Datasets

In [3]:
# Dataset Info

# Data Exploration

In [4]:
# Dataset Descriptives

In [5]:
# Data Distributions

In [None]:
# Filtering Text by Location