# I94 Database - Exploration Notebook

In [None]:
# Do all imports and installs here
import pandas as pd
from pyspark.sql import SparkSession

# Glob matching everything under the output/fact directory.
path_immigration = "output/fact/*"

# Get the active Spark session, or create one, with Hive support enabled.
session_builder = SparkSession.builder.enableHiveSupport()
spark = session_builder.getOrCreate()

# Load the parquet files into a Spark DataFrame.
fact_reader = spark.read
df = fact_reader.parquet(path_immigration)

# Peek at the first row.
df.head()

### Step 1: Enter the parameters you want to analyze

In [None]:
# Filter parameters: edit these values, then run the query cell.

# Time period.
Year, Month = 2016, 4

# Traveller profile. MaxAge is an exclusive upper bound (the query uses age < MaxAge).
MaxAge, Gender = 19, "F"

# Origin country and destination state/city.
CountryOrigin = "MALAYSIA"
State, City = "FLORIDA", "Orlando"

In [1]:
# Aggregate flights and passengers for the slice selected by the filter
# parameters (Year, Month, MaxAge, Gender, CountryOrigin, State, City).
from pyspark.sql import functions as F

# Keep the temp view registered so ad-hoc spark.sql() queries on "df" still work.
df.createOrReplaceTempView("df")

group_columns = ["year", "month", "state", "city", "origin_country", "gender", "age"]

# Column expressions bind the parameters as typed literals. Formatting them
# straight into the SQL text left the string values unquoted (e.g. `gender = F`),
# which Spark resolves as a column reference instead of a string literal.
selection = (
    (F.col("year") == Year)
    & (F.col("month") == Month)
    & (F.col("age") < MaxAge)
    & (F.col("gender") == Gender)
    & (F.col("origin_country") == CountryOrigin)
    & (F.col("state") == State)
    & (F.col("city") == City)
)

# Output columns: the grouping columns followed by the three aggregates.
query = (
    df.filter(selection)
    .groupBy(*group_columns)
    .agg(
        F.sum("total_flights").alias("total_flights"),
        F.sum("total_people").alias("total_people"),
        (F.sum("total_people") / F.sum("total_flights")).alias("avg_per_flight"),
    )
)

# show() prints the table itself and returns None, so it is not wrapped in print().
query.sort(query.total_people.desc()).show(5, truncate=True)

pdf = query.toPandas()

# A bare `pdf.plot` only references the plotting accessor; it has to be called
# to draw anything. Skip the chart when the filters match no rows, because
# plotting an empty frame raises.
if pdf.empty:
    print("No rows match the selected filters.")
else:
    pdf.sort_values("age").plot(x="age", y="total_people", kind="bar")

NameError: name 'df' is not defined

In [None]:
from pyspark.sql import SparkSession

# Get or create a Hive-enabled session configured with the spark-sas7bdat
# package and the spark-packages repository.
spark = (
    SparkSession.builder
    .config("spark.jars.repositories", "https://repos.spark-packages.org/")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

# Read the SAS file through the spark-sas7bdat data source.
sas_reader = spark.read.format('com.github.saurfang.sas.spark')
df_spark = sas_reader.load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')


In [None]:
# write to parquet

# Load the SAS file into a Spark DataFrame.
sas_reader = spark.read.format('com.github.saurfang.sas.spark')
df_spark = sas_reader.load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')

# Parquet round-trip, currently disabled:
# df_spark.write.parquet("sas_data")
# df_spark=spark.read.parquet("sas_data")

# Peek at the first row.
df_spark.head()

### Step 2: Explore and Assess the Data
#### Explore the Data 
Identify data quality issues, like missing values, duplicate data, etc.

#### Cleaning Steps
Document steps necessary to clean the data

In [None]:
# Performing cleaning tasks here



#### Dimension Tables ####

# Clean Airport Lookup
# NOTE(review): `file_airports` and `output_data` are not defined in the visible
# cells; they must be set before this cell runs.

print("      1) Airport Codes Dimension Table ETL started...\n")

# Load raw Airport Code data (CSV with a header row)
df = spark.read.format("csv").option("header", "true").load(file_airports)

# Transform raw Airport Code dataset: keep only rows with a usable IATA code and
# expose that code as `i94port`.
df.createOrReplaceTempView("df_airport")
df_airport = spark.sql('''
        SELECT
            iata_code as i94port,
            municipality,
            coordinates
        FROM
            df_airport
        WHERE
            iata_code IS NOT NULL AND iata_code != ""
''')
# The two conditions are joined with AND: with OR, an empty-string code passed
# the filter because '' IS NOT NULL is true.

# Write cleaned Airport Codes data as parquet files under <output_data>dim/,
# replacing any existing output.
df_airport.write.parquet(output_data+"dim/",mode='overwrite')

print("         Airport Codes ETL complete!\n")

### Step 3: Define the Data Model
#### 3.1 Conceptual Data Model
Map out the conceptual data model and explain why you chose that model

#### 3.2 Mapping Out Data Pipelines
List the steps necessary to pipeline the data into the chosen data model

### Step 4: Run Pipelines to Model the Data 
#### 4.1 Create the data model
Build the data pipelines to create the data model.

In [None]:
# Write code here

#### 4.2 Data Quality Checks
Explain the data quality checks you'll perform to ensure the pipeline ran as expected. These could include:
 * Integrity constraints on the relational database (e.g., unique key, data type, etc.)
 * Unit tests for the scripts to ensure they are doing the right thing
 * Source/Count checks to ensure completeness
 
Run Quality Checks

In [None]:
# Perform quality checks here

#### 4.3 Data dictionary 
Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where it came from. You can include the data dictionary in the notebook or in a separate file.

### Step 5: Complete Project Write Up
* Clearly state the rationale for the choice of tools and technologies for the project.
* Propose how often the data should be updated and why.
* Write a description of how you would approach the problem differently under the following scenarios:
    * The data was increased by 100x.
    * The data populates a dashboard that must be updated on a daily basis by 7am every day.
    * The database needed to be accessed by 100+ people.