# Data Exploration for Quality Issues Detection

In [1]:
import pandas as pd
import os
import sys
import configparser
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, isnan, when, count, col
from pyspark.sql.functions import date_format, monotonically_increasing_id
from pyspark.sql.types import TimestampType

In [2]:
def create_spark_session():
    """Initialize a Spark session

    Returns:
        [SparkSession]: Spark session
    """
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark

In [3]:
spark = create_spark_session()

## Demographics

In [28]:
path = "us-cities-demographics.csv"

In [29]:
df = pd.read_csv(path, delimiter=";")
df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [32]:
len(df)

2891

In [6]:
def remove_cities_with_nulls(df):
    records_with_nulls = df[df.isnull().any(axis=1)].filter(['City','State']).drop_duplicates()
    keys = list(records_with_nulls.columns.values)
    i1 = df.set_index(keys).index
    i2 = records_with_nulls.set_index(keys).index
    return df[~i1.isin(i2)]

In [7]:
def pivot_table_race(df):
    columns = [col_name for col_name in list(df.columns) if col_name not in ['Race','Count']]
    df = df.pivot_table('Count',columns ,'Race').reset_index()
    df.columns.name = ''
    return df

In [8]:
def clean_data(df):
    df = remove_cities_with_nulls(df)
    df = df.drop_duplicates()
    return df

In [10]:
def transform_data(df, spark):
    df = pivot_table_race(df)
    df = df.fillna(0)
    df = df.groupby(['State','State Code']).sum().reset_index()
    df = df.drop(columns=['Average Household Size', 'Median Age'])
    df = df.rename(columns={'State Code':'state_code'})
    return spark.createDataFrame(df)

In [11]:
demographics = clean_data(df)
demographics = transform_data(demographics, spark)

In [12]:
demographics.printSchema()

root
 |-- State: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- Male Population: double (nullable = true)
 |-- Female Population: double (nullable = true)
 |-- Total Population: long (nullable = true)
 |-- Number of Veterans: double (nullable = true)
 |-- Foreign-born: double (nullable = true)
 |-- American Indian and Alaska Native: double (nullable = true)
 |-- Asian: double (nullable = true)
 |-- Black or African-American: double (nullable = true)
 |-- Hispanic or Latino: double (nullable = true)
 |-- White: double (nullable = true)



## Airport Codes

In [33]:
path = "airport-codes_csv.csv"

In [34]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [35]:
len(df)

55075

In [15]:
def pivot_table_type(df):
    df['Count'] = 1
    df = df.groupby(['State','type']).count().reset_index()
    df = df.pivot_table('Count',"State" ,'type').reset_index()
    df = df.fillna(0)
    df.columns.name = ''
    return df

In [16]:
def clean_data(df):
    df = df.filter(["ident",'type','iso_region','iso_country'])
    df = df.drop_duplicates()
    return df

In [17]:
def transform_data(df, spark):
    df = df[df["iso_country"] == "US"]
    df['State'] = df['iso_region'].str.replace("US-","")
    df = df.drop(columns=["iso_region","ident","iso_country"])
    df = pivot_table_type(df)
    return spark.createDataFrame(df)

In [18]:
airport = clean_data(df)
airport = transform_data(airport, spark)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
airport.printSchema()

root
 |-- State: string (nullable = true)
 |-- balloonport: double (nullable = true)
 |-- closed: double (nullable = true)
 |-- heliport: double (nullable = true)
 |-- large_airport: double (nullable = true)
 |-- medium_airport: double (nullable = true)
 |-- seaplane_base: double (nullable = true)
 |-- small_airport: double (nullable = true)



## Inmigration Data

In [36]:
sas_path = "sas_data/part-*"

In [37]:
df = spark.read.parquet(sas_path)

In [38]:
df.count()

3096313

In [22]:
demographics.printSchema()

root
 |-- State: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- Male Population: double (nullable = true)
 |-- Female Population: double (nullable = true)
 |-- Total Population: long (nullable = true)
 |-- Number of Veterans: double (nullable = true)
 |-- Foreign-born: double (nullable = true)
 |-- American Indian and Alaska Native: double (nullable = true)
 |-- Asian: double (nullable = true)
 |-- Black or African-American: double (nullable = true)
 |-- Hispanic or Latino: double (nullable = true)
 |-- White: double (nullable = true)



In [26]:
def clean_data(df):
    columns = [
        "cicid",
        "i94yr",
        "i94mon",
        "i94cit",
        "i94res",
        "i94port",
        "visatype",
        "gender",
        "visapost",
        "arrdate",
        "depdate",
        "i94mode",
        "i94addr"]
    df = df.select(columns)
    return df

In [27]:
inmigrations = clean_data(df)

In [28]:
inmigrations.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)



## Final Data Model

In [29]:
inmigrations.show(1)

+---------+------+------+------+------+-------+--------+------+--------+-------+-------+-------+-------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|visatype|gender|visapost|arrdate|depdate|i94mode|i94addr|
+---------+------+------+------+------+-------+--------+------+--------+-------+-------+-------+-------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|      B1|     F|     SYD|20574.0|20582.0|    1.0|     CA|
+---------+------+------+------+------+-------+--------+------+--------+-------+-------+-------+-------+
only showing top 1 row



In [30]:
demographics.show(1)

+-------+----------+---------------+-----------------+----------------+------------------+------------+---------------------------------+-------+-------------------------+------------------+--------+
|  State|state_code|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|American Indian and Alaska Native|  Asian|Black or African-American|Hispanic or Latino|   White|
+-------+----------+---------------+-----------------+----------------+------------------+------------+---------------------------------+-------+-------------------------+------------------+--------+
|Alabama|        AL|       497248.0|         552381.0|         1049629|           71543.0|     52154.0|                           8084.0|28769.0|                 521068.0|           39313.0|498920.0|
+-------+----------+---------------+-----------------+----------------+------------------+------------+---------------------------------+-------+-------------------------+------------------+--------+


In [32]:
airport.show(1)

+-----+-----------+------+--------+-------------+--------------+-------------+-------------+
|State|balloonport|closed|heliport|large_airport|medium_airport|seaplane_base|small_airport|
+-----+-----------+------+--------+-------------+--------------+-------------+-------------+
|   AK|        0.0|  33.0|    61.0|          2.0|          90.0|        146.0|        497.0|
+-----+-----------+------+--------+-------------+--------------+-------------+-------------+
only showing top 1 row



In [33]:
print(inmigrations.count())
print(demographics.count())
print(airport.count())

3096313
48
52
