# Basic data preparation

Import necesary libraries

In [60]:
import pandas as pd 
import numpy as nup 

import findspark
findspark.init()

import pyspark

In [61]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as pyf

Spark session

In [62]:
sc = SparkSession.builder.appName('practica').getOrCreate()

In [63]:
sc

Display the data

In [76]:
#Reading the data
df = sc.read.csv('airport_codes.csv', header=True, inferSchema=True)
df.show(5, False)
df.printSchema()

+-----+-------------+----------------------------------+------------+-------------+------------+---------+-----------+----------+------------+--------+---------+----------+
|ident|category     |name                              |latitude_deg|longitude_deg|elevation_ft|continent|iso_country|iso_region|municipality|gps_code|iata_code|local_code|
+-----+-------------+----------------------------------+------------+-------------+------------+---------+-----------+----------+------------+--------+---------+----------+
|00A  |heliport     |Total Rf Heliport                 |40.07080078 |-74.93360138 |11          |NA       |US         |US-PA     |Bensalem    |00A     |null     |00A       |
|00AK |small_airport|Lowell Field                      |59.94919968 |-151.6959991 |450         |NA       |US         |US-AK     |Anchor Point|00AK    |null     |00AK      |
|00AL |small_airport|Epps Airpark                      |34.8647995  |-86.77030182 |820         |NA       |US         |US-AL     |Harves

In [65]:
print('Total records in dataset ', df.count())

Total records in dataset  46235


Study the data

In [66]:
#Airports types
cat = df.groupBy('category').count().orderBy('count')
cat.show()

+--------------+-----+
|      category|count|
+--------------+-----+
|   balloonport|   17|
| large_airport|  566|
| seaplane_base|  909|
|        closed| 1429|
|medium_airport| 4531|
|      heliport| 8985|
| small_airport|29798|
+--------------+-----+



In [67]:
print('There are', cat.count(), 'types of airports')

There are 7 types of airports


In [68]:

country = df.groupBy('iso_country').count().orderBy(pyf.col('count').desc())
country.show()

+-----------+-----+
|iso_country|count|
+-----------+-----+
|         US|21487|
|         BR| 3833|
|         CA| 2434|
|         AU| 1879|
|         RU|  919|
|         FR|  789|
|         AR|  710|
|         CO|  701|
|         DE|  663|
|         VE|  592|
|         PG|  538|
|         CL|  466|
|         ID|  461|
|         GB|  456|
|         ZA|  443|
|         IN|  326|
|         CN|  321|
|         CD|  267|
|         KR|  239|
|         KE|  239|
+-----------+-----+
only showing top 20 rows



In [69]:
print('The dataset contains airports from', country.count(), 'different countries.')

The dataset contains airports from 244 different countries.


We are going to focus on US airports for this analysis.

In [70]:
#New dataframe with US airports
df_us = df.filter(df.iso_country == 'US')
df_us.show(4)

+-----+-------------+--------------------+------------+-------------+------------+---------+-----------+----------+------------+--------+---------+----------+
|ident|     category|                name|latitude_deg|longitude_deg|elevation_ft|continent|iso_country|iso_region|municipality|gps_code|iata_code|local_code|
+-----+-------------+--------------------+------------+-------------+------------+---------+-----------+----------+------------+--------+---------+----------+
|  00A|     heliport|   Total Rf Heliport| 40.07080078| -74.93360138|          11|       NA|         US|     US-PA|    Bensalem|     00A|     null|       00A|
| 00AK|small_airport|        Lowell Field| 59.94919968| -151.6959991|         450|       NA|         US|     US-AK|Anchor Point|    00AK|     null|      00AK|
| 00AL|small_airport|        Epps Airpark|  34.8647995| -86.77030182|         820|       NA|         US|     US-AL|     Harvest|    00AL|     null|      00AL|
| 00AR|     heliport|Newport Hospital ...|  35

In [71]:
print('Row count', df_us.count())

Row count 21487


In [72]:
#Select distinct multiple columns. Use trucate=False because we want to see the hole name. 
df_us.select('iso_region', 'name').distinct().show(10, truncate=False)

+----------+---------------------------------------------------+
|iso_region|name                                               |
+----------+---------------------------------------------------+
|US-CT     |Berlin Fairgrounds Heliport                        |
|US-KY     |Lourdes Hospital Heliport                          |
|US-WV     |West Virginia Univ. Hosp. Inc. Gnd. Pad #2 Heliport|
|US-OH     |Green Acres Airport                                |
|US-IN     |St Vincent Jennings Hospital Heliport              |
|US-KY     |Blue Lick Airport                                  |
|US-KY     |Boss Airport                                       |
|US-PA     |Eagle Field                                        |
|US-FL     |Joy Farms Airport                                  |
|US-LA     |Port of Iberia Heliport                            |
+----------+---------------------------------------------------+
only showing top 10 rows



In [73]:
#Group by name
df_us.groupBy('name').count().orderBy(pyf.col('count').desc()).show(truncate=False)

+--------------------------------+-----+
|name                            |count|
+--------------------------------+-----+
|Memorial Hospital Heliport      |20   |
|Smith Airport                   |11   |
|Miller Airport                  |11   |
|Smith Field                     |11   |
|Hilltop Airport                 |9    |
|Flying S Ranch Airport          |9    |
|Anderson Airport                |9    |
|Johnson Airport                 |9    |
|Flying H Ranch Airport          |8    |
|Taylor Airport                  |8    |
|Good Samaritan Hospital Heliport|8    |
|Harris Airport                  |7    |
|Davis Field                     |7    |
|Providence Hospital Heliport    |7    |
|St Francis Hospital Heliport    |7    |
|Green Acres Airport             |7    |
|St Joseph's Hospital Heliport   |7    |
|Wilson Airport                  |7    |
|St Mary's Hospital Heliport     |7    |
|Mercy Hospital Heliport         |7    |
+--------------------------------+-----+
only showing top

In [75]:
#Select the name with most airports
df_us.filter(df_us.name == 'Memorial Hospital Heliport').show(3)


+-----+--------+--------------------+------------+-------------+------------+---------+-----------+----------+----------------+--------+---------+----------+
|ident|category|                name|latitude_deg|longitude_deg|elevation_ft|continent|iso_country|iso_region|    municipality|gps_code|iata_code|local_code|
+-----+--------+--------------------+------------+-------------+------------+---------+-----------+----------+----------------+--------+---------+----------+
| 08NH|heliport|Memorial Hospital...| 44.06129837| -71.13580322|         574|       NA|         US|     US-NH|    North Conway|    08NH|     null|      08NH|
| 19CO|heliport|Memorial Hospital...|  38.8404007| -104.7990036|        6155|       NA|         US|     US-CO|Colorado Springs|    19CO|     null|      19CO|
| 2PS2|heliport|Memorial Hospital...| 39.96620178| -76.69219971|         410|       NA|         US|     US-PA|            York|    2PS2|     null|      2PS2|
+-----+--------+--------------------+------------+--

In [80]:
#Deleted columns with null/nan values

df_us = df_us.drop('continent', 'iata_code')
df_us.show(2)

+-----+-------------+-----------------+------------+-------------+------------+-----------+----------+------------+--------+----------+
|ident|     category|             name|latitude_deg|longitude_deg|elevation_ft|iso_country|iso_region|municipality|gps_code|local_code|
+-----+-------------+-----------------+------------+-------------+------------+-----------+----------+------------+--------+----------+
|  00A|     heliport|Total Rf Heliport| 40.07080078| -74.93360138|          11|         US|     US-PA|    Bensalem|     00A|       00A|
| 00AK|small_airport|     Lowell Field| 59.94919968| -151.6959991|         450|         US|     US-AK|Anchor Point|    00AK|      00AK|
+-----+-------------+-----------------+------------+-------------+------------+-----------+----------+------------+--------+----------+
only showing top 2 rows



In [81]:
#Renaming a column

df_us = df_us.withColumnRenamed('iso_country', 'country')
df_us.show(2)

+-----+-------------+-----------------+------------+-------------+------------+-------+----------+------------+--------+----------+
|ident|     category|             name|latitude_deg|longitude_deg|elevation_ft|country|iso_region|municipality|gps_code|local_code|
+-----+-------------+-----------------+------------+-------------+------------+-------+----------+------------+--------+----------+
|  00A|     heliport|Total Rf Heliport| 40.07080078| -74.93360138|          11|     US|     US-PA|    Bensalem|     00A|       00A|
| 00AK|small_airport|     Lowell Field| 59.94919968| -151.6959991|         450|     US|     US-AK|Anchor Point|    00AK|      00AK|
+-----+-------------+-----------------+------------+-------------+------------+-------+----------+------------+--------+----------+
only showing top 2 rows



In [82]:
for i in df_us.columns:
    df_us.describe(i).show()

+-------+--------------------+
|summary|               ident|
+-------+--------------------+
|  count|               21487|
|   mean|2.3873375337777779E8|
| stddev| 9.492375382267495E8|
|    min|            0.00E+00|
|    max|                seat|
+-------+--------------------+

+-------+-------------+
|summary|     category|
+-------+-------------+
|  count|        21487|
|   mean|         null|
| stddev|         null|
|    min|  balloonport|
|    max|small_airport|
+-------+-------------+

+-------+--------------------+
|summary|                name|
+-------+--------------------+
|  count|               21487|
|   mean|                null|
| stddev|                null|
|    min|"Fly ""N"" K Airp...|
|    max|     seattle airport|
+-------+--------------------+

+-------+------------------+
|summary|      latitude_deg|
+-------+------------------+
|  count|             21487|
|   mean|39.111338760574824|
| stddev| 6.941944580272601|
|    min|      -75.44080353|
|    max|           