## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html), the Databricks File System, lets you store data for querying inside Databricks. This notebook assumes that the file you would like to read is already in DBFS.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
import pyspark.pandas as ps

In [0]:
# 1. Read the airports CSV from DBFS into a pandas-on-Spark DataFrame
#    and preview its first five rows.
data_airports = ps.read_csv('/FileStore/tables/airports.csv')
display(data_airports.head(n=5))

faa,name,lat,lon,alt,tz,dst,tzone
04G,Lansdowne Airport,41.1304722,-80.6195833,1044,-5,A,America/New_York
06A,Moton Field Municipal Airport,32.4605722,-85.6800278,264,-6,A,America/Chicago
06C,Schaumburg Regional,41.9893408,-88.1012428,801,-6,A,America/Chicago
06N,Randall Airport,41.431912,-74.3915611,523,-5,A,America/New_York
09J,Jekyll Island Airport,31.0744722,-81.4277778,11,-5,A,America/New_York


In [0]:
# Number of non-null values in each column of the airports table.
display(data_airports.count())

faa      1458
name     1458
lat      1458
lon      1458
alt      1458
tz       1458
dst      1458
tzone    1458
dtype: int64

In [0]:
# Read the flights CSV from DBFS into a pandas-on-Spark DataFrame
# and preview its first five rows.
data_flights = ps.read_csv('/FileStore/tables/flights.csv')
display(data_flights.head(n=5))

year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
2021,1,1,517,515,2,830,819,11,UA,1545,N14228,EWR,IAH,227,1400,5,15,2021-01-01T05:00:00.000+0000
2021,1,1,533,529,4,850,830,20,UA,1714,N24211,LGA,IAH,227,1416,5,29,2021-01-01T05:00:00.000+0000
2021,1,1,542,540,2,923,850,33,AA,1141,N619AA,JFK,MIA,160,1089,5,40,2021-01-01T05:00:00.000+0000
2021,1,1,544,545,-1,1004,1022,-18,B6,725,N804JB,JFK,BQN,183,1576,5,45,2021-01-01T05:00:00.000+0000
2021,1,1,554,600,-6,812,837,-25,DL,461,N668DN,LGA,ATL,116,762,6,0,2021-01-01T06:00:00.000+0000


In [0]:
# Disabled exploratory query: per-column counts for each distinct dep_delay value.
# data_flights.groupby(by=['dep_delay']).count()

In [0]:
# Disabled exploratory query: per-column counts for each distinct origin value.
# data_flights.groupby(by=['origin']).count()

In [0]:
# 1.2 Frequency of each distinct value in the airports `dst` column.
data_airports['dst'].value_counts()

Out[56]: A    1388
U      47
N      23
Name: dst, dtype: int64

In [0]:
# Frequency of each distinct value in the airports `tzone` column.
# NOTE(review): the result includes a literal `\N` entry — presumably a
# missing-value marker in the source CSV; confirm before relying on it.
data_airports['tzone'].value_counts()

Out[58]: America/New_York       519
America/Chicago        342
America/Anchorage      239
America/Los_Angeles    176
America/Denver         119
America/Phoenix         38
Pacific/Honolulu        18
\N                       3
America/Vancouver        2
Asia/Chongqing           2
Name: tzone, dtype: int64

In [0]:
# 1.3 Read the planes CSV from DBFS into a pandas-on-Spark DataFrame
#     and preview its first five rows.
data_planes = ps.read_csv('/FileStore/tables/planes.csv')
display(data_planes.head(n=5))

tailnum,year,type,manufacturer,model,engines,seats,speed,engine
N10156,2004,Fixed wing multi engine,EMBRAER,EMB-145XR,2,55,,Turbo-fan
N102UW,1998,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan
N103US,1999,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan
N104UW,1999,Fixed wing multi engine,AIRBUS INDUSTRIE,A320-214,2,182,,Turbo-fan
N10575,2002,Fixed wing multi engine,EMBRAER,EMB-145LR,2,55,,Turbo-fan


In [0]:
# Number of non-null values in each column of the planes table.
data_planes.count()

Out[64]: tailnum         3322
year            3322
type            3322
manufacturer    3322
model           3322
engines         3322
seats           3322
speed           3322
engine          3322
dtype: int64

In [0]:
# Read the airlines CSV from DBFS into a pandas-on-Spark DataFrame
# and preview its first five rows.
data_airlines = ps.read_csv('/FileStore/tables/airlines.csv')
display(data_airlines.head(n=5))

carrier,name
9E,Endeavor Air Inc.
AA,American Airlines Inc.
AS,Alaska Airlines Inc.
B6,JetBlue Airways
DL,Delta Air Lines Inc.


In [0]:
# Number of non-null values in each column of the airlines table.
data_airlines.count()

Out[69]: carrier    16
name       16
dtype: int64

In [0]:
# Keep the flights whose air time or departure time equals a single blank
# string, then count the non-null values per column among those rows.
# NOTE(review): the variable name suggests these rows are treated as cancelled
# flights — confirm that " " is how the source CSV marks a missing time.
# An arr_time-based variant that was tried previously:
#   data_flights[data_flights.arr_time == ' '].count()
cancelFlightsCount = data_flights[
    (data_flights['air_time'] == " ") | (data_flights['dep_time'] == " ")
].count()
cancelFlightsCount

Out[87]: year              7273
month             7273
day               7273
dep_time          7273
sched_dep_time    7273
dep_delay         7273
arr_time          7273
sched_arr_time    7273
arr_delay         7273
carrier           7273
flight            7273
tailnum           7273
origin            7273
dest              7273
air_time          7273
distance          7273
hour              7273
minute            7273
time_hour         7273
dtype: int64