In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("PySparkMySQLIntegration") \
    .config("spark.jars.packages", "mysql:mysql-connector-java:8.0.30") \
    .getOrCreate()


In [3]:
flights_df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://mysql-nadim.alwaysdata.net:3306/nadim_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "flights") \
    .option("user", "nadim") \
    .option("password", "root123456789@") \
    .load()


In [4]:
flights_df.show()


+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|
+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|2021|    1|  1|     517|           515|        2|     830|           819|       11|     UA|  1545| N14228|   EWR| IAH|     227|    1400|   5|    15|2021-01-01 05:00:00|
|2021|    1|  1|     533|           529|        4|     850|           830|       20|     UA|  1714| N24211|   LGA| IAH|     227|    1416|   5|    29|2021-01-01 05:00:00|
|2021|    1|  1|     542|           540|        2|     923|           850|       33|     AA|  1141| N619AA|   JFK| MIA|     160|    1089|   5|    40|2

In [5]:
planes_df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://mysql-nadim.alwaysdata.net:3306/nadim_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "planes") \
    .option("user", "nadim") \
    .option("password", "root123456789@") \
    .load()


In [6]:
weather_df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://mysql-nadim.alwaysdata.net:3306/nadim_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "weather") \
    .option("user", "nadim") \
    .option("password", "root123456789@") \
    .load()


In [7]:
airlines_df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://mysql-nadim.alwaysdata.net:3306/nadim_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "airlines") \
    .option("user", "nadim") \
    .option("password", "root123456789@") \
    .load()


In [8]:
airports_df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://mysql-nadim.alwaysdata.net:3306/nadim_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "airports") \
    .option("user", "nadim") \
    .option("password", "root123456789@") \
    .load()


## Request 1: Counting Airports, Timezones, Companies, Planes, and Cancelled Flights


In [9]:
# Count total number of airports, departure and destination airports
total_airports = airports_df.select('faa').distinct().count()
departure_airports = flights_df.select('origin').distinct().count()
destination_airports = flights_df.select('dest').distinct().count()

# Count airports that do not observe daylight saving time (dst column is 'N')
airports_no_dst = airports_df.filter(airports_df.dst == 'N').count()

# Count different timezones
timezones = airports_df.select('tzone').distinct().count()

# Count companies, planes, and cancelled flights
companies = airlines_df.count()
planes = planes_df.count()
cancelled_flights = flights_df.filter(flights_df.cancelled == 1).count()

(total_airports, departure_airports, destination_airports, airports_no_dst, timezones, companies, planes, cancelled_flights)


AttributeError: 'DataFrame' object has no attribute 'cancelled'

## Request 2: Most and Least Frequented Departure and Destination Airports

In [10]:
from pyspark.sql.functions import desc, asc

# Most used departure airport
most_used_departure_airport = flights_df.groupBy("origin").count().orderBy(desc("count")).first()

# Top 10 most used destinations
top10_destinations = flights_df.groupBy("dest").count().orderBy(desc("count")).limit(10)

# Top 10 least used destinations
bottom10_destinations = flights_df.groupBy("dest").count().orderBy(asc("count")).limit(10)

# Top 10 planes that have taken off the most
top10_planes_takeoff = flights_df.groupBy("tailnum").count().orderBy(desc("count")).limit(10)

# Top 10 planes that have taken off the least
bottom10_planes_takeoff = flights_df.groupBy("tailnum").count().orderBy(asc("count")).limit(10)

(most_used_departure_airport, top10_destinations, bottom10_destinations, top10_planes_takeoff, bottom10_planes_takeoff)


(Row(origin='EWR', count=91374),
 DataFrame[dest: string, count: bigint],
 DataFrame[dest: string, count: bigint],
 DataFrame[tailnum: string, count: bigint],
 DataFrame[tailnum: string, count: bigint])

## Request 3: Destinations Served by Each Company

In [11]:
# Destinations served by each company
destinations_by_company = flights_df.groupBy("carrier").agg(countDistinct("dest"))

# Destinations served by each company by airport of origin
destinations_by_company_by_airport = flights_df.groupBy("carrier", "origin").agg(countDistinct("dest"))

(destinations_by_company, destinations_by_company_by_airport)


NameError: name 'countDistinct' is not defined

## Request 4: Flights to Houston and NYC to Seattle Information

In [None]:
# Flights that landed in Houston (IAH or HOU)
flights_to_houston = flights_df.filter(flights_df["dest"].isin(["IAH", "HOU"]))

# Flights from NYC airports to Seattle
nyc_to_seattle_flights = flights_df.filter(flights_df["origin"].isin(["JFK", "LGA", "EWR"]) & (flights_df["dest"] == "SEA"))

# Number of companies serving from NYC to Seattle
companies_serving_nyc_sea = nyc_to_seattle_flights.select("carrier").distinct().count()

# Number of unique planes serving from NYC to Seattle
unique_planes_nyc_sea = nyc_to_seattle_flights.select("tailnum").distinct().count()

(flights_to_houston.count(), nyc_to_seattle_flights.count(), companies_serving_nyc_sea, unique_planes_nyc_sea)


## Request 5: Number of Flights per Destination

## Request 6: Companies Not Operating on All Airports