# 0. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pyspark as sp
import findspark

from pyspark.sql import SparkSession

# 1. Find Spark

Adding pyspark to sys.path at runtime using the library findspark

In [2]:
findspark.init()
findspark.find()

'C:\\spark-3.4.1-bin-hadoop3'

# 2. Creating SparkSession

One aspect of the explanation why SparkSession is preferable over SparkContext in SparkSession Vs SparkContext battle is that SparkSession unifies all of Spark’s numerous contexts, removing the developer’s need to worry about generating separate contexts.

In [3]:
#Create the SparkSession
my_spark = SparkSession.builder.getOrCreate()

#print the session
print(my_spark)

<pyspark.sql.session.SparkSession object at 0x000001DA7FC359D0>


# 3. Join Dataframe

In [4]:
planes = my_spark.read.csv('planes.csv', header=True)
planes.show(5)

+-------+----+--------------------+----------------+---------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|    model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+---------+-------+-----+-----+---------+
| N10156|2004|Fixed wing multi ...|         EMBRAER|EMB-145XR|      2|   55|   NA|Turbo-fan|
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE| A320-214|      2|  182|   NA|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE| A320-214|      2|  182|   NA|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE| A320-214|      2|  182|   NA|Turbo-fan|
| N10575|2002|Fixed wing multi ...|         EMBRAER|EMB-145LR|      2|   55|   NA|Turbo-fan|
+-------+----+--------------------+----------------+---------+-------+-----+-----+---------+
only showing top 5 rows



In [5]:
flights = my_spark.read.csv('flights_small.csv', header=True)
flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
+----+-----+---+--------+---------+-----

In [6]:
# Add duration_hrs
from pyspark.sql.functions import round
flights = flights.withColumn('duration_hrs', round(flights.air_time / 60,3))
flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|         2.2|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|         6.0|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|        1.85|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|       1.383|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS

In [7]:
# Rename year column
planes = planes.withColumnRenamed("year", "plane_year")
planes.show(5)

+-------+----------+--------------------+----------------+---------+-------+-----+-----+---------+
|tailnum|plane_year|                type|    manufacturer|    model|engines|seats|speed|   engine|
+-------+----------+--------------------+----------------+---------+-------+-----+-----+---------+
| N10156|      2004|Fixed wing multi ...|         EMBRAER|EMB-145XR|      2|   55|   NA|Turbo-fan|
| N102UW|      1998|Fixed wing multi ...|AIRBUS INDUSTRIE| A320-214|      2|  182|   NA|Turbo-fan|
| N103US|      1999|Fixed wing multi ...|AIRBUS INDUSTRIE| A320-214|      2|  182|   NA|Turbo-fan|
| N104UW|      1999|Fixed wing multi ...|AIRBUS INDUSTRIE| A320-214|      2|  182|   NA|Turbo-fan|
| N10575|      2002|Fixed wing multi ...|         EMBRAER|EMB-145LR|      2|   55|   NA|Turbo-fan|
+-------+----------+--------------------+----------------+---------+-------+-----+-----+---------+
only showing top 5 rows



In [8]:
# Join the DataFrames
model_data = flights.join(planes, on="tailnum", how="leftouter")
model_data.show(5)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+------------+--------+-------+-----+-----+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+------------+--------+-------+-----+-----+---------+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|         2.2|      2011|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     AS|   851|   SEA| HNL|     360|    2677|  10|    40|         6.0|

# 4. Preprocesses

### 4.1 String to integer

It's important to know that Spark only handles numeric data. That means all of the columns in your DataFrame must be either integers or decimals (called 'doubles' in Spark)

you can use the .cast() method in combination with the .withColumn() method. It's important to note that .cast() works on columns, while .withColumn() works on DataFrames.

The only argument you need to pass to .cast() is the kind of value you want to create, in string form. For example, to create integers, you'll pass the argument "integer" and for decimal numbers you'll use "double".

In [9]:
# Cast the columns to integers
model_data = model_data.withColumn("arr_delay", model_data.arr_delay.cast("integer"))
model_data = model_data.withColumn("air_time", model_data.air_time.cast("integer"))
model_data = model_data.withColumn("month", model_data.month.cast("integer"))
model_data = model_data.withColumn("plane_year", model_data.plane_year.cast("integer"))

In [10]:
model_data

DataFrame[tailnum: string, year: string, month: int, day: string, dep_time: string, dep_delay: string, arr_time: string, arr_delay: int, carrier: string, flight: string, origin: string, dest: string, air_time: int, distance: string, hour: string, minute: string, duration_hrs: double, plane_year: int, type: string, manufacturer: string, model: string, engines: string, seats: string, speed: string, engine: string]

### 4.2 Creating column

In [11]:
# Create the column plane_age
model_data = model_data.withColumn("plane_age", model_data.year - model_data.plane_year)
model_data.show(5)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|plane_age|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|         2.2|      2011|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|      3.0|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     AS|   851|   SEA| HNL|   

### 4.3 Making Booleans

In [12]:
# Create is_late
model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)
model_data.select("arr_delay","is_late").show(5)

+---------+-------+
|arr_delay|is_late|
+---------+-------+
|       -5|  false|
|        5|   true|
|        2|   true|
|       34|   true|
|        1|   true|
+---------+-------+
only showing top 5 rows



In [13]:
# Convert to an integer
model_data = model_data.withColumn("label", model_data.is_late.cast("integer"))
model_data.select("label","is_late").show(5)

+-----+-------+
|label|is_late|
+-----+-------+
|    0|  false|
|    1|   true|
|    1|   true|
|    1|   true|
|    1|   true|
+-----+-------+
only showing top 5 rows



In [14]:
# Remove missing values
model_data = model_data.filter("arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL")
model_data.select("arr_delay","is_late").show(5)

+---------+-------+
|arr_delay|is_late|
+---------+-------+
|       -5|  false|
|        5|   true|
|        2|   true|
|       34|   true|
|        2|   true|
+---------+-------+
only showing top 5 rows



### 4.4 StringIndexer & OneHotEncoder

All we have to remember is that you need to create a StringIndexer and a OneHotEncoder, and the Pipeline will take care of the rest.

In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [16]:
# Create a StringIndexer
carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")

In [17]:
# Create a OneHotEncoder
carr_encoder = OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")

In [18]:
# Create a StringIndexer
dest_indexer = StringIndexer(inputCol="dest", outputCol="dest_index")

In [19]:
# Create a OneHotEncoder
dest_encoder = OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")

### 4.5 Assemble a Vector

In [20]:
from pyspark.ml.feature import VectorAssembler

In [21]:
# Make a VectorAssembler
vec_assembler = VectorAssembler(inputCols=["month", "air_time", "carrier_fact", "dest_fact", "plane_age"], outputCol="features")

### 4.5 Create the Pipeline

In [22]:
# Import Pipeline
from pyspark.ml import Pipeline

# Make the pipeline
flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])

In [23]:
# Fit and transform the data
piped_data = flights_pipe.fit(model_data).transform(model_data)

In [24]:
piped_data.show(3)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|plane_age|is_late|label|dest_index|      dest_fact|carrier_index|  carrier_fact|            features|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+
| N846VA|2014|   12|  8|     658|       -7|   

In [25]:
piped_data.select("month", "air_time", "carrier_fact", "dest_fact", "plane_age").show(5, False)

+-----+--------+--------------+---------------+---------+
|month|air_time|carrier_fact  |dest_fact      |plane_age|
+-----+--------+--------------+---------------+---------+
|12   |132     |(10,[6],[1.0])|(59,[3],[1.0]) |3.0      |
|1    |360     |(10,[0],[1.0])|(59,[19],[1.0])|8.0      |
|3    |111     |(10,[6],[1.0])|(59,[0],[1.0]) |3.0      |
|4    |83      |(10,[1],[1.0])|(59,[10],[1.0])|22.0     |
|1    |121     |(10,[1],[1.0])|(59,[1],[1.0]) |17.0     |
+-----+--------+--------------+---------------+---------+
only showing top 5 rows



In [26]:
piped_data.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|(72,[0,1,8,15,71]...|
|(72,[0,1,2,31,71]...|
|(72,[0,1,8,12,71]...|
|(72,[0,1,3,22,71]...|
|(72,[0,1,3,13,71]...|
+--------------------+
only showing top 5 rows



### 4.6 Split data

In [27]:
# Split the data into training and test sets
training, test = piped_data.randomSplit([.6, .4])

In [28]:
training

DataFrame[tailnum: string, year: string, month: int, day: string, dep_time: string, dep_delay: string, arr_time: string, arr_delay: int, carrier: string, flight: string, origin: string, dest: string, air_time: int, distance: string, hour: string, minute: string, duration_hrs: double, plane_year: int, type: string, manufacturer: string, model: string, engines: string, seats: string, speed: string, engine: string, plane_age: double, is_late: boolean, label: int, dest_index: double, dest_fact: vector, carrier_index: double, carrier_fact: vector, features: vector]

# 5. Create the model

We are going to predict is the flight was late ("label" column) considering some features (specified in "features column", which are "month", "air_time", "carrier_fact", "dest_fact", "plane_age")

In [29]:
# Import LogisticRegression
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression Estimator
lr = LogisticRegression()

### 5.1 Create the evaluator

In [30]:
# Import the evaluation submodule
import pyspark.ml.evaluation as evals

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")
# the curve is the ROC, or receiver operating curve.

### 5.2 Make a grid

In [31]:
# Import the tuning submodule
import pyspark.ml.tuning as tune

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0,1])

# Build the grid
grid = grid.build()

### 5.3 Make the validator

In [32]:
# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr,
               estimatorParamMaps=grid,
               evaluator=evaluator)

### 5.4 Fit the model

In [33]:
training.show(2)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+----------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|plane_year|                type|    manufacturer|   model|engines|seats|speed|   engine|plane_age|is_late|label|dest_index|      dest_fact|carrier_index|  carrier_fact|            features|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+------------+----------+--------------------+----------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+
| N102UW|2014|    5|  7|    1311| 

In [34]:
training.select("features", "label").show(5, False)

+-------------------------------------------+-----+
|features                                   |label|
+-------------------------------------------+-----+
|(72,[0,1,6,36,71],[5.0,274.0,1.0,1.0,16.0])|1    |
|(72,[0,1,6,36,71],[3.0,261.0,1.0,1.0,15.0])|1    |
|(72,[0,1,6,36,71],[4.0,266.0,1.0,1.0,15.0])|0    |
|(72,[0,1,6,36,71],[5.0,255.0,1.0,1.0,15.0])|0    |
|(72,[0,1,6,36,71],[4.0,271.0,1.0,1.0,15.0])|1    |
+-------------------------------------------+-----+
only showing top 5 rows



By default, logistic regression model take as input the column named "features"; and as output to predict the column named "label". That's why below the input and output are no specified.

In [35]:
# Call lr.fit()
best_lr = lr.fit(training)

# Print best_lr
print(best_lr)

LogisticRegressionModel: uid=LogisticRegression_6a1d2f3171ec, numClasses=2, numFeatures=72


In [36]:
best_lr.featuresCol

Param(parent='LogisticRegression_6a1d2f3171ec', name='featuresCol', doc='features column name.')

In [37]:
best_lr.labelCol

Param(parent='LogisticRegression_6a1d2f3171ec', name='labelCol', doc='label column name.')

### 5.5 Evaluate the model

In [38]:
# Use the model to predict the test set
test_results = best_lr.transform(test)

# Evaluate the predictions
print(evaluator.evaluate(test_results))

0.691048737696247


Sources:
https://github.com/ozlerhakan/datacamp/blob/master/Introduction%20to%20PySpark/introduction-to-pySpark.ipynbsS