# Machine Learning with PySpark

In [3]:
# Import libraries and Initialize SparkSession
import sys
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a SparkSession running in local mode on all cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('p5-machine-learning-with-pyspark')
    # NOTE(review): 'spark.local.ip' is not listed among the documented Spark
    # properties (the documented knobs are the SPARK_LOCAL_IP environment
    # variable and spark.driver.host / spark.driver.bindAddress) — confirm this
    # setting has the intended effect, and that the hard-coded address is valid
    # on the machine running the notebook.
    .config('spark.local.ip', '192.168.157.254')
    .getOrCreate()
)
# Report the Spark and Python interpreter versions in use
print(f'Spark version: {spark.version}')
print(f'system version: {sys.version_info}')

Spark version: 3.4.1
system version: sys.version_info(major=3, minor=9, micro=17, releaselevel='final', serial=0)


### Loading Data

#### Loading flights data

In [6]:
# Load the flights CSV: comma-separated with a header row, column types
# inferred from the data, and the literal string 'NA' read as null.
flights = spark.read.csv(
    'datasets/flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)
# Report how many records were loaded
print("The data contain %d records." % flights.count())
# Preview the first five records
flights.show(5)
# Inspect the inferred (column name, type) pairs
print(flights.dtypes)

The data contain 50000 records.
+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows

[('mon', 'int'), ('dom', 'int'), ('dow', 'int'), ('carrier', 'string'), ('flight', 'int'), ('org', 'string'), ('mile', 'int'), ('depart', 'double'), ('duration', 'int'), ('delay', 'int')]


#### Loading SMS spam data

In [13]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema for the SMS file (it is read without a header row, so the
# column names and types are supplied here). Fields are nullable by default.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("text", StringType())
    .add("label", IntegerType())
)

# The SMS file is semicolon-delimited and has no header line
sms = spark.read.csv(
    "datasets/sms.csv",
    sep=";",
    header=False,
    schema=schema,
)

# Confirm the schema was applied
sms.printSchema()

root
 |-- id: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



In [14]:
# Preview five messages, truncating the text column at 100 characters
sms.show(n=5, truncate=100)

+---+----------------------------------------------------------------------------------------------------+-----+
| id|                                                                                                text|label|
+---+----------------------------------------------------------------------------------------------------+-----+
|  1|                                                                   Sorry, I'll call later in meeting|    0|
|  2|                                                                      Dont worry. I guess he's busy.|    0|
|  3|                                                                   Call FREEPHONE 0800 542 0578 now!|    1|
|  4|                                                         Win a 1000 cash prize or a prize worth 5000|    1|
|  5|Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there go...|    0|
+---+----------------------------------------------------------------------------------------------------+-----+

## Classification

### Data Preparation

#### Removing columns and rows

In [28]:
# Preview the raw flights data before any columns or rows are removed
flights.show(n=5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [29]:
# Derive a new DataFrame without the 'flight' column (the original `flights`
# DataFrame is left untouched)
flights_drop_column = flights.drop('flight')
# Preview the result to confirm the column is gone
flights_drop_column.show(n=5)

+---+---+---+-------+---+----+------+--------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|
+---+---+---+-------+---+----+------+--------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|ORD| 258|  8.92|      65| null|
+---+---+---+-------+---+----+------+--------+-----+
only showing top 5 rows



In [30]:
# Count the records whose 'delay' value is missing (null)
flights_drop_column.where(flights_drop_column.delay.isNull()).count()

2978

In [31]:
# Keep only the records that have a 'delay' value
flights_valid_delay = flights_drop_column.where(
    flights_drop_column.delay.isNotNull()
)

In [32]:
# Drop every record that still has a null in any column, then report how many
# rows remain
flights_none_missing = flights_valid_delay.na.drop()
print(flights_none_missing.count())

47022


#### Column manipulation

In [33]:
# Preview the cleaned data that the column manipulations start from
flights_none_missing.show(n=5)

+---+---+---+-------+---+----+------+--------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|
+---+---+---+-------+---+----+------+--------+-----+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|
+---+---+---+-------+---+----+------+--------+-----+
only showing top 5 rows



In [34]:
# Import the required function
from pyspark.sql.functions import round
# Add a 'km' column computed from 'mile' (1 mile = 1.60934 km), rounded to
# 0 decimal places, then drop the original 'mile' column.
# `round` here is the pyspark.sql.functions.round imported in this cell, which
# shadows Python's built-in round for the rest of the notebook.
flights_km = (
    flights_none_missing
    .withColumn('km', round(flights_none_missing.mile * 1.60934, 0))
    .drop('mile')
)

In [35]:
# Add the binary target 'label': 1 when 'delay' is 15 or more, otherwise 0.
# The boolean comparison is cast to integer to obtain the 0/1 encoding.
is_delayed = flights_km['delay'] >= 15
flights_km = flights_km.withColumn('label', is_delayed.cast('integer'))

In [36]:
# Preview the first five records with the new 'km' and 'label' columns
flights_km.show(n=5)

+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



#### Categorical columns

In [37]:
from pyspark.ml.feature import StringIndexer

# Indexer for the categorical 'carrier' column
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Fitting learns the distinct carrier categories present in the data
indexer_model = indexer.fit(flights_km)

# Transforming appends the numeric 'carrier_idx' column
flights_indexed = indexer_model.transform(flights_km)

# Same fit-then-transform sequence for the other categorical feature, 'org',
# applied on top of the carrier-indexed data
flights_indexed = (
    StringIndexer(inputCol='org', outputCol='org_idx')
    .fit(flights_indexed)
    .transform(flights_indexed)
)
flights_indexed.show(n=5)

                                                                                

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
only showing top 5 rows



#### Assembling columns

In [38]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the predictor columns, in this order, into a single
# vector column named 'features'
assembler = VectorAssembler(
    inputCols=[
        'mon',
        'dom',
        'dow',
        'carrier_idx',
        'org_idx',
        'km',
        'depart',
        'duration',
    ],
    outputCol='features',
)

# Append the 'features' column to the indexed data
flights_assembled = assembler.transform(flights_indexed)

# Inspect the assembled vectors next to the raw 'delay' values, untruncated
flights_assembled.select('features', 'delay').show(n=5, truncate=False)

+-----------------------------------------+-----+
|features                                 |delay|
+-----------------------------------------+-----+
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |30   |
|[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |-8   |
|[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|-5   |
|[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |2    |
|[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |54   |
+-----------------------------------------+-----+
only showing top 5 rows



### Decision Tree

In [39]:
# Preview the full modelling table (all columns, untruncated)
flights_assembled.show(n=5, truncate=False)

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|km    |label|carrier_idx|org_idx|features                                 |
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|0  |22 |2  |UA     |ORD|16.33 |82      |30   |509.0 |1    |0.0        |0.0    |[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |
|2  |20 |4  |UA     |SFO|6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |
|9  |13 |1  |AA     |ORD|10.33 |195     |-5   |1989.0|0    |1.0        |0.0    |[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|
|5  |2  |1  |UA     |SFO|7.98  |102     |2    |885.0 |0    |0.0        |1.0    |[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |
|7  |2  |6  |AA     |ORD|10.83 |135     |54   |1180.0|1    |1.0        |0.0    |[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |
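+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+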

#### Train/test split

In [42]:
# Randomly split into training and testing sets with 80:20 weights; the fixed
# seed (43) makes the split repeatable
flights_train, flights_test = flights_assembled.randomSplit(
    weights=[0.8, 0.2],
    seed=43,
)

# The split is random, so verify the training share is close to 80%
training_ratio = flights_train.count() / flights_assembled.count()
print(training_ratio)

0.8025392369529156


#### Build a Decision Tree

In [43]:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree classifier reading the 'features' vector and the 'label'
# target (these are the classifier's default column names, spelled out here)
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label')

# Fit the tree on the training split
tree_model = tree.fit(flights_train)

# Score the held-out test split and compare the true label against the
# predicted class and the predicted class probabilities
prediction = tree_model.transform(flights_test)
prediction.select('label', 'prediction', 'probability').show(n=5, truncate=False)

                                                                                

+-----+----------+---------------------------------------+
|label|prediction|probability                            |
+-----+----------+---------------------------------------+
|1    |0.0       |[0.5304268846503178,0.4695731153496821]|
|1    |0.0       |[0.5304268846503178,0.4695731153496821]|
|0    |1.0       |[0.3570197280214518,0.6429802719785481]|
|1    |1.0       |[0.3570197280214518,0.6429802719785481]|
|1    |1.0       |[0.3570197280214518,0.6429802719785481]|
+-----+----------+---------------------------------------+
only showing top 5 rows



#### Evaluate the Decision Tree

In [45]:
# Create a confusion matrix: one row per (label, prediction) pair with its count
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

# Calculate the elements of the confusion matrix.
# The grouped result has at most four rows, so collect it once and look the
# cells up locally instead of launching four separate filter(...).count() jobs,
# each of which would recompute the whole (uncached) prediction lineage.
# 'prediction' is a double (0.0 / 1.0), so it is converted to int for the key.
cell_counts = {
    (int(row['label']), int(row['prediction'])): row['count']
    for row in confusion.collect()
}
TN = cell_counts.get((0, 0), 0)  # label 0, predicted 0
TP = cell_counts.get((1, 1), 0)  # label 1, predicted 1
FN = cell_counts.get((1, 0), 0)  # label 1, predicted 0
FP = cell_counts.get((0, 1), 0)  # label 0, predicted 1

# Accuracy measures the proportion of correct predictions.
# Guard against an empty prediction set, which would otherwise divide by zero.
n_predictions = TN + TP + FN + FP
accuracy = (TN + TP) / n_predictions if n_predictions else float('nan')
print(accuracy)

                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1188|
|    0|       0.0| 2348|
|    1|       1.0| 3575|
|    0|       1.0| 2174|
+-----+----------+-----+



                                                                                

0.6379106085083468


### Logistic Regression