# Classification
## Platform: Spark, colab.research.google.com

In [0]:
# Colab preinstalled packages
import pandas as pd

In [0]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# init Spark
# The environment must name the JDK and the unpacked Spark distribution before
# findspark.init() runs; pyspark is only imported after that call.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.3.1-bin-hadoop2.7",
})

import findspark

findspark.init()

from pyspark.sql import SparkSession

# Local session that uses every core available on the machine ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [70]:
# Mount Google Drive under /content/gdrive so the notebook can read the dataset
# stored there; force_remount=True remounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
# Read the flights CSV from the mounted Drive into a Spark DataFrame.
# header=True takes column names from the first row; inferSchema=True asks
# Spark to derive the column types from the data.
flights_dfs = spark.read.csv(
    "/content/gdrive/My Drive/Colab Notebooks/SparkAzureTutorial/data/flights.csv",
    header=True,
    inferSchema=True,
)

In [72]:
# Summary statistics for every column, printed without truncating cell values.
flights_dfs.describe().show(n=5, truncate=False)

+-------+------------------+-----------------+-------+------------------+------------------+------------------+------------------+
|summary|DayofMonth        |DayOfWeek        |Carrier|OriginAirportID   |DestAirportID     |DepDelay          |ArrDelay          |
+-------+------------------+-----------------+-------+------------------+------------------+------------------+------------------+
|count  |2702218           |2702218          |2702218|2702218           |2702218           |2702218           |2702218           |
|mean   |15.797897875004903|3.899480352806472|null   |12742.597593162358|12743.000197985506|10.510732294729737|6.6550108096386005|
|stddev |8.7988350691642   |1.985924603367557|null   |1501.8408475102513|1501.8014309297723|36.02975608466093 |38.547584236791245|
|min    |1                 |1                |9E     |10140             |10140             |-63               |-94               |
|max    |31                |7                |YV     |15376             |15376     

In [73]:
# Keep the predictor columns and derive the binary label `Late`:
# the comparison ArrDelay > 15 is cast to an integer (1 when true, 0 when false).
data = flights_dfs.select(
    "DayOfMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    (col("ArrDelay") > 15).cast("Int").alias("Late"),
)
data.show(5)

+----------+---------+---------------+-------------+--------+----+
|DayOfMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
|        19|        5|          15016|        11433|      28|   1|
|        19|        5|          11193|        12892|      -6|   0|
+----------+---------+---------------+-------------+--------+----+
only showing top 5 rows



In [74]:
# Split the labelled rows roughly 70/30 into training and test sets.
# The seed is pinned so the same rows land in each split on every run; without
# it the split (and every metric computed from it) changes between executions.
splits = data.randomSplit([0.7, 0.3], seed=42)
train, test = splits
print("Train len: {}, test len: {}".format(train.count(), test.count()))

Train len: 1891173, test len: 811045


In [75]:
# Pack the five predictor columns into a single vector column named "features",
# then keep only that vector and the target (renamed from "Late" to "label").
assembler = VectorAssembler(
    inputCols=[
        "DayOfMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "DepDelay",
    ],
    outputCol="features",
)
training = assembler.transform(train).select(
    "features",
    col("Late").alias("label"),
)
training.show(5, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|0    |
|[1.0,1.0,10140.0,10397.0,-2.0]|0    |
|[1.0,1.0,10140.0,10397.0,0.0] |0    |
|[1.0,1.0,10140.0,10821.0,4.0] |0    |
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
+------------------------------+-----+
only showing top 5 rows



In [0]:
# Fit a logistic regression (10 iterations max, regularisation 0.3) on the
# assembled training vectors.
# NOTE: `LogisticRegression` is the pyspark.ml class at this point; the
# scikit-learn import further down the notebook rebinds the same name, so this
# cell must run before that import.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)
model = lr.fit(training)

In [77]:
# Assemble the test rows with the same assembler used for training so the
# feature vector layout matches, and rename the target to "label".
testing = assembler.transform(test).select(
    "features",
    col("Late").alias("label"),
)
testing.show(5, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-2.0]|0    |
|[1.0,1.0,10140.0,10397.0,0.0] |0    |
|[1.0,1.0,10140.0,11259.0,-5.0]|0    |
|[1.0,1.0,10140.0,11259.0,-2.0]|0    |
|[1.0,1.0,10140.0,11259.0,24.0]|0    |
+------------------------------+-----+
only showing top 5 rows



In [0]:
# Apply the fitted Spark model to the test vectors; the cells below read the
# "prediction" and "probability" columns from the result.
prediction = model.transform(testing)

In [79]:
# Sample of test rows the model classified as class 0.0.
predicted = (
    prediction
    .where("prediction = 0.0")
    .select("features", "prediction", "probability", "label")
)
predicted.show(5, truncate=False)

+------------------------------+----------+----------------------------------------+-----+
|features                      |prediction|probability                             |label|
+------------------------------+----------+----------------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.8273427081270124,0.17265729187298773]|0    |
|[1.0,1.0,10140.0,10397.0,0.0] |0.0       |[0.823288234492235,0.17671176550776502] |0    |
|[1.0,1.0,10140.0,11259.0,-5.0]|0.0       |[0.8336477018241905,0.1663522981758096] |0    |
|[1.0,1.0,10140.0,11259.0,-2.0]|0.0       |[0.827714763655645,0.172285236344355]   |0    |
|[1.0,1.0,10140.0,11259.0,24.0]|0.0       |[0.769218878329818,0.23078112167018197] |0    |
+------------------------------+----------+----------------------------------------+-----+
only showing top 5 rows



In [80]:
# Sample of test rows the model classified as class 1.0.
predicted = (
    prediction
    .where("prediction = 1.0")
    .select("features", "prediction", "probability", "label")
)
predicted.show(5, truncate=False)

+-------------------------------+----------+------------------------------------------+-----+
|features                       |prediction|probability                               |label|
+-------------------------------+----------+------------------------------------------+-----+
|[1.0,1.0,10140.0,12266.0,838.0]|1.0       |[3.5729577538386035E-5,0.9999642704224616]|1    |
|[1.0,1.0,10397.0,12953.0,131.0]|1.0       |[0.42731779419546534,0.5726822058045347]  |1    |
|[1.0,1.0,10397.0,12953.0,223.0]|1.0       |[0.16987624354452902,0.8301237564554709]  |1    |
|[1.0,1.0,10397.0,13871.0,120.0]|1.0       |[0.46621528447441146,0.5337847155255886]  |1    |
|[1.0,1.0,10693.0,12266.0,115.0]|1.0       |[0.48334003869755,0.51665996130245]       |1    |
+-------------------------------+----------+------------------------------------------+-----+
only showing top 5 rows



In [82]:
# Metrics taken from the fitted Spark model's own summary object.
# `model` must still be the Spark model here: a later cell reuses the name for
# the scikit-learn estimator.
# The values printed as "TP"/"FP" are the weighted true/false positive *rates*,
# not counts; precision and recall are likewise the weighted variants.
trainingSummary = model.summary
print(
    "TP: {}. FP: {}".format(
        trainingSummary.weightedTruePositiveRate,
        trainingSummary.weightedFalsePositiveRate,
    )
)
print("Accuracy: {}".format(trainingSummary.accuracy))
print("Precision: {}".format(trainingSummary.weightedPrecision))
print("Recall: {}".format(trainingSummary.weightedRecall))
print("Labels: {}".format(trainingSummary.labels))

TP: 0.8247077343003523. FP: 0.7048177777769301
Accuracy: 0.8247077343003523
Precision: 0.855440164913897
Recall: 0.8247077343003523
Labels: [0.0, 1.0]


## Platform: Pandas, scikit-learn, colab.research.google.com

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.model_selection import train_test_split

In [21]:
# Convert the whole Spark DataFrame (about 2.7 million rows according to the
# describe() output above) into a pandas DataFrame for the scikit-learn section.
flights_df = flights_dfs.toPandas()
# Last expression of the cell: rendered as the summary-statistics table.
flights_df.describe()

Unnamed: 0,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,DepDelay,ArrDelay
count,2702218.0,2702218.0,2702218.0,2702218.0,2702218.0,2702218.0
mean,15.7979,3.89948,12742.6,12743.0,10.51073,6.655011
std,8.798835,1.985925,1501.841,1501.801,36.02976,38.54758
min,1.0,1.0,10140.0,10140.0,-63.0,-94.0
25%,8.0,2.0,11292.0,11292.0,-4.0,-11.0
50%,16.0,4.0,12892.0,12892.0,-1.0,-3.0
75%,23.0,6.0,14057.0,14057.0,9.0,10.0
max,31.0,7.0,15376.0,15376.0,1863.0,1845.0


In [22]:
# Binary target: 1 where ArrDelay is greater than 15, otherwise 0.
flights_df["Late"] = (flights_df["ArrDelay"] > 15).astype("int64")
flights_df.head(5)

Unnamed: 0,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,DepDelay,ArrDelay,Late
0,19,5,DL,11433,13303,-3,1,0
1,19,5,DL,14869,12478,0,-8,0
2,19,5,DL,14057,14869,-4,-15,0
3,19,5,DL,15016,11433,28,24,1
4,19,5,DL,11193,12892,-6,-11,0


In [0]:
# Feature matrix (five predictor columns) and target vector, followed by a
# seeded 70/30 train/test split.
X = flights_df[
    ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay"]
]
y = flights_df["Late"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [0]:
# Fit scikit-learn's logistic regression on the pandas training split.
# This rebinds `model`, which previously referred to the Spark model.
model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
)
model.fit(X_train, y_train)

In [25]:
# Evaluate the fitted estimator on the held-out rows.
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
# check
# Recompute precision and recall by hand from the confusion matrix.
# Rows are true labels and columns are predictions, so the row-major
# flattening of the 2x2 matrix is (tn, fp, fn, tp).
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
assert precision == tp/(tp + fp)
assert recall == tp/(tp + fn)

Accuracy: 0.925960876612563
Precision: 0.8933522186360787
Recall: 0.7139622921615202


### Reusing Spark split data to compare metrics

In [0]:
# Convert the Spark train/test splits to pandas so scikit-learn is fitted and
# scored on the same partition the Spark model used.
train_df, test_df = (part.toPandas() for part in (train, test))

In [0]:
# Separate predictors from the "Late" target in each split.
X_train = train_df.drop(columns=["Late"])
X_test = test_df.drop(columns=["Late"])
y_train = train_df["Late"]
y_test = test_df["Late"]

In [0]:
# Refit the scikit-learn logistic regression, this time on the rows from the
# Spark training split.
model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
)
model.fit(X_train, y_train)

In [29]:
# Evaluate the refitted estimator on the rows from the Spark test split.
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
# check
# Recompute precision and recall by hand from the confusion matrix.
# Rows are true labels and columns are predictions, so the row-major
# flattening of the 2x2 matrix is (tn, fp, fn, tp).
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
assert precision == tp/(tp + fp)
assert recall == tp/(tp + fn)

Accuracy: 0.9260545844378124
Precision: 0.89340917425466
Recall: 0.7126177821481472
