# Regression
## Platform: Spark, colab.research.google.com

In [0]:
# Colab preinstalled packages
import pandas as pd

In [0]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# init Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
flights_dfs = spark.read.csv("/content/gdrive/My Drive/Colab Notebooks/SparkAzureTutorial/data/flights.csv", header=True, inferSchema=True)

In [7]:
flights_dfs.describe().show(5, False)

+-------+------------------+-----------------+-------+------------------+------------------+------------------+------------------+
|summary|DayofMonth        |DayOfWeek        |Carrier|OriginAirportID   |DestAirportID     |DepDelay          |ArrDelay          |
+-------+------------------+-----------------+-------+------------------+------------------+------------------+------------------+
|count  |2702218           |2702218          |2702218|2702218           |2702218           |2702218           |2702218           |
|mean   |15.797897875004903|3.899480352806472|null   |12742.597593162358|12743.000197985506|10.510732294729737|6.6550108096386005|
|stddev |8.7988350691642   |1.985924603367557|null   |1501.8408475102513|1501.8014309297723|36.02975608466093 |38.547584236791245|
|min    |1                 |1                |9E     |10140             |10140             |-63               |-94               |
|max    |31                |7                |YV     |15376             |15376     

In [8]:
data = flights_dfs.select("DayOfMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay", "ArrDelay")
data.show(5)

+----------+---------+---------------+-------------+--------+--------+
|DayOfMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
|        19|        5|          15016|        11433|      28|      24|
|        19|        5|          11193|        12892|      -6|     -11|
+----------+---------+---------------+-------------+--------+--------+
only showing top 5 rows



In [9]:
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
print("Train len: {}, test len: {}".format(train.count(), test.count()))

Train len: 1891657, test len: 810561


In [10]:
assembler = VectorAssembler(inputCols = ["DayOfMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay"], outputCol="features")
training = assembler.transform(train).select(col("features"), col("ArrDelay").alias("label"))
training.show(5, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|-11  |
|[1.0,1.0,10140.0,10397.0,-2.0]|-17  |
|[1.0,1.0,10140.0,10397.0,0.0] |-12  |
|[1.0,1.0,10140.0,10397.0,0.0] |-9   |
|[1.0,1.0,10140.0,10821.0,77.0]|94   |
+------------------------------+-----+
only showing top 5 rows



In [11]:
testing = assembler.transform(test).select(col("features"), col("ArrDelay").alias("label"))
testing.show(5, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-2.0]|-18  |
|[1.0,1.0,10140.0,10821.0,4.0] |4    |
|[1.0,1.0,10140.0,10821.0,8.0] |-9   |
|[1.0,1.0,10140.0,11259.0,-5.0]|-14  |
|[1.0,1.0,10140.0,11259.0,21.0]|23   |
+------------------------------+-----+
only showing top 5 rows



In [0]:
model = LinearRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3)
model = model.fit(training)

In [13]:
prediction = model.transform(testing)
predicted = prediction.select("features", "label", "prediction")
predicted.show(5, truncate=False)

+------------------------------+-----+-------------------+
|features                      |label|prediction         |
+------------------------------+-----+-------------------+
|[1.0,1.0,10140.0,10397.0,-2.0]|-18  |-5.55933705390471  |
|[1.0,1.0,10140.0,10821.0,4.0] |4    |0.33134687836675925|
|[1.0,1.0,10140.0,10821.0,8.0] |-9   |4.322552171282837  |
|[1.0,1.0,10140.0,11259.0,-5.0]|-14  |-8.748162943691959 |
|[1.0,1.0,10140.0,11259.0,21.0]|23   |17.194671460262544 |
+------------------------------+-----+-------------------+
only showing top 5 rows



In [14]:
trainingSummary = model.summary
rmse = trainingSummary.rootMeanSquaredError
r2 = trainingSummary.r2
print("RMSE: {}".format(rmse))
print("R2: {}".format(r2))

RMSE: 13.219624740367404
R2: 0.8830167251680671


## Platform: Pandas, scikit-learn, colab.research.google.com

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [16]:
flights_df = flights_dfs.toPandas()
flights_df.describe()

Unnamed: 0,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,DepDelay,ArrDelay
count,2702218.0,2702218.0,2702218.0,2702218.0,2702218.0,2702218.0
mean,15.7979,3.89948,12742.6,12743.0,10.51073,6.655011
std,8.798835,1.985925,1501.841,1501.801,36.02976,38.54758
min,1.0,1.0,10140.0,10140.0,-63.0,-94.0
25%,8.0,2.0,11292.0,11292.0,-4.0,-11.0
50%,16.0,4.0,12892.0,12892.0,-1.0,-3.0
75%,23.0,6.0,14057.0,14057.0,9.0,10.0
max,31.0,7.0,15376.0,15376.0,1863.0,1845.0


In [0]:
X = flights_df.loc[:, ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay"]]
y = flights_df.loc[:, "ArrDelay"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [0]:
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [19]:
predicted = pd.DataFrame(X_test)
predicted = pd.concat([predicted, 
                       pd.DataFrame(y_test, index=y_test.index), 
                       pd.DataFrame(y_pred, index=y_test.index)], 
                      axis=1, ignore_index=True)
predicted.columns = list(X.columns) + ["label"] + ["y_pred"]
predicted.head(5)

Unnamed: 0,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,DepDelay,label,y_pred
2159824,6,5,10821,11292,10,-17,5.834453
632083,16,4,15304,14843,4,-8,0.132481
115502,27,6,10821,13303,-8,-21,-12.630798
2577846,2,3,14492,15304,-2,-11,-6.182267
1194206,8,1,12266,14122,14,11,10.058543


In [20]:
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
assert r2 == model.score(X_test, y_test)
print("RMSE: {}".format(rmse))
print("R2: {}".format(r2))

RMSE: 13.160356997190407
R2: 0.8836323895450338


### Reusing Spark split data to compare metrics

In [0]:
train_df = train.toPandas()
test_df = test.toPandas()

In [0]:
X_train = train_df.drop("ArrDelay", axis=1)
y_train = train_df["ArrDelay"]
X_test = test_df.drop("ArrDelay", axis=1)
y_test = test_df["ArrDelay"]

In [0]:
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [24]:
predicted = pd.DataFrame(X_test)
predicted = pd.concat([predicted, 
                       pd.DataFrame(y_test, index=y_test.index), 
                       pd.DataFrame(y_pred, index=y_test.index)], 
                      axis=1, ignore_index=True)
predicted.columns = list(X.columns) + ["label"] + ["y_pred"]
predicted.head(5)

Unnamed: 0,DayofMonth,DayOfWeek,OriginAirportID,DestAirportID,DepDelay,label,y_pred
0,1,1,10140,10397,-2,-18,-5.660242
1,1,1,10140,10821,4,4,0.275755
2,1,1,10140,10821,8,-9,4.297958
3,1,1,10140,11259,-5,-14,-8.874722
4,1,1,10140,11259,21,23,17.269596


In [25]:
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
assert r2 == model.score(X_test, y_test)
print("RMSE: {}".format(rmse))
print("R2: {}".format(r2))

RMSE: 13.224326592568351
R2: 0.8808162448770165
