In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 70kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 49.8MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612242 sha256=ea2db94820a63687680ca27e2f5f1bbacba5f018ce184fa736e7c52bdd6492c4
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
sc = SparkContext()
spark = SparkSession.builder.master('local[*]').appName('first_spark_application').getOrCreate()

In [None]:
# Filter missing data

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df.show(5)
print(f'The dataframe has {df.count()} observations with {len(df.columns)} columns.')

# How many missing values
print(f'There are {df.where("delay IS NULL").count()} missing observations in "flights.csv"')
print(f'You can get the same value using either .where() or .either(): {df.filter("delay IS NULL").count()}.')

# Drop observations with missing values only form the "delay" column
print(f'We now have {df.where("delay IS NOT NULL").count()} observations left.')
print(f'If we drop missing values from any column we have {df.dropna().count()} left.')
print("This means only the 'delay' column having missing values. \n")

# Create a kilometers column
from pyspark.sql.functions import round
df = df.withColumn("kilometre", df["mile"]*1.609344)
df = df.withColumn("kilometre_round", round(df["mile"]*1.609344).cast("integer"))
df[["kilometre", "kilometre_round"]].show(3)

from pyspark.sql.functions import when
df = df.withColumn("label", when(df["delay"] > 15, 1).otherwise(0))
df.show(5)

# Indexing categorical data
from pyspark.ml.feature import StringIndexer

df = StringIndexer(inputCol="carrier",  outputCol="carrier_index").fit(df).transform(df)
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df).drop("kilometre_round")
df.show(5)
print("We can check that the index of zero has the most observations.")
print("Carrier 'UA' has the index 0 and have observations = ", df.where('carrier = "UA"').count())
print("Carrier 'US' has the index 6 and have observations = ", df.where('carrier = "US"').count())

# Vector Assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["mon", "dom", "dow", "carrier_index", "org_index", "kilometre", "depart", "duration"], outputCol='features')
df = assembler.transform(df)
df[["carrier_index", "org_index", "duration", "features"]].show(5, truncate=False)



+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows

The dataframe has 50000 observations with 10 columns.
There are 2978 missing observations in "flights.csv"
You can get the same value using either .where() or .either(): 2978.
We now have 47022 observations left.
If we drop missing values from any column we have 47022 left.
This means only the 'delay' column having missing values. 

+------------------+---------------+
|         kilometre|kilometre_round|
+------------------+------

In [None]:
from pyspark.sql.functions import when
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("kilometre", df["mile"]*1.609344)
df = df.withColumn("label", when(df["delay"] >= 15, 1).otherwise(0))
df = StringIndexer(inputCol="carrier",  outputCol="carrier_index").fit(df).transform(df)
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df)
df = VectorAssembler(inputCols=["mon", "dom", "dow", "carrier_index", "org_index", "kilometre", "depart", "duration"], outputCol='features').transform(df)
df.show(5, truncate=False)

# Specify the seed for reproducability
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Build a Decision Tree
tree = DecisionTreeClassifier()

# Train the data
tree_model = tree.fit(df_train)

# Evaluating the predictions from the test data and compare to known values
prediction = tree_model.transform(df_test)
prediction[["label", "prediction", "probability"]].show(5, truncate=False)

# Confusion matrix
prediction.groupBy("label", "prediction").count().show()
TP = prediction.where("label = 1 AND prediction = 1").count()
TN = prediction.where("label = 0 AND prediction = 0").count()
FP = prediction.where("label = 0 AND prediction = 1").count()
FN = prediction.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+----------------------------------------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|kilometre         |label|carrier_index|org_index|features                                            |
+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+----------------------------------------------------+
|11 |20 |6  |US     |19    |JFK|2153|9.48  |351     |null |3464.917632       |0    |6.0          |2.0      |[11.0,20.0,6.0,6.0,2.0,3464.917632,9.48,351.0]      |
|0  |22 |2  |UA     |1107  |ORD|316 |16.33 |82      |30   |508.55270400000006|1    |0.0          |0.0      |[0.0,22.0,2.0,0.0,0.0,508.55270400000006,16.33,82.0]|
|2  |20 |4  |UA     |226   |SFO|337 |6.17  |82      |-8   |542.348928        |0    |0.0          |1.0      |[2.0,20.0,4.0,0.0,1.0,542.348928,6.17,82.0]         |
|9  |13 |1  |AA     |419   |

In [None]:
# Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("kilometre", df["mile"]*1.609344)
df = df.withColumn("label", when(df["delay"] >= 15, 1).otherwise(0))
df = StringIndexer(inputCol="carrier",  outputCol="carrier_index").fit(df).transform(df)
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df)
df = VectorAssembler(inputCols=["mon", "dom", "dow", "carrier_index", "org_index", "kilometre", "depart", "duration"], outputCol='features').transform(df)
df.show(5, truncate=False)

# Specify the seed for reproducability
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Create a logistic regression classifier
logistic = LogisticRegression()

# Train the data
logistic = logistic.fit(df_train) 
prediction = logistic.transform(df_test)
prediction[["label", "prediction", "probability"]].show(5, truncate=False)

# Confusion matrix
prediction.groupBy("label", "prediction").count().show()
TP = prediction.where("label = 1 AND prediction = 1").count()
TN = prediction.where("label = 0 AND prediction = 0").count()
FP = prediction.where("label = 0 AND prediction = 1").count()
FN = prediction.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

# Precision is the proportion of positive predictions which are correct. For all flights which are predicted to be delayed, what proportion is actually delayed?
print(f"The precision is {TP/(TP+FP)*100:.2f}%")

# Recall is the proportion of positives outcomes which are correctly predicted. For all delayed flights, what proportion is correctly predicted by the model?
print(f"The recall is {TP/(TP+FN)*100:.2f}%")

# Weighted metrics
evaluator = MulticlassClassificationEvaluator()
print(evaluator.evaluate(prediction, {evaluator.metricName: "weightedPrecision"}))
binary_evaluator = BinaryClassificationEvaluator()
print(binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"}))

+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+----------------------------------------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|kilometre         |label|carrier_index|org_index|features                                            |
+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+----------------------------------------------------+
|11 |20 |6  |US     |19    |JFK|2153|9.48  |351     |null |3464.917632       |0    |6.0          |2.0      |[11.0,20.0,6.0,6.0,2.0,3464.917632,9.48,351.0]      |
|0  |22 |2  |UA     |1107  |ORD|316 |16.33 |82      |30   |508.55270400000006|1    |0.0          |0.0      |[0.0,22.0,2.0,0.0,0.0,508.55270400000006,16.33,82.0]|
|2  |20 |4  |UA     |226   |SFO|337 |6.17  |82      |-8   |542.348928        |0    |0.0          |1.0      |[2.0,20.0,4.0,0.0,1.0,542.348928,6.17,82.0]         |
|9  |13 |1  |AA     |419   |

In [None]:
### Turning text into tables
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Removing punctuation
REGEX = '[_():;,.!?\\- 0-9]'

sms = spark.read.csv("sms.csv", inferSchema=True, sep=";")
sms.show(5, truncate=False)

sms = sms.withColumn("text", regexp_replace("_c1", REGEX, ' '))
sms[["_c1", "text"]].show(5, truncate=False)

# Text to tokens
sms = Tokenizer(inputCol="text", outputCol="tokens").transform(sms)
sms[["text", "tokens"]].show(5, truncate=False)

# Remove stop words
stopwords = StopWordsRemover()

# What are stop words?
print(stopwords.getStopWords())

# Specify the input and output column names  inputCol ต้องเป็น list จาก tokenization
stopwords = stopwords.setInputCol("tokens").setOutputCol("words")
sms = stopwords.transform(sms)
sms[["text", "words"]].show(5, truncate=False)

# Hasing words, numFeatures คือ จำนวนสูงสุดของ element ใน sparse vector ที่สสร้างด้วย HashingTF
# [4, win, 1000, cash, prize, prize, worth, 5000, 1]|(32,[3,5,11,15,18,26,30,31],[1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0])
# คำว่า prize มี hash value = 18 เกิดขึ้น 2 ครั้ง ?
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
sms = hasher.transform(sms)
sms[["words", "hash"]].show(5, truncate=False)

# Dealing with common words
# คำที่โผล่มาบ่อย จะให้ค่าน้ำหนักน้อยลง คำที่โผล่มาน้อยจะให้ค่าน้ำหนักเพิ่ม
sms = IDF(inputCol="hash", outputCol="features").fit(sms).transform(sms)
sms[["features"]].show(5, truncate=False)



+---+---------------------------------------------------------------------------------------------------------------+---+
|_c0|_c1                                                                                                            |_c2|
+---+---------------------------------------------------------------------------------------------------------------+---+
|1  |Sorry, I'll call later in meeting                                                                              |0  |
|2  |Dont worry. I guess he's busy.                                                                                 |0  |
|3  |Call FREEPHONE 0800 542 0578 now!                                                                              |1  |
|4  |Win a 1000 cash prize or a prize worth 5000                                                                    |1  |
|5  |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...|0  |
+---+-------------------

In [None]:
# Example from exercise
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

REGEX = '[_():;,.!?\\- 0-9]'

sms = spark.read.csv("sms.csv", inferSchema=True, sep=";")
for i, j in zip(["_c0", "_c1", "_c2"], ["id", "text", "label"]):
    sms = sms.withColumnRenamed(i, j)
sms.show(5)

# Remove punctuation (REGEX provided) and numbers
sms = sms.withColumn('text', regexp_replace("text", '[_():;,.!?\\- 0-9]', ' '))

# Merge multiple spaces
sms = sms.withColumn('text', regexp_replace("text", ' +', ' '))

# Split the text into words
sms = Tokenizer(inputCol='text', outputCol="words").transform(sms)
sms.show(4, truncate=False)

# Remove stop words.
sms = StopWordsRemover(inputCol="words", outputCol="terms").transform(sms)

# Apply the hashing trick
sms = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024).transform(sms)

# Convert hashed symbols to TF-IDF
sms = IDF(inputCol="hash", outputCol="features").fit(sms).transform(sms)
      
sms.select('terms', 'features').show(4, truncate=False)

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy("label", "prediction").count().show()

TP = prediction.where("label = 1 AND prediction = 1").count()
TN = prediction.where("label = 0 AND prediction = 0").count()
FP = prediction.where("label = 0 AND prediction = 1").count()
FN = prediction.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

+---+--------------------+-----+
| id|                text|label|
+---+--------------------+-----+
|  1|Sorry, I'll call ...|    0|
|  2|Dont worry. I gue...|    0|
|  3|Call FREEPHONE 08...|    1|
|  4|Win a 1000 cash p...|    1|
|  5|Go until jurong p...|    0|
+---+--------------------+-----+
only showing top 5 rows

+---+----------------------------------+-----+------------------------------------------+
|id |text                              |label|words                                     |
+---+----------------------------------+-----+------------------------------------------+
|1  |Sorry I'll call later in meeting  |0    |[sorry, i'll, call, later, in, meeting]   |
|2  |Dont worry I guess he's busy      |0    |[dont, worry, i, guess, he's, busy]       |
|3  |Call FREEPHONE now                |1    |[call, freephone, now]                    |
|4  |Win a cash prize or a prize worth |1    |[win, a, cash, prize, or, a, prize, worth]|
+---+----------------------------------+-----+--

In [None]:
# Onehot Encoder

from pyspark.sql.functions import when
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("kilometre", df["mile"]*1.609344)
df = df.withColumn("label", when(df["delay"] >= 15, 1).otherwise(0))
df = StringIndexer(inputCol="carrier",  outputCol="carrier_index").fit(df).transform(df)
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df)
df.show(5, truncate=False)

# Create onehot encoder
onehot = OneHotEncoder(inputCol="carrier_index", outputCol="carrier_dummy")
onehot.fit(df).transform(df)[["carrier", "carrier_index", "carrier_dummy"]].distinct().sort("carrier_index").show(truncate=False)

onehots = OneHotEncoder(inputCols=["carrier_index", "org_index"], outputCols=["carrier_dummy", "org_dummy"])
onehots.fit(df).transform(df)[["carrier", "carrier_index", "carrier_dummy", "org", "org_index", "org_dummy"]].distinct().sort("org_index").show(truncate=False)


+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|kilometre         |label|carrier_index|org_index|
+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+
|11 |20 |6  |US     |19    |JFK|2153|9.48  |351     |null |3464.917632       |0    |6.0          |2.0      |
|0  |22 |2  |UA     |1107  |ORD|316 |16.33 |82      |30   |508.55270400000006|1    |0.0          |0.0      |
|2  |20 |4  |UA     |226   |SFO|337 |6.17  |82      |-8   |542.348928        |0    |0.0          |1.0      |
|9  |13 |1  |AA     |419   |ORD|1236|10.33 |195     |-5   |1989.149184       |0    |1.0          |0.0      |
|4  |2  |5  |AA     |325   |ORD|258 |8.92  |65      |null |415.210752        |0    |1.0          |0.0      |
+---+---+---+-------+------+---+----+------+--------+-----+------------------+-----+-------------+---------+
only showing top 5 

In [None]:
# Linear regression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import when
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("km", df["mile"]*1.609344).drop("mile")

# One variable regression: Y = duration, X = km
df = VectorAssembler(inputCols=["km"], outputCol='features').transform(df)
regression = LinearRegression(labelCol="duration")
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)
regression = regression.fit(df_train)
predictions = regression.transform(df_test)
print("RMSE = ", RegressionEvaluator(labelCol="duration").evaluate(predictions))
predictions[["duration", "prediction"]].show(5)



RMSE =  17.098406021968962
+--------+------------------+
|duration|        prediction|
+--------+------------------+
|     560| 560.7390896231818|
|     310| 346.9189834661732|
|      90|  85.0289046671506|
|     130|133.58566184224821|
|     251|245.42440831822486|
+--------+------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("km", df["mile"]*1.609344).drop("mile")
df = StringIndexer(inputCol="carrier",  outputCol="carrier_index").fit(df).transform(df)
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df)
onehots = OneHotEncoder(inputCols=["carrier_index", "org_index"], outputCols=["carrier_dummy", "org_dummy"])
df = onehots.fit(df).transform(df)

# Including only the distance of the flight (the km column) as a predictor.
df = VectorAssembler(inputCols=["km"], outputCol='features').transform(df)
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration").fit(df_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(df_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
print(f'RMSE={RegressionEvaluator(labelCol="duration").evaluate(predictions)}')
predictions[["duration", "prediction"]].show(5)

# Intercept (average minutes on ground)
inter = regression.intercept
print("intercept=",inter)

# Coefficients
coefs = regression.coefficients
print(coefs)

# Average minutes per km
minutes_per_km = regression.coefficients[0]
print(minutes_per_km)

# Average speed in km per hour
avg_speed = 60 / minutes_per_km 
print(avg_speed)
print("p-values")
print(regression.summary.pValues)
print("std errors")
print(regression.summary.coefficientStandardErrors)
print("t-statistic")
print(regression.intercept/regression.summary.coefficientStandardErrors[0], regression.coefficients[0]/regression.summary.coefficientStandardErrors[0])

# T-statistic of estimated coefficients and intercept.
print(regression.summary.tValues)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|560     |560.7390896231818 |
|310     |346.9189834661732 |
|90      |85.0289046671506  |
|130     |133.58566184224821|
|251     |245.42440831822486|
+--------+------------------+
only showing top 5 rows

RMSE=17.098406021968962
+--------+------------------+
|duration|        prediction|
+--------+------------------+
|     560| 560.7390896231818|
|     310| 346.9189834661732|
|      90|  85.0289046671506|
|     130|133.58566184224821|
|     251|245.42440831822486|
+--------+------------------+
only showing top 5 rows

intercept= 44.38239615466289
[0.07561847142122584]
0.07561847142122584
793.4569275511461
p-values
[0.0, 0.0]
std errors
[7.601309964235278e-05, 0.1378246804145554]
t-statistic
583878.257346238 994.808418246543
[994.808418246543, 322.0206716327401]


In [None]:
# Linear regression: including more independent variables

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("km", df["mile"]*1.609344).drop("mile")
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df)
onehots = OneHotEncoder(inputCols=["org_index"], outputCols=["org_dummy"])
df = onehots.fit(df).transform(df)

# Including both distance and airports as features.
df = VectorAssembler(inputCols=["km", "org_dummy"], outputCol='features').transform(df)
df[["km", "org_index", "org_dummy", "features"]].show(5, truncate=False)
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration").fit(df_train)

# Create predictions for the testing data
predictions = regression.transform(df_test)

# Calculate the RMSE on testing data
print("RMSE = ", RegressionEvaluator(labelCol="duration").evaluate(predictions))
predictions[["duration", "prediction"]].show(5, truncate=False)
df[["org", "org_dummy"]].distinct().sort("org_index").show(truncate=False)
# Average speed in km per hour
avg_speed_hour = 60 / regression.coefficients[0]
print(avg_speed_hour)

# Average minutes on ground at OGG
inter = regression.intercept
print("intercept=", inter, "which is the average minutes on ground at OGG airport.")

# Average minutes on ground at JFK
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga, "\n")

# regression.coefficients[0] เป็น slope ของ ตัวแปร km
print("coefficients = ", regression.coefficients, "has ", len(regression.coefficients), "slopes")

# p-values (including intercepts)
print("p-values =", regression.summary.pValues)

+------------------+---------+-------------+----------------------------------+
|km                |org_index|org_dummy    |features                          |
+------------------+---------+-------------+----------------------------------+
|3464.917632       |2.0      |(7,[2],[1.0])|(8,[0,3],[3464.917632,1.0])       |
|508.55270400000006|0.0      |(7,[0],[1.0])|(8,[0,1],[508.55270400000006,1.0])|
|542.348928        |1.0      |(7,[1],[1.0])|(8,[0,2],[542.348928,1.0])        |
|1989.149184       |0.0      |(7,[0],[1.0])|(8,[0,1],[1989.149184,1.0])       |
|415.210752        |0.0      |(7,[0],[1.0])|(8,[0,1],[415.210752,1.0])        |
+------------------+---------+-------------+----------------------------------+
only showing top 5 rows

RMSE =  11.068097345225052
+--------+------------------+
|duration|prediction        |
+--------+------------------+
|560     |551.6335945083583 |
|310     |313.1047029694895 |
|90      |84.2679368060932  |
|130     |131.97295021699213|
|251     |241.8499

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

df = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
df = df.withColumn("km", df["mile"]*1.609344).drop("mile")
df = df.dropna()
df = StringIndexer(inputCol="carrier",  outputCol="carrier_index").fit(df).transform(df)
df = StringIndexer(inputCol="org", outputCol="org_index").fit(df).transform(df)
onehots = OneHotEncoder(inputCols=["carrier_index", "org_index"], outputCols=["carrier_dummy", "org_dummy"])
df = onehots.fit(df).transform(df)

# Including many as features.
### ห้ามเผลอใส่ตัวแปรต้น เป็นตัวเดียวกับ label เพราะจะกลายเป็น spurious perfect fit

df = VectorAssembler(inputCols=["mon", "dom", "dow", "flight", "depart",'delay','km','carrier_dummy', 'org_dummy'], outputCol='features').transform(df)
df[["delay","km", "carrier_dummy", "org_dummy", "features"]].show(5, truncate=False)
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration").fit(df_train)

# Create predictions for the testing data
predictions = regression.transform(df_test)

# Calculate the RMSE on testing data
print("RMSE = ", RegressionEvaluator(labelCol="duration").evaluate(predictions))
predictions[["duration", "prediction"]].show(5, truncate=False)

# p-values (including intercepts) จะเห็นว่า ใส่มั่วๆ หลายตัวไม่ sig
print("p-values =", regression.summary.pValues)

+-----+------------------+-------------+-------------+-----------------------------------------------------------------------------------+
|delay|km                |carrier_dummy|org_dummy    |features                                                                           |
+-----+------------------+-------------+-------------+-----------------------------------------------------------------------------------+
|30   |508.55270400000006|(8,[0],[1.0])|(7,[0],[1.0])|(22,[1,2,3,4,5,6,7,15],[22.0,2.0,1107.0,16.33,30.0,508.55270400000006,1.0,1.0])    |
|-8   |542.348928        |(8,[0],[1.0])|(7,[1],[1.0])|(22,[0,1,2,3,4,5,6,7,16],[2.0,20.0,4.0,226.0,6.17,-8.0,542.348928,1.0,1.0])        |
|-5   |1989.149184       |(8,[1],[1.0])|(7,[0],[1.0])|(22,[0,1,2,3,4,5,6,8,15],[9.0,13.0,1.0,419.0,10.33,-5.0,1989.149184,1.0,1.0])      |
|2    |885.1392000000001 |(8,[0],[1.0])|(7,[1],[1.0])|(22,[0,1,2,3,4,5,6,7,16],[5.0,2.0,1.0,704.0,7.98,2.0,885.1392000000001,1.0,1.0])   |
|54   |1179.6491520000002|(

In [None]:
# Bucketizing
from pyspark.ml.feature import Bucketizer, OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
flights = StringIndexer(inputCol="org", outputCol="org_index").fit(flights).transform(flights)

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24], inputCol="depart", outputCol="depart_bucket")

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select("depart", "depart_bucket").show(5)

# Create a one-hot encoder
onehot = OneHotEncoder(inputCol="depart_bucket", outputCol="depart_dummy")

# One-hot encode the bucketed departure times
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select("depart", "depart_bucket", "depart_dummy").show(5)

# Include org_dummy
flights = OneHotEncoder(inputCol="org_index", outputCol="org_dummy").fit(flights_onehot).transform(flights_onehot)
flights = VectorAssembler(inputCols=["km", "org_dummy", "depart_dummy"], outputCol='features').transform(flights)
flights[["features"]].show(5, truncate=False)

# Run regression
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=42)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration").fit(flights_train)

# Create predictions for the testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
print("RMSE = ", RegressionEvaluator(labelCol="duration").evaluate(predictions))
predictions[["duration", "prediction"]].show(5, truncate=False)

# Average minutes on ground at OGG for flights departing between 21:00 and 24:00
avg_eve_ogg = regression.intercept
print("Average minutes on ground at OGG for flights departing between 21:00 and 24:00 = ", avg_eve_ogg)

# Average minutes on ground at OGG for flights departing between 00:00 and 03:00
avg_night_ogg = regression.intercept + regression.coefficients[8]
print("Average minutes on ground at OGG for flights departing between 00:00 and 03:00 = ", avg_night_ogg)

# Average minutes on ground at JFK for flights departing between 00:00 and 03:00
avg_night_jfk = regression.intercept + regression.coefficients[3] + regression.coefficients[8]
print("Average minutes on ground at JFK for flights departing between 00:00 and 03:00 = ", avg_night_jfk)



+------+-------------+
|depart|depart_bucket|
+------+-------------+
|  9.48|          3.0|
| 16.33|          5.0|
|  6.17|          2.0|
| 10.33|          3.0|
|  8.92|          2.0|
+------+-------------+
only showing top 5 rows

+------+-------------+-------------+
|depart|depart_bucket| depart_dummy|
+------+-------------+-------------+
|  9.48|          3.0|(7,[3],[1.0])|
| 16.33|          5.0|(7,[5],[1.0])|
|  6.17|          2.0|(7,[2],[1.0])|
| 10.33|          3.0|(7,[3],[1.0])|
|  8.92|          2.0|(7,[2],[1.0])|
+------+-------------+-------------+
only showing top 5 rows

+------------------------------------------+
|features                                  |
+------------------------------------------+
|(15,[0,3,11],[3464.917632,1.0,1.0])       |
|(15,[0,1,13],[508.55270400000006,1.0,1.0])|
|(15,[0,2,10],[542.348928,1.0,1.0])        |
|(15,[0,1,11],[1989.149184,1.0,1.0])       |
|(15,[0,1,10],[415.210752,1.0,1.0])        |
+------------------------------------------+
only 

In [None]:
### Penalty for adding too many independent variables
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, Bucketizer

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
flights = flights.dropna() ถ้าเราไม่เอาคอลัมน์ที่มี missing values มาใส่ใน vector assembler ก็ไม่จำเป็นต้อง dropna

# StringIndexer เขียนรวมได้ แต่ต้องใช้ inputCols=['colname1', 'colname2'] outputCols=['colname', 'colname2']
flights = StringIndexer(inputCols=["carrier", "org"],  outputCols=["carrier_idx", "org_idx"]).fit(flights).transform(flights)

# Bucketizer ต้องมีตัวปิดตัวสุดท้าย ถ้า splits ค่าต่างกัน ต้องเขียนแยก
flights = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24], inputCol="depart", outputCol="depart_bucket").transform(flights)
flights = Bucketizer(splits=[0, 1, 2, 3, 4, 5, 6, 7], inputCol="dow", outputCol="dow_bucket").transform(flights)
flights = Bucketizer(splits=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], inputCol="mon", outputCol="mon_bucket").transform(flights)
flights = OneHotEncoder(inputCols=["carrier_idx", "org_idx", "depart_bucket", "dow_bucket", "mon_bucket"], outputCols=["carrier_dummy", "org_dummy", "depart_dummy", "dow_dummy", "mon_dummy"]).fit(flights).transform(flights)

# Including many as features.
### ห้ามเผลอใส่ตัวแปรต้น เป็นตัวเดียวกับ label เพราะจะกลายเป็น spurious perfect fit

## The instructor reduced the number of observations from 50,000 to 1,000 on DC's browser 
## without telling and this process change the frequency of some dummy variables.  
## For example, OGG airport is a benchmark in the fulll dataset but TUS becomes a benchmark in the exercise.
# https://campus.datacamp.com/courses/machine-learning-with-pyspark/regression-ebb2870c-a2cd-40a0-a282-5604fdb4bd1c?ex=13

flights = VectorAssembler(inputCols=['km','org_dummy', 'depart_dummy', 'dow_dummy', 'mon_dummy'], outputCol='features').transform(flights)
flights[["features", "duration"]].show(5, truncate=False)
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=42)


# Fit linear regression model to training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Make predictions on testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol="duration").evaluate(predictions)
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Ridge regression จริงๆ elasticNetParam ค่าเริ่มต้นเป็น 0 ให้อยู่แล้ว Ridge คือแก้ที่ regParam
ridge = LinearRegression(labelCol="duration", elasticNetParam=0, regParam=0.1).fit(flights_train)

# Lasso regression
lasso = LinearRegression(labelCol="duration", elasticNetParam=1, regParam=0.1).fit(flights_train)

## exercise
# Fit Lasso model (α = 1) to training data
regression = LinearRegression(labelCol="duration", regParam=1, elasticNetParam=1).fit(flights_train)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol="duration").evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([beta==0 for beta in regression.coefficients])
print("Number of coefficients equal to 0:", zero_coeff)

+--------------------------------------------------------+--------+
|features                                                |duration|
+--------------------------------------------------------+--------+
|(32,[0,3,11],[3464.917632,1.0,1.0])                     |351     |
|(32,[0,1,13,17,21],[508.55270400000006,1.0,1.0,1.0,1.0])|82      |
|(32,[0,2,10,19,23],[542.348928,1.0,1.0,1.0,1.0])        |82      |
|(32,[0,1,11,16,30],[1989.149184,1.0,1.0,1.0,1.0])       |195     |
|(32,[0,1,10,20,25],[415.210752,1.0,1.0,1.0,1.0])        |65      |
+--------------------------------------------------------+--------+
only showing top 5 rows

The test RMSE is 10.640898098422634
[0.07440105564835443,27.250459256776892,20.108551239680313,51.717706131537625,45.848484689332444,17.56510475699674,14.966511267973397,17.220627618309504,-15.252081036666475,0.4119592759848925,4.096780309434183,6.8582098158780775,4.619271845164633,8.7646491112333,8.72216284334477,0.13962677223553757,-0.13912740487790085,-0.404

In [52]:
### Pipeline: combining steps above
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, Bucketizer

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")

# เริ่มเปลี่ยนตรงนี้
indexer = StringIndexer(inputCols=["carrier", "org"],  outputCols=["carrier_idx", "org_idx"])#.fit(flights).transform(flights)
onehot = OneHotEncoder(inputCols=["carrier_idx", "org_idx", "dow"], outputCols=["carrier_dummy", "org_dummy", "dow_dummy"])#.fit(flights).transform(flights)

# ใส่มาแค่ "km", "org_dummy", "dow_dummy" ให้เหมือนในแบบฝึกหัด
assembler = VectorAssembler(inputCols=['km','org_dummy', 'dow_dummy'], outputCol='features')#.transform(flights)
regression = LinearRegression(labelCol="duration")
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=42)
# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

# Access the regression coefficients
pipeline.stages[3].coefficients

DenseVector([0.0743, 28.4722, 20.4571, 52.4312, 46.7778, 18.2882, 15.5471, 17.7419, 0.2034, -0.083, -0.4135, 0.0509, -0.0633, -0.0856])

In [None]:
# SMS Exercise again for reference convenience
from pyspark.ml import Pipeline
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

REGEX = '[_():;,.!?\\- 0-9]'

sms = spark.read.csv("sms.csv", inferSchema=True, sep=";")
for i, j in zip(["_c0", "_c1", "_c2"], ["id", "text", "label"]):
    sms = sms.withColumnRenamed(i, j)
sms.show(5)

# Remove punctuation (REGEX provided) and numbers
sms = sms.withColumn('text', regexp_replace("text", '[_():;,.!?\\- 0-9]', ' '))

# Merge multiple spaces
sms = sms.withColumn('text', regexp_replace("text", ' +', ' '))

# Split the text into words
sms = Tokenizer(inputCol='text', outputCol="words").transform(sms)
sms.show(4, truncate=False)

# Remove stop words.
sms = StopWordsRemover(inputCol="words", outputCol="terms").transform(sms)

# Apply the hashing trick
sms = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024).transform(sms)

# Convert hashed symbols to TF-IDF
sms = IDF(inputCol="hash", outputCol="features").fit(sms).transform(sms)
      
sms.select('terms', 'features').show(4, truncate=False)

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy("label", "prediction").count().show()

TP = prediction.where("label = 1 AND prediction = 1").count()
TN = prediction.where("label = 0 AND prediction = 0").count()
FP = prediction.where("label = 0 AND prediction = 1").count()
FN = prediction.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

+---+--------------------+-----+
| id|                text|label|
+---+--------------------+-----+
|  1|Sorry, I'll call ...|    0|
|  2|Dont worry. I gue...|    0|
|  3|Call FREEPHONE 08...|    1|
|  4|Win a 1000 cash p...|    1|
|  5|Go until jurong p...|    0|
+---+--------------------+-----+
only showing top 5 rows

+---+----------------------------------+-----+------------------------------------------+
|id |text                              |label|words                                     |
+---+----------------------------------+-----+------------------------------------------+
|1  |Sorry I'll call later in meeting  |0    |[sorry, i'll, call, later, in, meeting]   |
|2  |Dont worry I guess he's busy      |0    |[dont, worry, i, guess, he's, busy]       |
|3  |Call FREEPHONE now                |1    |[call, freephone, now]                    |
|4  |Win a cash prize or a prize worth |1    |[win, a, cash, prize, or, a, prize, worth]|
+---+----------------------------------+-----+--

In [None]:
### SMS spam pipeline
from pyspark.ml import Pipeline
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Prepare data as in the exercise
REGEX = '[_():;,.!?\\- 0-9]'
sms = spark.read.csv("sms.csv", inferSchema=True, sep=";")
for i, j in zip(["_c0", "_c1", "_c2"], ["id", "text", "label"]):
    sms = sms.withColumnRenamed(i, j)
# Remove punctuation (REGEX provided) and numbers
sms = sms.withColumn('text', regexp_replace("text", '[_():;,.!?\\- 0-9]', ' '))
# Merge multiple spaces
sms = sms.withColumn('text', regexp_replace("text", ' +', ' '))
sms.show(4, truncate=False)

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression(regParam=0.2)
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

# Training data
pipeline = pipeline.fit(sms_train)

# Testing data
predictions = pipeline.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
predictions.groupBy("label", "prediction").count().show()

# คอลัมน์ label เหมือนจะใช้แค่กับ classification เท่านั้น 

+---+----------------------------------+-----+
|id |text                              |label|
+---+----------------------------------+-----+
|1  |Sorry I'll call later in meeting  |0    |
|2  |Dont worry I guess he's busy      |0    |
|3  |Call FREEPHONE now                |1    |
|4  |Win a cash prize or a prize worth |1    |
+---+----------------------------------+-----+
only showing top 4 rows

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   42|
|    0|       0.0|  950|
|    1|       1.0|  104|
+-----+----------+-----+



In [None]:
### Cross Validation

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, Bucketizer

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
indexer = StringIndexer(inputCols=["carrier", "org"],  outputCols=["carrier_idx", "org_idx"])#.fit(flights).transform(flights)
onehot = OneHotEncoder(inputCols=["carrier_idx", "org_idx", "dow"], outputCols=["carrier_dummy", "org_dummy", "dow_dummy"])#.fit(flights).transform(flights)
assembler = VectorAssembler(inputCols=['km', "org_dummy"], outputCol='features')#.transform(flights)
regression = LinearRegression(labelCol="duration")
evaluator = RegressionEvaluator(labelCol="duration")
piped_data = Pipeline(stages=[indexer, onehot, assembler]).fit(flights).transform(flights)
flights_train, flights_test = piped_data.randomSplit([0.8, 0.2], seed=42)

## ขั้นตอนนี้งงๆ ต้องทำ pipeline ก่อน split หรือ split ก่อน pipeline
## รู้แต่ว่า ใน cross validation ต้องมี features มาก่อน เหมือนกับต้องเอา piped data ไป split ไม่ใช่เอา raw data มา split

# Create a grid for parameter
params = ParamGridBuilder().build()

# Create the cross validator object: numFolds คือการเอา train data มาแบ่งเพื่อทำ cross validation
# cv = CrossValidator(estimator=regression, estimatorParamMaps=params, evaluator=evaluator, numFolds=10, seed=13)
# estimator = regression ได้ ถ้าหากไม่ได้ทำ pipeline มา

# ???
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv2 = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=evaluator, numFolds=10, seed=13)

# Apply cross-validation to the training data
cv = cv.fit(flights_train)
cv.avgMetrics


[11.251745641698584]

In [None]:
### Grid Search
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, Bucketizer

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
indexer = StringIndexer(inputCols=["carrier", "org"],  outputCols=["carrier_idx", "org_idx"])#.fit(flights).transform(flights)
onehot = OneHotEncoder(inputCols=["carrier_idx", "org_idx", "dow"], outputCols=["carrier_dummy", "org_dummy", "dow_dummy"])#.fit(flights).transform(flights)
assembler = VectorAssembler(inputCols=['km', "org_dummy"], outputCol='features')#.transform(flights)
evaluator = RegressionEvaluator(labelCol="duration")
piped_data = Pipeline(stages=[indexer, onehot, assembler]).fit(flights).transform(flights)
flights_train, flights_test = piped_data.randomSplit([0.8, 0.2], seed=42)

regression = LinearRegression(labelCol="duration")
#predictions = regression.transform(flights_test)
#params = ParamGridBuilder().addGrid(regression.fitIntercept, [True, False]).build()
params = ParamGridBuilder().addGrid(regression.fitIntercept, [True, False]).addGrid(regression.regParam, [0.001, 0.01, 0.1, 1, 10]).addGrid(regression.elasticNetParam, [0, 0.25, 0.5, 0.75, 1]).build()
cv = CrossValidator(estimator=regression, estimatorParamMaps=params, evaluator=evaluator, numFolds=10, seed=13)
# Apply cross-validation to the training data
cv = cv.fit(flights_train)
print(f'RMSE with intercept is {cv.avgMetrics[0]:.2f}.\nRMSE without intercept is {cv.avgMetrics[1]:.2f}.')

# Make prediction after cross validation
predictions = cv.transform(flights_test)
print("RMSE of tested data = ", RegressionEvaluator(labelCol="duration").evaluate(predictions))

# Retrieve the best parameter
print(cv.bestModel.explainParam('fitIntercept'))
print(cv.bestModel.explainParam("regParam"))
print(cv.bestModel.explainParam("elasticNetParam"))




RMSE with intercept is 11.25.
RMSE without intercept is 11.25.
RMSE of tested data =  11.068149616453239
fitIntercept: whether to fit an intercept term. (default: True, current: True)
regParam: regularization parameter (>= 0). (default: 0.0, current: 0.001)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.25)


In [None]:
### Exercise example on grid search
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, Bucketizer

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=42)
indexer = StringIndexer(inputCols=["carrier", "org"],  outputCols=["carrier_idx", "org_idx"])#.fit(flights).transform(flights)
onehot = OneHotEncoder(inputCols=["carrier_idx", "org_idx", "dow"], outputCols=["carrier_dummy", "org_dummy", "dow_dummy"])#.fit(flights).transform(flights)
assembler = VectorAssembler(inputCols=['km', "org_dummy"], outputCol='features')#.transform(flights)
regression = LinearRegression(labelCol="duration")
evaluator = RegressionEvaluator(labelCol="duration")
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Create parameter grid
params = ParamGridBuilder()

# Add grids for two parameters
params = params.addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0]) \
               .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])

# Build the parameter grid
params = params.build()
print('Number of models to be tested: ', len(params))

# Create cross-validator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)
cv = cv.fit(flights_train)

# Get the best model from cross validation
best_model = cv.bestModel

# Look at the stages in the best model
print(best_model.stages)

# Get the parameters for the LinearRegression object in the best model
best_model.stages[3].extractParamMap()

# Generate predictions on testing data using the best model then calculate RMSE
predictions = best_model.transform(flights_test)
print("RMSE after cross validation = ", evaluator.evaluate(predictions))


Number of models to be tested:  12
[StringIndexerModel: uid=StringIndexer_3dea0e38ba78, handleInvalid=error, numInputCols=2, numOutputCols=2, OneHotEncoderModel: uid=OneHotEncoder_800891249a32, dropLast=true, handleInvalid=error, numInputCols=3, numOutputCols=3, VectorAssembler_a4ae9bcd6f8b, LinearRegressionModel: uid=LinearRegression_3d286bc8fed4, numFeatures=8]
RMSE after cross validation =  11.06828961932966


In [None]:
### SMS spam pipeline Grid Search
from pyspark.ml import Pipeline
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Prepare data as in the exercise
REGEX = '[_():;,.!?\\- 0-9]'
sms = spark.read.csv("sms.csv", inferSchema=True, sep=";")
for i, j in zip(["_c0", "_c1", "_c2"], ["id", "text", "label"]):
    sms = sms.withColumnRenamed(i, j)
# Remove punctuation (REGEX provided) and numbers
sms = sms.withColumn('text', regexp_replace("text", '[_():;,.!?\\- 0-9]', ' '))
# Merge multiple spaces
sms = sms.withColumn('text', regexp_replace("text", ' +', ' '))
sms.show(4, truncate=False)

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
binary_evaluator = BinaryClassificationEvaluator()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Create parameter grid
params = ParamGridBuilder()

# Add grid for hashing trick parameters
params = params.addGrid(hasher.numFeatures, [1024, 4096, 16384]) \
               .addGrid(hasher.binary, [True, False])

# Add grid for logistic regression parameters
params = params.addGrid(logistic.regParam, [0.01, 0.1, 1.0, 10.0]).addGrid(logistic.elasticNetParam, [0.0, 0.5, 1.0])

# Build parameter grid
params = params.build()

# Training data
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=binary_evaluator, numFolds=5)
cv = cv.fit(sms_train)

# Get the best model from cross validation
best_model = cv.bestModel

# Look at the stages in the best model
print(best_model.stages)

# Get the parameters for the LinearRegression object in the best model
best_model.stages[3].extractParamMap()

# Generate predictions on testing data using the best model then calculate RMSE
predictions = best_model.transform(sms_test)
print("RMSE after cross validation = ", binary_evaluator.evaluate(predictions))



+---+----------------------------------+-----+
|id |text                              |label|
+---+----------------------------------+-----+
|1  |Sorry I'll call later in meeting  |0    |
|2  |Dont worry I guess he's busy      |0    |
|3  |Call FREEPHONE now                |1    |
|4  |Win a cash prize or a prize worth |1    |
+---+----------------------------------+-----+
only showing top 4 rows

[Tokenizer_402a220b20f5, StopWordsRemover_862f39bc11e7, HashingTF_a0341b90f27b, IDFModel: uid=IDF_4da3519785f1, numDocs=4478, numFeatures=16384, LogisticRegressionModel: uid=LogisticRegression_8eb2cf6de984, numClasses=2, numFeatures=16384]
RMSE after cross validation =  0.9933237202595511


In [39]:
### Random Forest
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, Bucketizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
flights = flights.withColumn("label", (flights["delay"] >= 15).cast("integer"))
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=42)
indexer = StringIndexer(inputCols=["carrier", "org"],  outputCols=["carrier_idx", "org_idx"])#.fit(flights).transform(flights)
onehot = OneHotEncoder(inputCols=["carrier_idx", "org_idx", "dow"], outputCols=["carrier_dummy", "org_dummy", "dow_dummy"])#.fit(flights).transform(flights)
assembler = VectorAssembler(inputCols=["duration", 'km', "org_dummy"], outputCol='features')#.transform(flights)
pipeline = Pipeline(stages=[indexer, onehot, assembler])
piped_data = pipeline.fit(flights).transform(flights).dropna() # random forest ต้องมีคอลัมน์ label, features และต้องไม่มี missing values
flights_train, flights_test = piped_data.randomSplit([0.8, 0.2], seed=42)

# Create tree classifier
forest = RandomForestClassifier(numTrees=5)
gbt = GBTClassifier(maxIter=10)
tree = DecisionTreeClassifier()

# Fit the training data
forest = forest.fit(flights_train)
gbt = gbt.fit(flights_train)
tree = tree.fit(flights_train)

# How to access trees within forest?
print(forest.trees)
print(gbt.trees)

# Consensus predictions
predictions_forest = forest.transform(flights_test)
predictions_gbt = gbt.transform(flights_test)
predictions_tree = tree.transform(flights_test)
predictions[["label", "probability", "prediction"]].show(5, truncate=False)


# Confusion matrix
predictions_forest.groupBy("label", "prediction").count().show()
TP = predictions_forest.where("label = 1 AND prediction = 1").count()
TN = predictions_forest.where("label = 0 AND prediction = 0").count()
FP = predictions_forest.where("label = 0 AND prediction = 1").count()
FN = predictions_forest.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is of Random Forest is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

predictions_gbt.groupBy("label", "prediction").count().show()
TP = predictions_gbt.where("label = 1 AND prediction = 1").count()
TN = predictions_gbt.where("label = 0 AND prediction = 0").count()
FP = predictions_gbt.where("label = 0 AND prediction = 1").count()
FN = predictions_gbt.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is of Gradient Boosted Trees is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

predictions_tree.groupBy("label", "prediction").count().show()
TP = predictions_tree.where("label = 1 AND prediction = 1").count()
TN = predictions_tree.where("label = 0 AND prediction = 0").count()
FP = predictions_tree.where("label = 0 AND prediction = 1").count()
FN = predictions_tree.where("label = 1 AND prediction = 0").count()
print(f"The accuracy is of Decision Trees is {(TP+TN)/(TP+TN+FP+FN)*100:.2f}%")

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
print("The AUC of Decision Tree = ", evaluator.evaluate(predictions_tree))
print("The AUC of Gradient Boosted Tree = ", evaluator.evaluate(predictions_gbt))
print("The AUC of Random Forest = ", evaluator.evaluate(predictions_forest))

# Feature importances: index เรียงตามลำดับของตัวแปรที่ใส่ใน inputCols ขั้นตอน assembele
display(forest.featureImportances)
flights_train[["org", "org_dummy"]].distinct().sort("org_idx").show()
print(f"index เรียงตามลำดับของตัวแปรที่ใส่ใน inputCols ขั้นตอน assembele \nแสดงว่า duration = {forest.featureImportances[0]:.2f} มีผลมากที่สุด ส่วนสนามบิน ORG มีผลอันดับสอง = {forest.featureImportances[2]:.2f} ส่วนสนามบิน LGA มีผลน้อยที่สุด = {forest.featureImportances[5]:.2f}.")





[DecisionTreeClassificationModel: uid=dtc_0bade214d955, depth=5, numNodes=27, numClasses=2, numFeatures=9, DecisionTreeClassificationModel: uid=dtc_b686d4d2abe8, depth=5, numNodes=19, numClasses=2, numFeatures=9, DecisionTreeClassificationModel: uid=dtc_e471dbde3cea, depth=5, numNodes=11, numClasses=2, numFeatures=9, DecisionTreeClassificationModel: uid=dtc_d18e5513250c, depth=4, numNodes=17, numClasses=2, numFeatures=9, DecisionTreeClassificationModel: uid=dtc_1174bfa385ae, depth=5, numNodes=21, numClasses=2, numFeatures=9]
[DecisionTreeRegressionModel: uid=dtr_15dc9ea878cf, depth=5, numNodes=63, numFeatures=9, DecisionTreeRegressionModel: uid=dtr_6395b7c83f2d, depth=5, numNodes=49, numFeatures=9, DecisionTreeRegressionModel: uid=dtr_45e460e87f21, depth=5, numNodes=49, numFeatures=9, DecisionTreeRegressionModel: uid=dtr_3070118b8781, depth=5, numNodes=57, numFeatures=9, DecisionTreeRegressionModel: uid=dtr_bd53fc515015, depth=5, numNodes=49, numFeatures=9, DecisionTreeRegressionModel:

SparseVector(9, {0: 0.2227, 1: 0.1373, 2: 0.1668, 3: 0.1, 4: 0.0317, 5: 0.0308, 6: 0.141, 7: 0.1058, 8: 0.0639})

+---+-------------+
|org|    org_dummy|
+---+-------------+
|ORD|(7,[0],[1.0])|
|SFO|(7,[1],[1.0])|
|JFK|(7,[2],[1.0])|
|LGA|(7,[3],[1.0])|
|SJC|(7,[4],[1.0])|
|SMF|(7,[5],[1.0])|
|TUS|(7,[6],[1.0])|
|OGG|    (7,[],[])|
+---+-------------+

index เรียงตามลำดับของตัวแปรที่ใส่ใน inputCols ขั้นตอน assembele 
แสดงว่า duration = 0.22 มีผลมากที่สุด ส่วนสนามบิน ORG มีผลอันดับสอง = 0.17 ส่วนสนามบิน LGA มีผลน้อยที่สุด = 0.03.


In [76]:
### Random Forest Cross validation เอาตามโจทย์เลย ไม่มีการทำ indexer, onehot ใดๆทั้งสิ้น
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

flights = spark.read.csv("flights.csv", header=True, inferSchema=True, nullValue="NA")
flights = flights.withColumn("km", flights["mile"]*1.609344).drop("mile")
flights = flights.withColumn("label", (flights["delay"] >= 15).cast("integer"))
flights = flights[["mon", "depart", "duration", "label"]].dropna()
flights = VectorAssembler(inputCols=["mon", "depart", "duration"], outputCol='features').transform(flights)
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=42)
forest = RandomForestClassifier()
evaluator = BinaryClassificationEvaluator()

# Create a parameter grid
# featureSubsetStrategy — the number of features to consider for splitting at each node.
# maxDepth — the maximum number of splits along any branch.
params = ParamGridBuilder() \
            .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
            .addGrid(forest.maxDepth, [2, 5, 10, 20, 25]) \
            .build()

# Create a cross-validator
cv = CrossValidator(estimator=forest, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)
cv = cv.fit(flights_train)

# Get the best model from cross validation
best_model = cv.bestModel

# Generate predictions on testing data using the best model then calculate RMSE
predictions = best_model.transform(flights_test)
print("AUC after cross validation = ", evaluator.evaluate(predictions))
print("Optimal parameters")
print(best_model.explainParam('featureSubsetStrategy'))
print(best_model.explainParam("maxDepth"))



AUC after cross validation =  0.6796854991144257
Optimal parameters
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto' (default: auto, current: onethird)
maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 10)
