In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# Machine Learning with PySpark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Get (or create) the SparkSession used by the rest of the notebook,
# requesting a local master with two worker threads
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('first_spark_ML_application')
    .getOrCreate()
)

![alt text](https://learning.oreilly.com/library/view/spark-the-definitive/9781491912201/assets/spdg_2402.png)

## What is PySpark MLlib?
Spark MLlib is short for Spark Machine Learning library. Machine learning in PySpark is easy to use and scalable, and it works on distributed systems. We use Spark machine learning for data analysis. PySpark MLlib gives us a variety of machine learning algorithms, such as regression and classification.

## Data 



### flight dataset 
Some airline flight data stored in a csv file


#### Data dictionary
mon --> month (integer between 0 and 11; note that the sample rows shown below contain both 0 and 11)

dom --> day of month (integer between 1 and 31)

dow --> day of week (integer; 1 = Monday and 7 = Sunday)

org --> origin airport (IATA code)

mile --> distance (miles)

carrier --> carrier (IATA code)

depart --> departure time (decimal hour)

duration --> expected duration (minutes)

delay --> delay (minutes)


### Loading Data

**Creating a SparkSession**

In [None]:
# Load the flight records: the first line holds the column names, column types are
# inferred from the data, and the literal string 'NA' is read as null
flights = spark.read.csv(
    'flights.csv', sep=',', header=True, inferSchema=True, nullValue='NA'
)

# Report how many rows were loaded
n_records = flights.count()
print("The data contain %d records." % n_records)

# Preview the first five rows
flights.show(5)

# (column name, Spark type) pairs, displayed as the cell result
flights.dtypes

The data contain 50000 records.
+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

### Data Preparation

In [None]:
# Preview the first five rows of the loaded flight data
flights.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



#### Removing columns and Rows

Do we need all the columns?
* Removing an uninformative column

In any machine learning project, we always have a few columns that are not required for solving the problem.

In [None]:
# Remove the 'flight' column; `flights` is rebound to the narrower DataFrame
flights = flights.drop('flight')

Filtering out missing data
* removing rows which do not have information about whether or not a flight was delayed.

It is important to check the number of missing values present in all the columns. Knowing the count helps us treat the missing values before building any machine learning model using that data.

In [None]:
# How many rows have no recorded 'delay'?
flights.where(flights.delay.isNull()).count()

2978

In [None]:
# Keep only the rows whose 'delay' is known
flights = flights.where(flights.delay.isNotNull())

In [None]:
# Discard every row that still has a null in any column, then report how many rows remain
flights = flights.na.drop()
print(flights.count())

47022


#### Column manipulation

The Federal Aviation Administration (FAA) considers a flight to be "delayed" when it arrives **15 minutes or more** after its scheduled time.

The next step of preparing the flight data has two parts:

1. convert the units of distance, replacing the mile column with a km column; and
2. create a Boolean column indicating whether or not a flight was delayed.

In [None]:
# Import the round function
from pyspark.sql.functions import round

# Replace 'mile' with 'km': multiply by 1.60934 and round to 0 decimal places
distance_km = round(flights['mile'] * 1.60934, 0)
flights_km = flights.withColumn('km', distance_km).drop('mile')

# Add 'label': 1 when the delay is at least 15 minutes, 0 otherwise.
# The comparison yields a boolean column, which cast('integer') turns into 0/1.
is_delayed = flights_km['delay'] >= 15
flights_km = flights_km.withColumn('label', is_delayed.cast('integer'))

# Check first five records
flights_km.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



#### Categorical columns
Categorical variables represent types of data which may be divided into groups. Examples of categorical variables are race, sex, age group, and educational level.

Most machine learning algorithms accept the data only in numerical form. So, it is essential to convert any categorical variables present in our dataset into numbers.

Remember that we cannot simply drop them from our dataset as they might contain useful information. It would be a nightmare to lose that just because we don’t want to figure out how to use them!

In the flights data there are two columns, carrier and org, which hold categorical data. We need to transform these columns into indexed numerical values.

* Import the appropriate class and create an indexer object to transform the carrier column from a string to a numeric index.
* Prepare the indexer object on the flight data.
* Use the prepared indexer to create the numeric index column.
* Repeat the process for the org column.

**StringIndexer** assigns a unique integer value to each category. 0 is assigned to the most frequent category, 1 to the next most frequent value, and so on. We have to define the input column name that we want to index and the output column name in which we want the results

In [None]:
from pyspark.ml.feature import StringIndexer

# Estimator that maps each 'carrier' string to a numeric index in 'carrier_idx'
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Fitting lets the indexer identify the categories present in the data
indexer_model = indexer.fit(flights_km)

# The fitted model appends the numeric index column
flights_indexed = indexer_model.transform(flights_km)

# Same three steps for the 'org' column, written out instead of chained
org_indexer = StringIndexer(inputCol='org', outputCol='org_idx')
org_indexer_model = org_indexer.fit(flights_indexed)
flights_indexed = org_indexer_model.transform(flights_indexed)

*Transformers* are functions that convert raw data in some way. This might be to create a new interaction variable (from two other variables), normalize a column, or simply change an Integer into a Double type to be input into a model. An example of a transformer is one that converts string categorical variables into numerical values that can be used in MLlib. Transformers are primarily used in preprocessing and feature engineering. Transformers take a DataFrame as input and produce a new DataFrame as output, as illustrated below:
![alt text](https://learning.oreilly.com/library/view/spark-the-definitive/9781491912201/assets/spdg_2403.png)

In [None]:
# Preview the first five rows, now including the 'carrier_idx' and 'org_idx' columns
flights_indexed.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
only showing top 5 rows



In [None]:
# Summary statistics (count, mean, stddev, min, max) for the 'org_idx' column
flights_indexed.select('org_idx').describe().show()

+-------+------------------+
|summary|           org_idx|
+-------+------------------+
|  count|             47022|
|   mean|1.6448896261324486|
| stddev|1.8366570435032554|
|    min|               0.0|
|    max|               7.0|
+-------+------------------+



* 0.0 --> is the most frequent value
* 7.0 --> is the least frequent value

#### Assembling columns

The final stage of data preparation is to consolidate all of the predictor columns into a single column.

At present our data has the following predictor columns:

* mon, dom and dow
* carrier_idx (derived from carrier)
* org_idx (derived from org)
* km
* depart
* duration

**A vector assembler** combines a given list of columns into a single vector column

In [None]:
# Import the VectorAssembler class
from pyspark.ml.feature import VectorAssembler

# Predictor columns, in the order they will appear inside the assembled vector
predictor_cols = ['mon', 'dom', 'dow', 'carrier_idx',
                  'org_idx', 'km', 'depart', 'duration']

# The assembler packs the predictor columns into a single 'features' vector column
assembler = VectorAssembler(inputCols=predictor_cols, outputCol='features')
flights_assembled = assembler.transform(flights_indexed)

# Show the assembled vectors next to the label, without truncating the vector text
flights_assembled.select('features', 'label').show(5, truncate=False)

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |1    |
|[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |0    |
|[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|0    |
|[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |0    |
|[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |1    |
+-----------------------------------------+-----+
only showing top 5 rows



## Classification
Predict whether or not a given flight will be delayed.

### Decision Tree
Decision tree is the most powerful and popular tool for classification and prediction. A Decision tree is a flowchart like tree structure, where each internal node denotes a test on an attribute, each branch represents an outcome of the test, and each leaf node (terminal node) holds a class label.

![Decision tree](DT.png)

#### Classifying flights

In [None]:
# Preview the first five rows, including the assembled 'features' column
flights_assembled.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+--------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|            features|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+--------------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|[0.0,22.0,2.0,0.0...|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|[2.0,20.0,4.0,0.0...|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|[9.0,13.0,1.0,1.0...|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|[5.0,2.0,1.0,0.0,...|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|[7.0,2.0,6.0,1.0,...|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+--------------------+
only showing top 5 rows



#### Split train/test
Split data into training and testing sets

In [None]:
# Randomly split the rows 80:20 (fixed seed), then keep only the two columns the classifiers need
train_full, test_full = flights_assembled.randomSplit([0.8, 0.2], seed=17)
flights_train = train_full.select('features', 'label')
flights_test = test_full.select('features', 'label')

In [None]:
# Check that the training set has around 80% of the records.
# The denominator is the DataFrame that was actually split (`flights_assembled`), so the
# ratio stays correct even if `flights` is rebound to something else elsewhere in the notebook.
training_ratio = flights_train.count() / flights_assembled.count()
print(training_ratio)

0.7980732423121092


#### Build a Decision Tree model

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

Create a Decision Tree classifier.

In [None]:
# Unfitted decision-tree estimator with default parameters
tree = DecisionTreeClassifier()

Learn from the training data

In [None]:
# Fit a decision tree on the training data and bind the fitted model to `tree`.
# The estimator is constructed inline so this cell can be re-run: `tree = tree.fit(...)`
# fails on a second run because `tree` is by then a fitted model, which has no fit().
tree = DecisionTreeClassifier().fit(flights_train)

#### Evaluating the DT model
Make predictions on the testing data and compare to known values.

In [None]:
# Apply the fitted tree to the held-out test rows
prediction = tree.transform(flights_test)

In [None]:
# Known label, predicted label and class probabilities for the first five test rows (untruncated)
prediction.select('label', 'prediction', 'probability').show(5, truncate=False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.48838709677419356,0.5116129032258064]|
|1    |1.0       |[0.3543011744450593,0.6456988255549406] |
|1    |1.0       |[0.3543011744450593,0.6456988255549406] |
|1    |1.0       |[0.3543011744450593,0.6456988255549406] |
|1    |1.0       |[0.3543011744450593,0.6456988255549406] |
+-----+----------+----------------------------------------+
only showing top 5 rows



##### Confusion Matrix

A confusion matrix gives a useful breakdown of predictions versus known values. It has four cells which represent the counts of:

* True Negatives (TN) — model predicts negative outcome & known outcome is negative
* True Positives (TP) — model predicts positive outcome & known outcome is positive
* False Negatives (FN) — model predicts negative outcome but known outcome is positive
* False Positives (FP) — model predicts positive outcome but known outcome is negative.

In [None]:
# Row counts for every (label, prediction) combination
prediction.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1087|
|    0|       0.0| 2286|
|    1|       1.0| 3737|
|    0|       1.0| 2385|
+-----+----------+-----+



In [None]:
# Count the four confusion-matrix cells; each line filters the predictions and counts the rows
TP = prediction.where('prediction = 1 AND label = prediction').count()
TN = prediction.where('prediction = 0 AND label = prediction').count()
FP = prediction.where('prediction = 1 AND label = 0').count()
FN = prediction.where('prediction = 0 AND label = 1').count()

##### Accuracy
Accuracy is one metric for evaluating classification models. Informally, accuracy is the fraction of predictions our model got right. Formally, accuracy has the following definition

$$Accuracy = \frac{(TN + TP)}{(TN + TP + FN + FP)}$$

In [None]:
# Accuracy = correct predictions / all predictions
n_correct = TN + TP
n_total = TN + TP + FN + FP
accuracy = n_correct / n_total
print(accuracy)

0.634333859926277


### Logistic Regression
In statistics, the logistic model (or logit model) is used to model the probability of a certain class or event existing such as pass/fail, win/lose, alive/dead or healthy/sick. This can be extended to model several classes of events such as determining whether an image contains a cat, dog, lion, etc. Each object being detected in the image would be assigned a probability between 0 and 1 and the sum adding to one.

#### Build a Logistic Regression model

In [None]:
from pyspark.ml.classification import LogisticRegression

**Create a logistic Regression classifier**

In [None]:
# Unfitted logistic-regression estimator with default parameters
logistic = LogisticRegression()

**Learn from the training data**

In [None]:
# Fit a logistic regression on the training data and bind the fitted model to `logistic`.
# The estimator is constructed inline so this cell can be re-run: `logistic = logistic.fit(...)`
# fails on a second run because `logistic` is by then a fitted model, which has no fit().
logistic = LogisticRegression().fit(flights_train)

**Predictions**

In [None]:
# Apply the fitted logistic model to the held-out test rows (rebinds `prediction`)
prediction = logistic.transform(flights_test)

In [None]:
# Known label, predicted label and class probabilities for the first five test rows (untruncated)
prediction.select('label', 'prediction', 'probability').show(5, truncate=False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |0.0       |[0.5033470048952998,0.49665299510470023]|
|1    |1.0       |[0.35967532714510964,0.6403246728548905]|
|1    |1.0       |[0.3071406970789145,0.6928593029210856] |
|1    |1.0       |[0.2149373998372294,0.7850626001627706] |
|1    |1.0       |[0.2742323639584431,0.7257676360415569] |
+-----+----------+----------------------------------------+
only showing top 5 rows



In [None]:
# Confusion matrix as a table: row counts for every (label, prediction) combination
prediction.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1652|
|    0|       0.0| 2645|
|    1|       1.0| 3172|
|    0|       1.0| 2026|
+-----+----------+-----+



#### Evaluate the Logistic Regression model

##### Confusion matrix

In [None]:
# Count the four confusion-matrix cells; each line filters the predictions and counts the rows
TP = prediction.where('prediction = 1 AND label = prediction').count()
TN = prediction.where('prediction = 0 AND label = prediction').count()
FP = prediction.where('prediction = 1 AND label = 0').count()
FN = prediction.where('prediction = 0 AND label = 1').count()

##### Precision

Precision attempts to answer the following question:


*What proportion of positive identifications was actually correct?*


$$Precision (positive) = \frac{TP}{(TP + FP)}$$



##### Recall
Recall attempts to answer the following question:

*What proportion of actual positives was identified correctly?*


$$Recall (positive) = \frac{TP}{(TP + FN)}$$

In [None]:
# Precision: share of predicted positives that are truly positive
predicted_positive = TP + FP
precision = TP / predicted_positive

# Recall: share of actual positives that were predicted positive
actual_positive = TP + FN
recall = TP / actual_positive

print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

precision = 0.61
recall    = 0.66


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [None]:
# Evaluate the predictions, overriding the evaluator's metric for this one call only
evaluator = MulticlassClassificationEvaluator()
metric_override = {evaluator.metricName: 'weightedPrecision'}
evaluator.evaluate(prediction, metric_override)

0.6128474273772628

### SMS dataset
The file sms.csv contains a selection of SMS messages which have been classified as either 'spam' or 'ham'. These data have been adapted from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). There are a total of 5574 SMS, of which 747 have been labelled as spam.

Notes on CSV format:
* no header record and
* fields are separated by a semicolon (this is not the default separator).

#### Loading SMS spam data
In the SMS data the header is missing. So we need to create the data schema

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

We can define the custom schema for our dataframe in Spark. For this, we need to create an object of StructType which takes a list of StructField. And of course, we should define StructField with a column name, the data type of the column and whether null values are allowed for the particular column or not.


In [None]:
# Column names and types for the header-less SMS file
schema = (
    StructType()
    .add("id", IntegerType())
    .add("text", StringType())
    .add("label", IntegerType())
)

# Read the semicolon-delimited file using the explicit schema
sms = spark.read.csv('sms.csv', schema=schema, sep=';', header=False)

# Show the resulting column names and types
sms.printSchema()

root
 |-- id: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



#### Turning Text into Tables

##### Data dictionary
id — record identifier

text — content of SMS message

label — spam or ham (integer; 0 = ham and 1 = spam)

In [None]:
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Replace punctuation characters with a space
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
# Replace digits with a space. Raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a DeprecationWarning/SyntaxWarning; the pattern itself is unchanged.
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, r'\d', ' '))

# Merge runs of spaces into a single space
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words (adds a 'words' column)
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

+---+----------------------------------+-----+------------------------------------------+
|id |text                              |label|words                                     |
+---+----------------------------------+-----+------------------------------------------+
|1  |Sorry I'll call later in meeting  |0    |[sorry, i'll, call, later, in, meeting]   |
|2  |Dont worry I guess he's busy      |0    |[dont, worry, i, guess, he's, busy]       |
|3  |Call FREEPHONE now                |1    |[call, freephone, now]                    |
|4  |Win a cash prize or a prize worth |1    |[win, a, cash, prize, or, a, prize, worth]|
+---+----------------------------------+-----+------------------------------------------+
only showing top 4 rows



In [None]:
# Keep only id, words and label; note that `sms` is rebound here and no longer has the 'text' column
sms = wrangled.select(['id', 'words', 'label'])

In [None]:
# Preview the first five tokenized messages
sms.show(5)

+---+--------------------+-----+
| id|               words|label|
+---+--------------------+-----+
|  1|[sorry, i'll, cal...|    0|
|  2|[dont, worry, i, ...|    0|
|  3|[call, freephone,...|    1|
|  4|[win, a, cash, pr...|    1|
|  5|[go, until, juron...|    0|
+---+--------------------+-----+
only showing top 5 rows



#### Stop Words
A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

In [None]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Filter stop words out of 'words', writing the remaining tokens to 'terms'
stop_word_remover = StopWordsRemover(inputCol='words', outputCol='terms')
wrangled = stop_word_remover.transform(sms)
wrangled.show(5)

+---+--------------------+-----+--------------------+
| id|               words|label|               terms|
+---+--------------------+-----+--------------------+
|  1|[sorry, i'll, cal...|    0|[sorry, call, lat...|
|  2|[dont, worry, i, ...|    0|[dont, worry, gue...|
|  3|[call, freephone,...|    1|   [call, freephone]|
|  4|[win, a, cash, pr...|    1|[win, cash, prize...|
|  5|[go, until, juron...|    0|[go, jurong, poin...|
+---+--------------------+-----+--------------------+
only showing top 5 rows



#### Hashing Trick
Hashing Trick is a fast and space-efficient way of vectorizing features, i.e. turning arbitrary features into indices in a vector or matrix.

In [None]:
# Hash each message's terms into a 1024-slot term-frequency vector stored in 'hash'
hasher_1024 = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024)
wrangled = hasher_1024.transform(wrangled)
wrangled.show(5)

+---+--------------------+-----+--------------------+--------------------+
| id|               words|label|               terms|                hash|
+---+--------------------+-----+--------------------+--------------------+
|  1|[sorry, i'll, cal...|    0|[sorry, call, lat...|(1024,[138,344,37...|
|  2|[dont, worry, i, ...|    0|[dont, worry, gue...|(1024,[53,233,329...|
|  3|[call, freephone,...|    1|   [call, freephone]|(1024,[138,396],[...|
|  4|[win, a, cash, pr...|    1|[win, cash, prize...|(1024,[31,69,387,...|
|  5|[go, until, juron...|    0|[go, jurong, poin...|(1024,[116,262,33...|
+---+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



#### TFIDF
TF-IDF (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents.

In [None]:
# Fit IDF weights on the hashed term frequencies, then rescale them into 'features'
idf_model = IDF(inputCol='hash', outputCol='features').fit(wrangled)
tf_idf = idf_model.transform(wrangled)

# Compare the terms with their TF-IDF vectors (untruncated)
tf_idf.select('terms', 'features').show(4, truncate=False)

+--------------------------------+----------------------------------------------------------------------------------------------------+
|terms                           |features                                                                                            |
+--------------------------------+----------------------------------------------------------------------------------------------------+
|[sorry, call, later, meeting]   |(1024,[138,344,378,1006],[2.2391682769656747,2.892706319430574,3.684405173719015,4.244020961654438])|
|[dont, worry, guess, busy]      |(1024,[53,233,329,858],[4.618714411095849,3.557143394108088,4.618714411095849,4.937168142214383])   |
|[call, freephone]               |(1024,[138,396],[2.2391682769656747,3.3843005812686773])                                            |
|[win, cash, prize, prize, worth]|(1024,[31,69,387,428],[3.7897656893768414,7.284881949239966,4.4671645129686475,3.898659777615979])  |
+--------------------------------+--------------

In [None]:
# Rebind `sms` to the TF-IDF DataFrame so the next cell trains on its 'features' column
sms = tf_idf

#### Train and Test

In [None]:
# 80:20 train/test split with a fixed seed
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Logistic regression with regularisation parameter 0.2, fitted on the training rows
sms_classifier = LogisticRegression(regParam=0.2)
logistic = sms_classifier.fit(sms_train)

# Score the held-out rows
prediction = logistic.transform(sms_test)

# Confusion matrix: row counts per (label, prediction) pair
prediction.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   47|
|    0|       0.0|  987|
|    1|       1.0|  124|
|    0|       1.0|    3|
+-----+----------+-----+



### Regression


We want to build a regression model to predict flight duration

#### One-Hot Encoding
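Using index values to represent a categorical column is not going to work in a regression problem, because the model would treat the index as a quantity: each numeric value would carry a different weight even though the categories have no natural order.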


##### Dummy Variables: sparse representation

**Dummy Variable**

Each categorical level becomes a column.

![alt text](https://www.mathworks.com/help/stats/dummy1.png)

*This approach can increase the size of our dataset.*


**Dummy variable: sparse representation**

<img src ='/content/dumy_variable.png'/>

![title](/content/dumy_variable.png)


Sparse representation: store column index and value.

##### Applying one_hot encoding

In [None]:
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator
# Create an (unfitted) one hot encoder that reads 'org_idx' and writes 'org_dummy'
onehot = OneHotEncoderEstimator(inputCols=['org_idx'] , outputCols=['org_dummy'])

In [None]:
# Fit the one hot encoder on the flights data and bind the fitted model to `onehot`.
# The estimator is constructed inline so this cell can be re-run: `onehot = onehot.fit(...)`
# fails on a second run because `onehot` is by then a fitted model, which has no fit().
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy']).fit(flights_assembled)

# Append the 'org_dummy' column; `flights` is rebound to the encoded DataFrame
flights = onehot.transform(flights_assembled)

In [None]:
# Check the number of categories the fitted encoder found for each input column
onehot.categorySizes

[8]

In [None]:
# Preview the encoder's input DataFrame (it has no 'org_dummy' column)
flights_assembled.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+--------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|            features|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+--------------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|[0.0,22.0,2.0,0.0...|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|[2.0,20.0,4.0,0.0...|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|[9.0,13.0,1.0,1.0...|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|[5.0,2.0,1.0,0.0,...|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|[7.0,2.0,6.0,1.0,...|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+--------------------+
only showing top 5 rows



In [None]:
# One row per airport: its code, its index and its sparse dummy vector, ordered by index
org_encoding = flights.select('org', 'org_idx', 'org_dummy').distinct()
org_encoding.orderBy('org_idx').show()

+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SMF|    4.0|(7,[4],[1.0])|
|SJC|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



#### Regression Model
![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Linear_regression.svg/1920px-Linear_regression.svg.png)

#### Loss Function
How good is the model?

$$MSE = \frac{1}{N}\sum_{i=1}^{N}(y_i - \hat{y_i})^2$$

MSE: "Mean Square Error"
Where: 

$y_i$ -- observed values

$\hat{y_i}$ -- model value

**Don't worry**: Spark will do all the calculations for you.

##### Build regression model

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Create a regression object that predicts the 'duration' column
# NOTE(review): no featuresCol is given, so the default feature column is used; if that is the
# 'features' vector assembled earlier in this notebook, it already contains 'duration', i.e. the
# label is among the predictors. Confirm whether that is intended.
regression = LinearRegression(labelCol='duration')

In [None]:
# Split into training and testing sets in a 80:20 ratio (fixed seed); rebinds flights_train / flights_test
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

Fit to flights_train

In [None]:
# Train a linear regression for 'duration' and bind the fitted model to `regression`.
# The estimator is constructed inline so this cell can be re-run: `regression = regression.fit(...)`
# fails on a second run because `regression` is by then a fitted model, which has no fit().
regression = LinearRegression(labelCol='duration').fit(flights_train)

In [None]:
# Score the test rows, then show observed and predicted duration side by side (untruncated)
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, truncate=False)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|240     |240.00000000000017|
|160     |160.00000000000003|
|130     |129.99999999999997|
|275     |275.00000000000006|
|85      |84.99999999999994 |
+--------+------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error of the predictions against 'duration'
# ('rmse' is the evaluator's default metric; it is spelled out here for readability)
rmse_evaluator = RegressionEvaluator(labelCol='duration', metricName='rmse')
rmse_evaluator.evaluate(predictions)

8.447208869735431e-14

In [None]:
# Re-split starting from `flights_km` (no index, dummy or feature columns yet) for the pipeline below
km_splits = flights_km.randomSplit([0.8, 0.2], seed=17)
flights_train, flights_test = km_splits
flights_train.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
|  0|  1|  2|     AA|JFK|  6.58|     230|   50|2570.0|    1|
|  0|  1|  2|     AA|JFK|   7.0|     385|  -16|4162.0|    0|
|  0|  1|  2|     AA|JFK|  12.0|     370|   11|3983.0|    0|
|  0|  1|  2|     AA|JFK|  17.0|     379|  -10|3983.0|    0|
|  0|  1|  2|     AA|LGA|  8.25|     250|   27|2235.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



In [None]:
# Stage 1: string airport code -> numeric index
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# Stage 2: one-hot encode the airport index and the day of week
onehot = OneHotEncoderEstimator(inputCols=['org_idx', 'dow'],
                                outputCols=['org_dummy', 'dow_dummy'])

# Stage 3: pack the predictors into a single 'features' vector
pipeline_predictors = ['km', 'org_dummy', 'dow_dummy']
assembler = VectorAssembler(inputCols=pipeline_predictors, outputCol='features')

# Stage 4: linear regression predicting 'duration'
regression = LinearRegression(labelCol='duration')

In [None]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline from the four stages defined above
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data. The fitted model gets its own name so that
# `pipeline` stays an estimator; rebinding it to the fitted model would make a re-run of
# the fit fail, because a fitted pipeline model has no fit().
flights_pipeline_model = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = flights_pipeline_model.transform(flights_test)

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Stage 1: split the 'text' column into a 'words' token array
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Stage 2: drop stop words from the tokenizer's output
tokens_col = tokenizer.getOutputCol()
remover = StopWordsRemover(inputCol=tokens_col, outputCol='terms')

# Stage 3: hash the remaining terms into a term-frequency vector
terms_col = remover.getOutputCol()
hasher = HashingTF(inputCol=terms_col, outputCol="hash")

# Stage 4: rescale the term frequencies to TF-IDF, written to 'features'
hashed_col = hasher.getOutputCol()
idf = IDF(inputCol=hashed_col, outputCol="features")

# Stage 5: logistic regression; then chain all five stages into one pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

In [None]:
# The pipeline starts from a 'text' column, but `sms` was rebound earlier in this notebook
# to the TF-IDF DataFrame, which has no 'text' and already contains 'words', 'terms', 'hash'
# and 'features'. Reload the raw messages so this cell works on a fresh Run All.
sms_text = spark.read.csv('sms.csv', sep=';', header=False, schema=schema)

# Same cleanup as the step-by-step version: punctuation and digits become spaces,
# then runs of spaces are merged
sms_text = sms_text.withColumn('text', regexp_replace(sms_text['text'], '[_():;,.!?\\-]', ' '))
sms_text = sms_text.withColumn('text', regexp_replace(sms_text['text'], r'\d', ' '))
sms_text = sms_text.withColumn('text', regexp_replace(sms_text['text'], ' +', ' '))

# Split the data into training and testing sets
sms_train, sms_test = sms_text.randomSplit([0.8, 0.2], seed = 13)

# Fit the whole pipeline on the training data; the fitted model gets its own name so that
# `pipeline` stays an estimator and this cell can be re-run
sms_pipeline_model = pipeline.fit(sms_train)

# Make predictions on the testing data
prediction = sms_pipeline_model.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   40|
|    0|       0.0|  989|
|    1|       1.0|  131|
|    0|       1.0|    1|
+-----+----------+-----+

