<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Linear-Regression-Consulting-Project" data-toc-modified-id="Linear-Regression-Consulting-Project-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Linear Regression Consulting Project</a></span></li><li><span><a href="#Read-the-data" data-toc-modified-id="Read-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read the data</a></span></li><li><span><a href="#String-Indexing" data-toc-modified-id="String-Indexing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>String Indexing</a></span></li><li><span><a href="#Use-VectorAssembler-to-get-final-data" data-toc-modified-id="Use-VectorAssembler-to-get-final-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Use VectorAssembler to get final data</a></span></li><li><span><a href="#train-test-split" data-toc-modified-id="train-test-split-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>train test split</a></span></li><li><span><a href="#Build-the-model" data-toc-modified-id="Build-the-model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Build the model</a></span></li><li><span><a href="#Evaluate-the-model" data-toc-modified-id="Evaluate-the-model-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Evaluate the model</a></span></li><li><span><a href="#Test-predictions" data-toc-modified-id="Test-predictions-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Test predictions</a></span></li><li><span><a href="#Check-corrrelations" data-toc-modified-id="Check-corrrelations-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Check corrrelations</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('lrex').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.regression import LinearRegression

In [3]:
def show_method_attributes(method, ncols=2):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    x = [I for I in dir(method) if I[0].islower()]
    x = [I for I in x if I not in 'os np pd sys time psycopg2'.split()]

    return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')

# Linear Regression Consulting Project

<font color='gray'> Your job is to create a regression model
that will help predict how many crew
members will be needed for future ships. </font>

> In other words, use the features you think
will be useful to predict the value in the
Crew column.

> The cruise line value is a string however!
We haven't covered exactly how to
convert strings to numbers with Python
and Spark (yet)
Try to see if you can discover how to use
Stringlndexer from the documentation!

# Read the data

In [4]:
!head -2 ../data/cruise_ship_info.csv

Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55


In [28]:
df = spark.read.csv('../data/cruise_ship_info.csv',
                         header=True, inferSchema=True)
print(df.count())
df.show()

158
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 

In [29]:
print(df.columns)

['Ship_name', 'Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'crew']


In [30]:
df.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



# String Indexing

In [31]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df)
            for column in ['Cruise_line'] ]


pipeline = Pipeline(stages=indexers)
df = pipeline.fit(data).transform(df)

df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_index|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|              1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|              1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|              1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|       

In [48]:
print(df.columns)

['Ship_name', 'Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'crew', 'Cruise_line_index']


# Use VectorAssembler to get final data

In [8]:
inputCols = ['Age', 'Tonnage', 'passengers', 'length',
             'cabins', 'passenger_density', 'Cruise_line_index', 'crew']
assembler = VectorAssembler(inputCols=inputCols, outputCol='features')

In [32]:
output = assembler.transform(df)

In [33]:
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- Cruise_line_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [49]:
output.select('features', 'crew').show(2)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
+--------------------+----+
only showing top 2 rows



In [36]:
labelCol = 'crew'
final_data = output.select('features', labelCol)
final_data.show(2)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
+--------------------+----+
only showing top 2 rows



# train test split

In [50]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               111|
|   mean|  7.71774774774776|
| stddev|3.7361892799736913|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [51]:
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                47|
|   mean| 7.974680851063829|
| stddev|2.9095112775499885|
|    min|              2.11|
|    max|              13.6|
+-------+------------------+



# Build the model

In [38]:
lr = LinearRegression(labelCol=labelCol)

In [39]:
lr_model = lr.fit(train_data)

# Evaluate the model

In [40]:
test_results = lr_model.evaluate(test_data)

In [41]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|2.486899575160350...|
|-3.55271367880050...|
|-2.57571741713036...|
|1.598721155460225...|
|1.598721155460225...|
|-8.88178419700125...|
|-1.95399252334027...|
|5.329070518200751...|
|-7.10542735760100...|
|7.105427357601002...|
|-3.55271367880050...|
|3.552713678800501...|
|-1.77635683940025...|
|-2.75335310107038...|
|5.240252676230739...|
|-3.55271367880050...|
|-1.77635683940025...|
|-1.77635683940025...|
|                 0.0|
|3.552713678800501...|
+--------------------+
only showing top 20 rows



In [42]:
test_results.rootMeanSquaredError

2.2701904826307156e-14

In [52]:
test_results.r2

1.0

In [54]:
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               111|
|   mean|  7.71774774774776|
| stddev|3.7361892799736913|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [55]:
test_results.meanSquaredError

5.153764827427081e-28

# Test predictions

In [44]:
unlabeled_data = test_data.select('features')
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[5.0,133.5,39.59,...|
|[5.0,160.0,36.34,...|
|[6.0,30.276999999...|
|[6.0,158.0,43.7,1...|
|[7.0,89.6,25.5,9....|
|[8.0,91.0,22.44,9...|
|[9.0,59.058,17.0,...|
|[9.0,90.09,25.01,...|
|[9.0,105.0,27.2,8...|
|[10.0,86.0,21.14,...|
|[10.0,138.0,31.14...|
|[11.0,91.62700000...|
|[11.0,138.0,31.14...|
|[12.0,42.0,14.8,7...|
|[12.0,50.0,7.0,7....|
|[12.0,88.5,21.24,...|
|[12.0,91.0,20.32,...|
|[12.0,91.0,20.32,...|
|[12.0,138.0,31.14...|
|[13.0,63.0,14.4,7...|
+--------------------+
only showing top 20 rows



In [45]:
preds = lr_model.transform(unlabeled_data)
preds.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,133.5,39.59,...|13.129999999999976|
|[5.0,160.0,36.34,...|13.600000000000003|
|[6.0,30.276999999...|3.5500000000000256|
|[6.0,158.0,43.7,1...|13.599999999999984|
|[7.0,89.6,25.5,9....| 9.869999999999983|
|[8.0,91.0,22.44,9...|11.000000000000009|
|[9.0,59.058,17.0,...|  7.40000000000002|
|[9.0,90.09,25.01,...| 8.689999999999994|
|[9.0,105.0,27.2,8...|10.680000000000007|
|[10.0,86.0,21.14,...| 9.199999999999992|
|[10.0,138.0,31.14...|11.850000000000003|
|[11.0,91.62700000...| 8.999999999999996|
|[11.0,138.0,31.14...|11.850000000000001|
|[12.0,42.0,14.8,7...| 6.800000000000027|
|[12.0,50.0,7.0,7....| 4.449999999999948|
|[12.0,88.5,21.24,...|10.290000000000003|
|[12.0,91.0,20.32,...| 9.990000000000002|
|[12.0,91.0,20.32,...| 9.990000000000002|
|[12.0,138.0,31.14...|             11.85|
|[13.0,63.0,14.4,7...| 5.309999999999996|
+--------------------+------------

# Check corrrelations

In [56]:
from pyspark.sql.functions import corr

In [58]:
df.select( corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [59]:
df.select( corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

