In [1]:
from IPython.display import display, HTML
# Inject CSS that stops <pre> blocks from wrapping (presumably so wide
# df.show() tables stay aligned in the rendered notebook).
no_wrap_css = HTML("<style>pre { white-space: pre !important; }</style>")
display(no_wrap_css)

### Our task is to develop a regression model that will predict the number of crew members required for future ships from the given features.

In [4]:
import os
import findspark
# Locate the local Spark installation and make pyspark importable;
# this has to run before the pyspark imports that follow.
findspark.init()
from pyspark.sql import SparkSession    
from pyspark.sql import functions as F  
from pyspark.sql.types import * 

# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("lab2")
    .getOrCreate()
)

### Read the data Crew.csv into spark dataframe
- inferSchema=True and header=True.
- Print the schema and show the first few rows.
- Use df.describe() to see the statistical properties of the data.

In [11]:
# Build the path from its components instead of a backslash literal:
# "..\Data\Crew.csv" relied on the invalid escapes "\D" and "\C" (a warning on
# recent Python versions) and only resolved on Windows.
crew_csv_path = os.path.join("..", "Data", "Crew.csv")

# The first row holds the column names; column types are inferred from the data.
df = spark.read.csv(crew_csv_path, header=True, inferSchema=True)

In [12]:
# Show the inferred column names, types and nullability.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [13]:
# Preview the first 20 rows of the raw data.
df.show(20)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [14]:
# Summary statistics (count, mean, stddev, min, ...) for every column.
# NOTE(review): the "Infinity" mean reported for Ship_name presumably comes
# from a ship literally named "Infinity" being parsed as a number - confirm.
df.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       NULL|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|     NULL|       NULL| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

### StringIndexer and OneHotEncoder 
- Create StringIndexer and OneHotEncoder to process the data.
- StringIndexer is for any string data type.
- OneHotEncoder will be applied to the StringIndexer columns.
- Convert all obtained columns from OneHotEncoder and the other numeric columns into a feature column (use VectorAssembler) 

In [16]:
# List of (column name, type name) pairs; used below to split string and numeric columns.
df.dtypes

[('Ship_name', 'string'),
 ('Cruise_line', 'string'),
 ('Age', 'int'),
 ('Tonnage', 'double'),
 ('passengers', 'double'),
 ('length', 'double'),
 ('cabins', 'double'),
 ('passenger_density', 'double'),
 ('crew', 'double')]

In [20]:
# Names of all columns whose inferred type is 'string'.
# NOTE(review): this includes Ship_name, which looks like a per-ship
# identifier. One-hot encoding it may let the model memorise training rows
# (train R2 is ~0.9996 vs. test R2 ~0.63 further down) - consider excluding it.
str_columns = [name for name, dtype in df.dtypes if dtype == 'string']
str_columns

['Ship_name', 'Cruise_line']

In [21]:
# Output column name of the StringIndexer for each string column.
categorical_columns = [f"{name}_Index" for name in str_columns]
categorical_columns

['Ship_name_Index', 'Cruise_line_Index']

In [23]:
# Output column name of the OneHotEncoder for each string column.
ONEHOT_COLUMNS = [f"{name}_OHE" for name in str_columns]
ONEHOT_COLUMNS

['Ship_name_OHE', 'Cruise_line_OHE']

<B>string Indexer</B>

In [25]:
from pyspark.ml.feature import StringIndexer
# Map each string column to a numeric index column. handleInvalid="keep"
# assigns labels not seen during fitting to an extra index instead of failing.
indexer = StringIndexer(
    inputCols=str_columns,
    outputCols=categorical_columns,
    handleInvalid="keep",
)

<B>One Hot</B>

In [27]:
from pyspark.ml.feature import OneHotEncoder
# One-hot encode every indexed column into its matching *_OHE vector column.
encoder = OneHotEncoder(
    inputCols=categorical_columns,
    outputCols=ONEHOT_COLUMNS,
)

In [28]:
# Every column that is not a string column. This list still contains the
# label column 'crew'.
numeric_columns = [name for name, dtype in df.dtypes if dtype != 'string']
numeric_columns

['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

<B>IMPUTE</B>

In [31]:
# Impute missing values in the numeric *feature* columns only.
from pyspark.ml.feature import Imputer

# The label ("crew") is deliberately left out: imputing it would replace
# missing targets with fabricated values, and the fitted pipeline would then
# require a "crew" column even when scoring future ships whose crew size is
# unknown.
imputed_columns = [column for column in numeric_columns if column != "crew"]

# Columns are imputed in place (same input and output names), using the
# Imputer's default strategy.
imputer = Imputer(
    inputCols=imputed_columns,
    outputCols=imputed_columns,
)

In [32]:
# Candidate model inputs: the numeric columns followed by the one-hot vectors.
features = [*numeric_columns, *ONEHOT_COLUMNS]
features

['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Ship_name_OHE',
 'Cruise_line_OHE']

In [33]:
# Drop the label column 'crew' so it is not fed to the model as an input.
features = [column for column in features if column != 'crew']
features

['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Ship_name_OHE',
 'Cruise_line_OHE']

<B>Vector Assembler</B>

In [34]:
from pyspark.ml.feature import VectorAssembler

# Combine all input columns into the single "features" vector column.
vecAssmbler = VectorAssembler(
    inputCols=features,
    outputCol="features",
)
vecAssmbler

VectorAssembler_6f29ca116df1

### Divide the data into Train/Test

In [38]:
# Randomly split the rows with 0.8 / 0.2 weights, using a fixed seed, and
# report how many rows ended up in each part.
split_weights = [0.8, 0.2]
train, test = df.randomSplit(split_weights, seed=42)
print(f"train: {train.count()}, test: {test.count()}")

train: 133, test: 25


### Create a Linear Regression Model 

In [39]:
from pyspark.ml.regression import LinearRegression  

# Linear regression on the "features" vector; predicts "crew" and writes the
# result to "crew_prediction".
LR = LinearRegression(
    featuresCol="features",
    labelCol="crew",
    predictionCol="crew_prediction",
)

### Create a Pipeline model

In [40]:
from pyspark.ml import Pipeline 
# Chain the preprocessing steps and the regressor so they are fitted and
# applied together, in this order.
pipeline_stages = [indexer, encoder, imputer, vecAssmbler, LR]
pip = Pipeline(stages=pipeline_stages)

### Fit the Pipeline model to the training data

In [41]:
# Fit every pipeline stage on the training split only.
pipe_model = pip.fit(train)

### Make a prediction for the same training data and evaluate the model performance using RMSE and r2

In [42]:
# Apply the fitted pipeline to the training rows and preview the actual
# label next to the prediction.
pred_train = pipe_model.transform(train)
pred_train.select("crew", "crew_prediction").show(10)

+-----+------------------+
| crew|   crew_prediction|
+-----+------------------+
|11.85|11.849779087814968|
|  4.0| 4.000599016844456|
| 8.69| 8.689700038306382|
| 0.59|0.5894655890376785|
|  7.0|6.9993786295581195|
|  9.2|   9.2001141958825|
| 8.48| 8.480396746577874|
| 11.0|10.999847959138341|
|  6.7| 6.699152004397167|
| 8.58| 8.579895217223928|
+-----+------------------+
only showing top 10 rows



In [43]:
# Evaluation (RMSE and R2)
from pyspark.ml.evaluation import RegressionEvaluator
# Root-mean-squared error of the predictions on the training split.
evaluator = RegressionEvaluator(
    labelCol="crew",
    predictionCol="crew_prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(pred_train)
print(f"RMSE: {rmse}")

RMSE: 0.07108467356604929


In [44]:
# R2 (coefficient of determination) of the predictions on the training split.
evaluator = RegressionEvaluator(
    labelCol="crew",
    predictionCol="crew_prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(pred_train)
print(f"R2: {r2}")

R2: 0.9995973787048772


### Make a prediction for the test data and evaluate the model performance using RMSE and r2

In [45]:
# Apply the already-fitted pipeline to the held-out test rows and preview
# the actual label next to the prediction.
pred_test = pipe_model.transform(test)
pred_test.select("crew", "crew_prediction").show(10)

+-----+------------------+
| crew|   crew_prediction|
+-----+------------------+
|  6.0| 6.043642467755465|
|  5.2|3.6513808070905784|
|  8.5| 7.977652830050516|
| 6.17|4.5763085254240305|
| 12.0|14.990300762833456|
|12.38| 13.72534331894821|
|  7.6|  6.45483806672612|
|  9.2| 6.663490184992261|
| 11.1|13.117806725663803|
|  7.6| 6.393912944040357|
+-----+------------------+
only showing top 10 rows



In [48]:
# Root-mean-squared error of the predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol="crew",
    predictionCol="crew_prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(pred_test)
print(f"RMSE: {rmse}")

RMSE: 1.9393353168463836


In [50]:
# R2 (coefficient of determination) of the predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol="crew",
    predictionCol="crew_prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(pred_test)
print(f"R2: {r2}")

R2: 0.629126258801238
