# Create a regression model that will help predict how many crew members will be needed for future ships.

Create Spark session and data frame

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('CM').getOrCreate()

In [2]:
df = spark.read.csv('../data/cruise_ship_info.csv', header=True, inferSchema=True)

In [3]:
df.describe().toPandas()

Unnamed: 0,summary,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0
1,mean,Infinity,,15.689873417721518,71.28467088607599,18.45740506329114,8.130632911392404,8.830000000000005,39.90094936708861,7.794177215189873
2,stddev,,,7.615691058751413,37.229540025907866,9.677094775143416,1.793473548054825,4.4714172221480615,8.63921711391542,3.503486564627034
3,min,Adventure,Azamara,4.0,2.329,0.66,2.79,0.33,17.7,0.59
4,max,Zuiderdam,Windstar,48.0,220.0,54.0,11.82,27.0,71.43,21.0


The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!  
Ship_name is probably not an important feature.

In [4]:
from pyspark.ml.feature import VectorAssembler 

## Without Ship_name and Cruise_line:

In [26]:
assembler = VectorAssembler(inputCols=['Age','Tonnage','passengers','length','cabins','passenger_density'],outputCol='features')

In [27]:
features=assembler.transform(df)
features.select('features').head(1)

[Row(features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64]))]

In [9]:
data=features.select(['features','crew'])

In [11]:
train,test=data.randomSplit([0.7,0.3])

In [15]:
from pyspark.ml.regression import LinearRegression as LR

In [17]:
lr = LR(labelCol='crew')

In [18]:
lr_model=lr.fit(train)

In [19]:
evaluation=lr_model.evaluate(test)

In [23]:
print (evaluation.r2)
print(evaluation.rootMeanSquaredError)

0.9393478771365484
0.7454941890372148


## Without Ship_name :

In [107]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer, VectorIndexer

In [152]:
cruise_indexer=StringIndexer(inputCol='Cruise_line',outputCol='CruiseIndex')
cruise_encoder=OneHotEncoder(inputCol='CruiseIndex',outputCol='CruiseVec')
assembler2=VectorAssembler(inputCols=['CruiseIndex','Age','Tonnage','passengers','length','cabins','passenger_density'],outputCol='features')

In [153]:
df_ind=cruise_indexer.fit(df).transform(df)
df_enc=cruise_encoder.fit(df_ind).transform(df_ind)
features2=assembler2.transform(df_enc)

data2=features2.select(['features','crew'])
train2,test2=data2.randomSplit([0.7,0.3])

In [154]:
lr_model2=lr.fit(train2)
eval2=lr_model2.evaluate(test2)

In [155]:
print (eval2.r2)
print(eval2.rootMeanSquaredError)

0.9594584086911953
0.6629006621180089


## With all the features :

In [134]:
ship_indexer=StringIndexer(inputCol='Ship_name',outputCol='ShipIndex')
ship_encoder=OneHotEncoder(inputCol='ShipIndex',outputCol='ShipVec')
assembler3=VectorAssembler(inputCols=['ShipIndex','CruiseIndex','Age','Tonnage','passengers','length','cabins','passenger_density'],outputCol='features')

In [135]:
df_ind2=ship_indexer.fit(df_enc).transform(df_enc)
df_enc2=ship_encoder.fit(df_ind2).transform(df_ind2)
features3=assembler3.transform(df_enc2)
data3=features3.select(['features','crew'])
train3,test3=data3.randomSplit([0.7,0.3])

In [136]:
lr_model3=lr.fit(train3)
eval3=lr_model3.evaluate(test3)

In [137]:
print (eval3.r2)
print(eval3.rootMeanSquaredError)

0.9039395908688531
0.9241082061474073


The best results have been obtained with all the features except Ship_name. The initial information about Cruise_line being an important feature was true, it improved the results.