In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('cruise app').getOrCreate()

In [0]:
df = spark.read.csv('dbfs:/FileStore/shared_uploads/abhishekbedarkar28@gmail.com/cruise_ship_info.csv', inferSchema=True, header=True)

In [0]:
df.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [0]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [0]:
df.groupBy('Cruise_line').count().show(5)

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
+-----------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_category')
indexed_data = indexer.fit(df).transform(df)
indexed_data.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_category|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
df.columns

Out[17]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [0]:
assembler = VectorAssembler(inputCols=[ 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density'], outputCol='features')
model_data = assembler.transform(df).select('features','crew')

In [0]:
model_data.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
+--------------------+----+
only showing top 5 rows



In [0]:
train_data,test_data = model_data.randomSplit([0.7,0.3])

In [0]:
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               119|
|   mean| 7.794453781512617|
| stddev|3.5001311591596576|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [0]:
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                39|
|   mean| 7.793333333333335|
| stddev|3.5595507798577573|
|    min|              0.59|
|    max|              19.1|
+-------+------------------+



In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
lr = LinearRegression(featuresCol='features', labelCol='crew')
lr_model = lr.fit(model_data)

In [0]:
test_result = lr_model.evaluate(test_data)

In [0]:
test_result.r2

Out[30]: 0.8744410359422597

In [0]:
test_result.residuals.show(5)

+--------------------+
|           residuals|
+--------------------+
| 0.36495825851059394|
|-0.42934344988415374|
|  0.4404191822470054|
| 0.46558279288522186|
| -0.5780264708722402|
+--------------------+
only showing top 5 rows



In [0]:
test_result.explainedVariance

Out[38]: 9.094026569993497

In [0]:
unlabled_data = test_data.select('features')

In [0]:
predictions = lr_model.transform(unlabled_data)

In [0]:
predictions.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,133.5,39.59,...|12.765041741489407|
|[6.0,30.276999999...|3.9793434498841536|
|[6.0,110.23899999...|11.059580817752995|
|[6.0,113.0,37.82,...|11.534417207114778|
|[7.0,116.0,31.0,9...| 12.57802647087224|
+--------------------+------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import corr

In [0]:
df.select(corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

