In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=dead180adb0c7017e810b1b35dbd9402028b3636521860724348dacb747d550f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Create a Spark session
spark = SparkSession.builder.appName("crew_requirements").getOrCreate()


In [4]:
df=spark.read.csv('/content/cruise_ship_info.csv',inferSchema=True,header=True)

In [5]:
df.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [7]:
from pyspark.ml.feature import StringIndexer
indexer=StringIndexer(inputCol='Cruise_line',outputCol='Cruise_line_index')

df_indexed=indexer.fit(df).transform(df)

In [8]:
df_indexed.show(10)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_index|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|              1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|              1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|              1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|       

In [9]:
df_indexed.select('Cruise_line','Cruise_line_index').distinct().show()

+-----------------+-----------------+
|      Cruise_line|Cruise_line_index|
+-----------------+-----------------+
|            Costa|              5.0|
|        Norwegian|              4.0|
|              MSC|              7.0|
|           Orient|             19.0|
|Regent_Seven_Seas|             10.0|
|           Disney|             18.0|
|         Windstar|             15.0|
|              P&O|              8.0|
|  Royal_Caribbean|              0.0|
|         Seabourn|             14.0|
|             Star|              9.0|
|         Princess|              2.0|
|          Oceania|             13.0|
|          Azamara|             16.0|
| Holland_American|              3.0|
|           Cunard|             12.0|
|        Celebrity|              6.0|
|        Silversea|             11.0|
|          Crystal|             17.0|
|         Carnival|              1.0|
+-----------------+-----------------+



In [10]:
df_indexed.groupby('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [11]:
df_indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_index']

In [12]:
from pyspark.ml.feature import VectorAssembler

assembler=VectorAssembler(inputCols=['Age',
'Tonnage',
'passengers',
'length',
'cabins',
'passenger_density',
'crew',
'Cruise_line_index'],outputCol='features')

output=assembler.transform(df_indexed)


In [13]:
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_index|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|              1.0|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|              1.0|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|

In [14]:
train_data,test_data=output.randomSplit([.8,.2])
train_data.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|            length|           cabins|passenger_density|              crew|Cruise_line_index|
+-------+---------+-----------+------------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+-----------------+
|  count|      119|        119|               119|               119|              119|               119|              119|              119|               119|              119|
|   mean| Infinity|       NULL|15.865546218487395| 69.93369747899162|18.39512605042017| 8.076302521008401|8.767310924369752|39.13453781512604| 7.693781512605046|4.983193277310924|
| stddev|     NULL|       NULL| 7.857765639364967|36.029637418629065|9.542693616952578|1.74227148906

In [15]:
from pyspark.ml.regression import LinearRegression
crew_req=LinearRegression(featuresCol='features',labelCol='crew')

model=crew_req.fit(train_data)

In [16]:
result=model.evaluate(train_data)

In [17]:
result.r2

1.0

In [18]:
pred=model.transform(test_data)

In [19]:
result_test=model.evaluate(test_data)

In [20]:
result_test.r2

1.0

In [21]:
pred.show()

+------------+---------------+---+------------------+----------+------+------+-----------------+-----+-----------------+--------------------+------------------+
|   Ship_name|    Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density| crew|Cruise_line_index|            features|        prediction|
+------------+---------------+---+------------------+----------+------+------+-----------------+-----+-----------------+--------------------+------------------+
|   Adventure|Royal_Caribbean| 12|             138.0|     31.14|  10.2| 15.57|            44.32|11.85|              0.0|[12.0,138.0,31.14...|11.850000000000033|
|   Atlantica|          Costa| 13|            85.619|     21.14|  9.57| 10.56|             40.5|  9.2|              5.0|[13.0,85.619,21.1...| 9.199999999999989|
|    Conquest|       Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99| 19.1|              1.0|[11.0,110.0,29.74...|19.099999999999905|
|        Dawn|      Norwegian| 11|