In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Regresión Lineal")
    .getOrCreate()
)


In [3]:
# Load the cruise-ship CSV: the first row holds the column names and
# the column types are inferred from the data.
datos = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("cruceros.csv")
)

In [5]:
# Show the inferred column names and types.
datos.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [6]:
# Peek at the first two rows as Row objects.
datos.head(2)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)]

In [10]:
# Number of ships per cruise line.
(
    datos
    .groupBy("Cruise_line")
    .count()
    .show()
)

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [12]:
from pyspark.ml.feature import StringIndexer

In [13]:
# List the column names of the raw data.
datos.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [15]:
# Encode the categorical "Cruise_line" column as a numeric label column "indice".
indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="indice",
)

In [16]:
# Fit the indexer on the data so it learns one numeric label per cruise line.
indice=indexer.fit(datos)
# Apply the fitted indexer; the result carries the extra "indice" column.
datosIndexados=indice.transform(datos)

In [17]:
# Confirm that the "indice" column was appended.
datosIndexados.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- indice: double (nullable = true)



In [19]:
# Inspect the first three rows, now including "indice".
datosIndexados.head(3)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, indice=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, indice=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, indice=1.0)]

In [22]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [23]:
# Column names after indexing (handy for picking the model inputs).
datosIndexados.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'indice']

In [55]:
# Pack the numeric predictors into a single "features" vector column.
# "crew" and "indice" are deliberately not part of the input list.
constructor = VectorAssembler(
    inputCols=["Age", "Tonnage", "passengers", "length", "cabins", "passenger_density"],
    outputCol="features",
)

In [56]:
# Append the assembled "features" vector column to the indexed data.
salida=constructor.transform(datosIndexados)

In [57]:
# Verify that "features" is present and has vector type.
salida.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- indice: double (nullable = true)
 |-- features: vector (nullable = true)



In [58]:
# Preview the feature vector next to the "crew" column.
salida.select("features","crew").show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [59]:
# Keep only the two columns used for modelling: the feature vector and "crew".
datosFinales=salida.select("features","crew")

In [60]:
# Preview the final modelling dataset.
datosFinales.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [75]:
# Split into roughly 70% training / 30% evaluation data.
# randomSplit draws rows at random, so a fixed seed is passed to keep the split
# (and therefore the fitted model and every metric computed from it) repeatable
# across kernel restarts. Outputs recorded from an earlier unseeded run will not
# match exactly.
datosEntrena,datosEvalua=datosFinales.randomSplit([0.7,0.3],seed=42)

In [76]:
# Summary statistics of the training split (the output lists only "crew").
datosEntrena.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 7.652272727272738|
| stddev|3.4751444189093603|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [77]:
# Summary statistics of the evaluation split (the output lists only "crew").
datosEvalua.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                48|
|   mean|          8.119375|
| stddev|3.5831754645250733|
|    min|               0.6|
|    max|              19.1|
+-------+------------------+



In [78]:
from pyspark.ml.regression import LinearRegression

In [79]:
# Linear regression estimator: predict "crew" from the assembled "features" vector.
barcosLR = LinearRegression(
    featuresCol="features",
    labelCol="crew",
)

In [80]:
# Fit the linear model on the training split.
modelo=barcosLR.fit(datosEntrena)

In [81]:
# Fitted coefficients, one per entry of the "features" vector
# (presumably in the same order as the VectorAssembler inputCols).
modelo.coefficients

DenseVector([-0.0073, 0.0221, -0.1406, 0.3807, 0.7254, 0.0005])

In [82]:
# Score the held-out split; the result exposes the metrics and predictions used below.
evaluacion=modelo.evaluate(datosEvalua)

In [83]:
# R² (coefficient of determination) on the evaluation split.
evaluacion.r2

0.8532665840266722

In [84]:
# Actual "crew" values next to the model's predictions for the evaluation split.
evaluacion.predictions.show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[5.0,86.0,21.04,9...|  8.0| 9.267256572169268|
|[5.0,160.0,36.34,...| 13.6|15.187197045552939|
|[6.0,30.276999999...| 3.55| 3.873495459818178|
|[6.0,158.0,43.7,1...| 13.6| 14.02328191461335|
|[7.0,158.0,43.7,1...| 13.6|13.966469774213152|
|[8.0,77.499,19.5,...|  9.0| 8.628189591799176|
|[9.0,81.0,21.44,9...| 10.0| 9.375452302483344|
|[9.0,105.0,27.2,8...|10.68|11.037604091623813|
|[10.0,68.0,10.8,7...| 6.36| 6.304619235672249|
|[10.0,77.0,20.16,...|  9.0| 8.508978410010855|
|[10.0,81.76899999...| 8.42| 8.875604042452212|
|[10.0,90.09,25.01...| 8.58|  9.06242876802185|
|[10.0,138.0,31.14...|11.85|13.160212925050795|
|[10.0,151.4,26.2,...|12.53| 11.51521943242552|
|[11.0,90.0,22.4,9...| 11.0| 9.941300504419589|
|[11.0,110.0,29.74...| 19.1|11.972702692202573|
|[11.0,138.0,31.14...|11.85|13.152892501103013|
|[12.0,2.329,0.94,...|  0.6|0.6644468577

In [85]:
# Mean squared error on the evaluation split.
evaluacion.meanSquaredError

1.8446832314656623

In [86]:
# Mean absolute error on the evaluation split.
evaluacion.meanAbsoluteError

0.8209001741352037

In [87]:
# Simulate unseen, unlabelled input by keeping only the feature column of the evaluation split.
datosNuevos=datosEvalua.select("features")  # trick to get "new" data

# Apply the fitted model to those rows; the result has a "prediction" column.
prediccion=modelo.transform(datosNuevos)

In [88]:
# Show the predictions for the unlabelled rows.
prediccion.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,86.0,21.04,9...| 9.267256572169268|
|[5.0,160.0,36.34,...|15.187197045552939|
|[6.0,30.276999999...| 3.873495459818178|
|[6.0,158.0,43.7,1...| 14.02328191461335|
|[7.0,158.0,43.7,1...|13.966469774213152|
|[8.0,77.499,19.5,...| 8.628189591799176|
|[9.0,81.0,21.44,9...| 9.375452302483344|
|[9.0,105.0,27.2,8...|11.037604091623813|
|[10.0,68.0,10.8,7...| 6.304619235672249|
|[10.0,77.0,20.16,...| 8.508978410010855|
|[10.0,81.76899999...| 8.875604042452212|
|[10.0,90.09,25.01...|  9.06242876802185|
|[10.0,138.0,31.14...|13.160212925050795|
|[10.0,151.4,26.2,...| 11.51521943242552|
|[11.0,90.0,22.4,9...| 9.941300504419589|
|[11.0,110.0,29.74...|11.972702692202573|
|[11.0,138.0,31.14...|13.152892501103013|
|[12.0,2.329,0.94,...|0.6644468577685168|
|[12.0,42.0,14.8,7...| 6.221763508055092|
|[12.0,77.104,20.0...| 8.693645475123002|
+--------------------+------------

In [89]:
from pyspark.sql.functions import corr

In [90]:
# Correlation between passenger count and crew size over the full dataset.
datos.select(corr("passengers","crew")).show()

+----------------------+
|corr(passengers, crew)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [91]:
# Correlation between number of cabins and crew size over the full dataset.
datos.select(corr("cabins","crew")).show()

+------------------+
|corr(cabins, crew)|
+------------------+
|0.9508226063578497|
+------------------+



In [92]:
# Correlation between the cruise-line label and crew size.
# NOTE(review): "indice" is a category code assigned by StringIndexer (by default
# ordered by label frequency — confirm), not a measured quantity, so this
# coefficient should be interpreted with care.
datosIndexados.select(corr("indice","crew")).show()

+--------------------+
|  corr(indice, crew)|
+--------------------+
|-0.48332562728617057|
+--------------------+

