# Curso Big Data #7 - Linear Regression in PySpark 2

#### 1. Creamos la sesión en Spark

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one
# under the application name 'linear_reg'.
session_builder = SparkSession.builder.appName('linear_reg')
spark = session_builder.getOrCreate()

#### 2. Carga de los datos

In [2]:
# Load the cruise-ship CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_path = 'C:/Users/pc/pruebas/cruise_ship_info.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [4]:
# Preview the first five rows of the raw data.
df.show(n=5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



#### 3. Transformación de los datos de la columna Cruise_line en categorías numéricas

In [5]:
# Encode the categorical Cruise_line column as a numeric index column.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_cat')
# Drop any previous Cruise_line_cat first (a no-op when the column is absent)
# so this cell can be re-run: StringIndexer refuses to write to an output
# column that already exists.
df_unindexed = df.drop('Cruise_line_cat')
df = indexer.fit(df_unindexed).transform(df_unindexed)

In [7]:
# Display the frame to check the new Cruise_line_cat column.
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_cat|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|

In [12]:
# Candidate feature columns: positions 2 through 7 plus the last column.
[*df.columns[2:8], *df.columns[-1:]]

['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Cruise_line_cat']

#### 4. Transformar el DataFrame al formato aceptado por los algoritmos de PySpark ML

In [13]:

from pyspark.ml.feature import VectorAssembler

# Name the predictors explicitly instead of slicing df.columns by position, so
# a change in column order cannot silently pull the label ('crew') or an
# identifier column into the feature vector.
# NOTE(review): Cruise_line_cat is an index of a nominal category; the linear
# model will treat it as an ordered number — consider one-hot encoding.
feature_cols = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'Cruise_line_cat',
]
vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# Keep only what the regressor needs: the assembled vector and the label.
df_final = vectorAssembler.transform(df).select('features', 'crew')

In [14]:
# Inspect the assembled feature vectors next to the 'crew' label (first 20 rows).
df_final.show(n=20)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



#### 5. Dividir los datos en conjuntos de entrenamiento y prueba

In [15]:

# 70/30 train/test split. The fixed seed keeps the split, and therefore every
# metric computed from it, reproducible across runs.
train_data, test_data = df_final.randomSplit([0.7, 0.3], seed=42)

In [16]:
# Summary statistics of the training split.
train_summary = train_data.describe()
train_summary.show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               115|
|   mean|  7.71600000000001|
| stddev|3.6179517807344115|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [17]:
# Summary statistics of the test split.
test_summary = test_data.describe()
test_summary.show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                43|
|   mean| 8.003255813953489|
| stddev|3.2082094998465847|
|    min|              0.88|
|    max|              13.6|
+-------+------------------+



#### 5. Ajuste del modelo

In [18]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: read inputs from 'features', learn 'crew', and
# write the model output to a 'predictions' column.
linear_estimator = LinearRegression(
    featuresCol='features',
    labelCol='crew',
    predictionCol='predictions',
)
# Fit on the training split; 'regressor' is bound to the fitted model.
regressor = linear_estimator.fit(train_data)

#### 6. Efectuar algunas predicciones

In [19]:
# Evaluate the fitted model on the held-out test split.
test_results = regressor.evaluate(test_data)

# The root mean squared error (RMSE) is a commonly used measure of the
# differences between the values predicted by a model or estimator and the
# observed values.
# print() is used because a bare expression is only displayed when it is the
# last statement of a cell.
print('RMSE:', test_results.rootMeanSquaredError)

# R^2 of the model on the test split.
print('R2:', test_results.r2)

# Summary statistics for every column of the full dataset.
df.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|  Cruise_line_cat|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|5.063291139240507|
| stddev|     null|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|

In [20]:
from pyspark.sql.functions import corr

# Correlation between the 'crew' and 'passengers' columns.
crew_vs_passengers = corr('crew', 'passengers')
df.select(crew_vs_passengers).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [21]:
# Correlation between the 'crew' and 'cabins' columns.
crew_vs_cabins = corr('crew', 'cabins')
df.select(crew_vs_cabins).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



In [22]:
# Correlation between the 'crew' and 'Tonnage' columns.
crew_vs_tonnage = corr('crew', 'Tonnage')
df.select(crew_vs_tonnage).show()

+-------------------+
|corr(crew, Tonnage)|
+-------------------+
|  0.927568811544939|
+-------------------+



In [23]:
# Correlation between the 'crew' and 'Age' columns.
crew_vs_age = corr('crew', 'Age')
df.select(crew_vs_age).show()

+-------------------+
|    corr(crew, Age)|
+-------------------+
|-0.5306565039638852|
+-------------------+



#### 7. Predicción sobre el conjunto de prueba sin etiquetas

In [24]:

# Keep only the feature vectors so the model is applied without the 'crew' label.
unlabeled_data = test_data.select('features')
# Apply the fitted model; per the estimator's predictionCol setting, the
# output column is named 'predictions'.
predictions = regressor.transform(unlabeled_data)

In [25]:
# First ten predicted crew values, returned as a list of Row objects.
predictions.select('predictions').take(10)

[Row(predictions=13.306114007296639),
 Row(predictions=4.390632238313791),
 Row(predictions=4.390632238313791),
 Row(predictions=10.066156595048028),
 Row(predictions=10.591246414702695),
 Row(predictions=11.527424897059507),
 Row(predictions=14.03651284982309),
 Row(predictions=10.120613251196573),
 Row(predictions=12.134138297838767),
 Row(predictions=9.56060284202562)]

In [26]:
# First ten observed crew values from the test split, for comparison with the predictions.
test_data.select('crew').take(10)

[Row(crew=13.13),
 Row(crew=3.55),
 Row(crew=3.55),
 Row(crew=9.0),
 Row(crew=11.09),
 Row(crew=10.9),
 Row(crew=13.6),
 Row(crew=11.0),
 Row(crew=11.6),
 Row(crew=10.0)]