# VectorAssembler

### Regresion pyspark para predecir columna mpg del dataset mpg

In [40]:
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import NumericType, StringType 

In [22]:
spark = SparkSession.builder.appName('regresion_mpg').getOrCreate()
df = spark.createDataFrame(sns.load_dataset('mpg').dropna())
df.show(5)

+----+---------+------------+----------+------+------------+----------+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model_year|origin|                name|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
|18.0|        8|       307.0|     130.0|  3504|        12.0|        70|   usa|chevrolet chevell...|
|15.0|        8|       350.0|     165.0|  3693|        11.5|        70|   usa|   buick skylark 320|
|18.0|        8|       318.0|     150.0|  3436|        11.0|        70|   usa|  plymouth satellite|
|16.0|        8|       304.0|     150.0|  3433|        12.0|        70|   usa|       amc rebel sst|
|17.0|        8|       302.0|     140.0|  3449|        10.5|        70|   usa|         ford torino|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
only showing top 5 rows



### Opcion 1: hacer assembler antes de particionar datos


In [23]:

assembler = VectorAssembler(
    inputCols=['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year'],
    outputCol= 'features' #le llamamos features para que coincida con lo que piden los algoritmos
)
df_assembled = assembler.transform(df)
df_assembled.show(3)

+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model_year|origin|                name|            features|
+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
|18.0|        8|       307.0|     130.0|  3504|        12.0|        70|   usa|chevrolet chevell...|[8.0,307.0,130.0,...|
|15.0|        8|       350.0|     165.0|  3693|        11.5|        70|   usa|   buick skylark 320|[8.0,350.0,165.0,...|
|18.0|        8|       318.0|     150.0|  3436|        11.0|        70|   usa|  plymouth satellite|[8.0,318.0,150.0,...|
+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
only showing top 3 rows



In [24]:
df_features_label = df_assembled.withColumnRenamed('mpg', 'label').select('features', 'label')
df_features_label.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[8.0,307.0,130.0,...| 18.0|
|[8.0,350.0,165.0,...| 15.0|
|[8.0,318.0,150.0,...| 18.0|
+--------------------+-----+
only showing top 3 rows



In [25]:
#particionamiento de datos
df_train, df_test = df_features_label.randomSplit([0.8, 0.2], seed=42)


### Opcion 2: primero particionar y luego usar VectorAssembler

In [26]:
numeric_cols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
label_col = 'mpg'
df_selected = df.select(numeric_cols + [label_col])
df_selected.show(2)

+---------+------------+----------+------+------------+----------+----+
|cylinders|displacement|horsepower|weight|acceleration|model_year| mpg|
+---------+------------+----------+------+------------+----------+----+
|        8|       307.0|     130.0|  3504|        12.0|        70|18.0|
|        8|       350.0|     165.0|  3693|        11.5|        70|15.0|
+---------+------------+----------+------+------------+----------+----+
only showing top 2 rows



In [27]:
df_train, df_test = df_selected.randomSplit([0.8, 0.2], seed=42)

In [28]:
assembler = VectorAssembler(
    inputCols= numeric_cols,
    outputCol= 'features' #le llamamos features para que coincida con lo que piden los algoritmos
)
df_train = assembler.transform(df_train).select('features', label_col)
df_test = assembler.transform(df_test).select('features', label_col)
df_train.show(1)

+--------------------+----+
|            features| mpg|
+--------------------+----+
|[4.0,97.0,46.0,18...|26.0|
+--------------------+----+
only showing top 1 row



### Regression

In [None]:
lr = LinearRegression(labelCol=label_col)
model = lr.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)

+--------------------+----+------------------+
|            features| mpg|        prediction|
+--------------------+----+------------------+
|[4.0,97.0,88.0,21...|27.0|25.250093988412278|
|[4.0,113.0,95.0,2...|25.0|24.676738282044827|
|[4.0,121.0,113.0,...|26.0|23.814798786244754|
|[6.0,199.0,90.0,2...|21.0|21.084588525221672|
+--------------------+----+------------------+
only showing top 4 rows



In [31]:
evaluator_r2 = RegressionEvaluator(metricName='r2', labelCol=label_col)
evaluator_mae = RegressionEvaluator(metricName='mae', labelCol=label_col)
evaluator_mse = RegressionEvaluator(metricName='mse', labelCol=label_col)
evaluator_rmse = RegressionEvaluator(metricName='rmse', labelCol=label_col)
print('r2', evaluator_r2.evaluate(df_pred))
print('mae', evaluator_mae.evaluate(df_pred))
print('mse', evaluator_mse.evaluate(df_pred))
print('rmse', evaluator_rmse.evaluate(df_pred))


r2 0.7855411849361159
mae 2.729576315612858
mse 12.796950495638
rmse 3.5772825574223237


In [None]:
tree = DecisionTreeRegressor(labelCol=label_col)
model = tree.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(4)
print('r2', evaluator_r2.evaluate(df_pred))
print('mae', evaluator_mae.evaluate(df_pred))
print('mse', evaluator_mse.evaluate(df_pred))
print('rmse', evaluator_rmse.evaluate(df_pred))

+--------------------+----+------------------+
|            features| mpg|        prediction|
+--------------------+----+------------------+
|[4.0,97.0,88.0,21...|27.0| 28.38888888888889|
|[4.0,113.0,95.0,2...|25.0|23.452380952380953|
|[4.0,121.0,113.0,...|26.0|23.452380952380953|
|[6.0,199.0,90.0,2...|21.0|19.545454545454547|
+--------------------+----+------------------+
only showing top 4 rows

r2 0.7365300126693128
mae 2.727834251947929
mse 15.72149125207477
rmse 3.9650335751510064


In [38]:
rfores = RandomForestRegressor(labelCol=label_col)
model = rfores.fit(df_train)
df_pred2 = model.transform(df_test)
df_pred2.show(4)
print('r2', evaluator_r2.evaluate(df_pred2))
print('mae', evaluator_mae.evaluate(df_pred2))
print('mse', evaluator_mse.evaluate(df_pred2))
print('rmse', evaluator_rmse.evaluate(df_pred2))

+--------------------+----+------------------+
|            features| mpg|        prediction|
+--------------------+----+------------------+
|[4.0,97.0,88.0,21...|27.0| 26.38153733393471|
|[4.0,113.0,95.0,2...|25.0|24.004228108417472|
|[4.0,121.0,113.0,...|26.0| 23.23222333171911|
|[6.0,199.0,90.0,2...|21.0|20.984431060662835|
+--------------------+----+------------------+
only showing top 4 rows

r2 0.8265663244052999
mae 2.0549119456815133
mse 10.348943503211952
rmse 3.216977386182867


In [None]:
gbt = GBTRegressor(labelCol=label_col)
model = gbt.fit(df_train)
df_pred2 = model.transform(df_test)
df_pred2.show(4)
print('r2', evaluator_r2.evaluate(df_pred2))
print('mae', evaluator_mae.evaluate(df_pred2))
print('mse', evaluator_mse.evaluate(df_pred2))
print('rmse', evaluator_rmse.evaluate(df_pred2))

+--------------------+----+------------------+
|            features| mpg|        prediction|
+--------------------+----+------------------+
|[4.0,97.0,88.0,21...|27.0| 27.75746768399558|
|[4.0,113.0,95.0,2...|25.0|22.617828495200573|
|[4.0,121.0,113.0,...|26.0| 21.98734984759712|
|[6.0,199.0,90.0,2...|21.0| 20.78967097491329|
+--------------------+----+------------------+
only showing top 4 rows

r2 0.7397536860595073
mae 2.712554975632221
mse 15.52913175975856
rmse 3.9407019374419274


In [None]:
numeric_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, NumericType)]
categorical_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType)]