In [1]:
import seaborn as sns
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('regression_diamonds')
    .getOrCreate()
)

# seaborn returns the diamonds dataset as a pandas DataFrame; hand it to Spark.
df = spark.createDataFrame(
    sns.load_dataset('diamonds')
)
df.show(n=5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows



In [22]:
# Regressors compared in this notebook: linear regression, random forest,
# decision tree and gradient-boosted trees.
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, DecisionTreeRegressor, GBTRegressor
# Print the class documentation (constructor parameters and their defaults).
# NOTE(review): exploratory call; consider removing it from the final notebook.
help(LinearRegression)

Help on class LinearRegression in module pyspark.ml.regression:

class LinearRegression(_JavaRegressor, _LinearRegressionParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  LinearRegression(*, featuresCol: str = 'features', labelCol: str = 'label', predictionCol: str = 'prediction', maxIter: int = 100, regParam: float = 0.0, elasticNetParam: float = 0.0, tol: float = 1e-06, fitIntercept: bool = True, standardization: bool = True, solver: str = 'auto', weightCol: Optional[str] = None, aggregationDepth: int = 2, loss: str = 'squaredError', epsilon: float = 1.35, maxBlockSizeInMB: float = 0.0)
 |  
 |  Linear regression.
 |  
 |  The learning objective is to minimize the specified loss function, with regularization.
 |  This supports two kinds of loss:
 |  
 |  * squaredError (a.k.a squared loss)
 |  * huber (a hybrid of squared error for relatively small errors and absolute error for     relatively large ones, and we estimate the scale parameter from training data

In [10]:
# Build the X matrix, which PySpark ML estimators expect as a single
# vector column (named 'features' by default).
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    # Named 'features' so it matches the default column the algorithms look for.
    outputCol='features',
    inputCols=['carat', 'depth', 'x', 'y', 'z', 'table'],
)
df_assembled = assembler.transform(dataset=df)
df_assembled.show(n=3)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|            features|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|[0.23,61.5,3.95,3...|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|[0.21,59.8,3.89,3...|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|[0.23,56.9,4.05,4...|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
only showing top 3 rows



In [11]:
# Keep only the assembled feature vector and the target; 'price' is exposed
# under the name 'label'.
df_features_label = df_assembled.select(
    'features',
    df_assembled['price'].alias('label'),
)
df_features_label.show(n=3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.23,61.5,3.95,3...|  326|
|[0.21,59.8,3.89,3...|  326|
|[0.23,56.9,4.05,4...|  327|
+--------------------+-----+
only showing top 3 rows



In [13]:
# Data partitioning: 80% training / 20% test, using a seeded random split.
df_train, df_test = df_features_label.randomSplit(weights=[0.8, 0.2], seed=42)

In [17]:
# Linear regression with default hyper-parameters: fit on the training
# split, then predict on the test split.
lr = LinearRegression()
model = lr.fit(dataset=df_train)

# transform() appends a 'prediction' column next to 'features' and 'label'.
df_pred = model.transform(dataset=df_test)
df_pred.show(n=4)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.22,59.3,3.91,3...|  404| 23.60821074157866|
|[0.23,56.9,4.05,4...|  327|135.66724901116686|
|[0.23,59.4,4.0,4....|  338|104.80613500547406|
|[0.23,60.5,3.96,3...|  357|-69.15086283610799|
+--------------------+-----+------------------+
only showing top 4 rows



In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator per regression metric; later cells reuse these names.
evaluator_r2, evaluator_mae, evaluator_mse, evaluator_rmse = [
    RegressionEvaluator(metricName=metric)
    for metric in ('r2', 'mae', 'mse', 'rmse')
]

# Score the linear-regression predictions with every metric.
for metric_label, metric_evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_label, metric_evaluator.evaluate(df_pred))


r2 0.8549461831380685
mae 894.0239725727469
mse 2312229.6146338275
rmse 1520.6017278149552


In [20]:
# Decision tree regressor with default hyper-parameters: fit on the training
# split, predict on the test split.
tree = DecisionTreeRegressor()
model = tree.fit(dataset=df_train)
df_pred1 = model.transform(dataset=df_test)
df_pred1.show(n=4)

# Report every metric for the decision-tree predictions.
for metric_label, metric_evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_label, metric_evaluator.evaluate(df_pred1))

+--------------------+-----+-----------------+
|            features|label|       prediction|
+--------------------+-----+-----------------+
|[0.22,59.3,3.91,3...|  404|599.2502536354414|
|[0.23,56.9,4.05,4...|  327|599.2502536354414|
|[0.23,59.4,4.0,4....|  338|599.2502536354414|
|[0.23,60.5,3.96,3...|  357|599.2502536354414|
+--------------------+-----+-----------------+
only showing top 4 rows

r2 0.8753935817871554
mae 799.4523941592846
mse 1986287.9626217007
rmse 1409.3572870715575


In [21]:
# Random forest regressor: fit on the training split, predict on the test split.
# The seed is pinned (same value as the train/test split) because a random
# forest is a stochastic learner; without an explicit seed the fitted model,
# and therefore the metrics printed below, may differ from one session to the next.
rfores = RandomForestRegressor(seed=42)
model = rfores.fit(df_train)

# transform() appends a 'prediction' column next to 'features' and 'label'.
df_pred2 = model.transform(df_test)
df_pred2.show(4)

# Report every metric for the random-forest predictions.
print('r2', evaluator_r2.evaluate(df_pred2))
print('mae', evaluator_mae.evaluate(df_pred2))
print('mse', evaluator_mse.evaluate(df_pred2))
print('rmse', evaluator_rmse.evaluate(df_pred2))

+--------------------+-----+-----------------+
|            features|label|       prediction|
+--------------------+-----+-----------------+
|[0.22,59.3,3.91,3...|  404|664.6009616650199|
|[0.23,56.9,4.05,4...|  327|664.6009616650199|
|[0.23,59.4,4.0,4....|  338|664.6009616650199|
|[0.23,60.5,3.96,3...|  357|664.6009616650199|
+--------------------+-----+-----------------+
only showing top 4 rows

r2 0.8767774947730056
mae 797.8078362541166
mse 1964227.704855406
rmse 1401.5090812604126


In [23]:
# Gradient-boosted trees regressor with default hyper-parameters: fit on the
# training split, predict on the test split.
gbt = GBTRegressor()
model = gbt.fit(dataset=df_train)
df_pred3 = model.transform(dataset=df_test)
df_pred3.show(n=4)

# Report every metric for the gradient-boosted-trees predictions.
for metric_label, metric_evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_label, metric_evaluator.evaluate(df_pred3))

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.22,59.3,3.91,3...|  404|479.39028182380264|
|[0.23,56.9,4.05,4...|  327| 431.1458877763015|
|[0.23,59.4,4.0,4....|  338| 486.7608846471924|
|[0.23,60.5,3.96,3...|  357|490.46803301018366|
+--------------------+-----+------------------+
only showing top 4 rows

r2 0.882083332436065
mae 779.5744327454869
mse 1879650.0271329167
rmse 1371.00329216706
