In [0]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [0]:
# Read the semicolon-delimited cars CSV from DBFS. The first row supplies the
# column names and Spark infers each column's type from the data.
# NOTE(review): the preview of this frame shows Consumo values such as 21 next
# to 228 and 214, which looks like decimal separators were lost in the CSV --
# confirm the source file before trusting any model metric.
Carros_temp = spark.read.csv(
    'dbfs:/FileStore/tables/arquivos/Carros.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first five rows of the raw frame.
Carros_temp.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [0]:
# Keep only the three predictor columns and the HP column used as label later.
Carros = Carros_temp.select(
    'Consumo',
    'Cilindros',
    'Cilindradas',
    'HP',
)

In [0]:
# Preview the first five rows of the projected frame.
Carros.show(n=5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



In [0]:
# Assembler that packs the three predictor columns into a single vector
# column named 'caracteristicas'.
veccaracteristicas = VectorAssembler(
    inputCols=['Consumo', 'Cilindros', 'Cilindradas'],
    outputCol='caracteristicas',
)

In [0]:
# Append the assembled feature vector to the frame.
# This cell rebinds `Carros` to its own output, so re-running it would fail
# because the assembler's output column already exists. Dropping that column
# first (a no-op when it is absent) keeps the cell safe to re-run.
Carros = veccaracteristicas.transform(
    Carros.drop(veccaracteristicas.getOutputCol())
)

In [0]:
# Preview the first five rows, now including the 'caracteristicas' vector.
Carros.show(n=5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  caracteristicas|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



In [0]:
# Split into ~70% training / ~30% test rows.
# A fixed seed makes the split repeatable, so the predictions and RMSE
# computed from these frames do not change on every run.
CarrosTreino, CarrosTeste = Carros.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Linear regression using 'caracteristicas' as features and 'HP' as label,
# fitted on the training split.
reglin = LinearRegression(
    labelCol='HP',
    featuresCol='caracteristicas',
)
modelo = reglin.fit(dataset=CarrosTreino)

In [0]:
# Score the test split with the fitted linear model; the result carries a
# 'prediction' column alongside the original columns.
previsao = modelo.transform(dataset=CarrosTeste)

In [0]:
# Preview the first five linear-regression predictions next to the true HP.
previsao.show(n=5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     15|        8|        301|335|  [15.0,8.0,301.0]|191.58550078523723|
|    104|        8|        472|205| [104.0,8.0,472.0]|  193.610153496484|
|    143|        8|        360|245| [143.0,8.0,360.0]|195.35261541878214|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|184.66513337946353|
|    192|        8|        400|175| [192.0,8.0,400.0]| 196.7150411152672|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



In [0]:
# Evaluator that compares the 'prediction' column against 'HP' using RMSE.
avaliar = RegressionEvaluator(
    metricName='rmse',
    labelCol='HP',
    predictionCol='prediction',
)

In [0]:
# RMSE of the linear-regression predictions on the test split.
rmse = avaliar.evaluate(dataset=previsao)

In [0]:
# Show the RMSE computed for the linear-regression predictions.
print(rmse)

59.01050222798654


In [0]:
# Random-forest regressor using 'caracteristicas' as features and 'HP' as label.
# The forest relies on random sampling, so the seed is set explicitly to keep
# the fitted model and its predictions repeatable across sessions.
rfreg = RandomForestRegressor(
    featuresCol='caracteristicas',
    labelCol='HP',
    seed=42,
)

In [0]:
# Fit the random-forest regressor on the training split.
modelo2 = rfreg.fit(dataset=CarrosTreino)

In [0]:
# Score the test split with the fitted random-forest model.
previsao2 = modelo2.transform(dataset=CarrosTeste)

In [0]:
# Preview the first five random-forest predictions next to the true HP.
previsao2.show(n=5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     15|        8|        301|335|  [15.0,8.0,301.0]|175.68028846153845|
|    104|        8|        472|205| [104.0,8.0,472.0]|205.55624084249084|
|    143|        8|        360|245| [143.0,8.0,360.0]|209.62290750915753|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|193.24153124653122|
|    192|        8|        400|175| [192.0,8.0,400.0]|171.49168997668997|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows

