In [48]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkApp").getOrCreate()

In [49]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [50]:
carros_temp = spark.read.csv("/home/anycaroliny/download/Carros.csv", inferSchema=True, header=True, sep=";")

In [51]:
carros_temp.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [52]:
carros = carros_temp.select("Consumo", "Cilindros", "Cilindradas", "HP")

In [53]:
carros.show()

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
|    181|        6|        225|105|
|    143|        8|        360|245|
|    244|        4|       1467| 62|
|    228|        4|       1408| 95|
|    192|        6|       1676|123|
|    178|        6|       1676|123|
|    164|        8|       2758|180|
|    173|        8|       2758|180|
|    152|        8|       2758|180|
|    104|        8|        472|205|
|    104|        8|        460|215|
|    147|        8|        440|230|
|    324|        4|        787| 66|
|    304|        4|        757| 52|
|    339|        4|        711| 65|
+-------+---------+-----------+---+
only showing top 20 rows



In [54]:
vector_caracteristicas = VectorAssembler(inputCols=[("Consumo"),("Cilindros"),("Cilindradas")], outputCol="specs")

In [55]:
vector_carros = vector_caracteristicas.transform(carros)

In [56]:
vector_carros.show()

+-------+---------+-----------+---+------------------+
|Consumo|Cilindros|Cilindradas| HP|             specs|
+-------+---------+-----------+---+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|    228|        4|        108| 93| [228.0,4.0,108.0]|
|    214|        6|        258|110| [214.0,6.0,258.0]|
|    187|        8|        360|175| [187.0,8.0,360.0]|
|    181|        6|        225|105| [181.0,6.0,225.0]|
|    143|        8|        360|245| [143.0,8.0,360.0]|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|
|    178|        6|       1676|123|[178.0,6.0,1676.0]|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|
|    104|        8|        472|205| [104.0,8.0,472.0]|
|    104| 

In [57]:
carros_treino, carros_teste = vector_carros.randomSplit([0.7, 0.3])

In [58]:
print(carros_treino.count())
print(carros_teste.count())

23
9


In [59]:
linear_reg = LinearRegression(featuresCol="specs", labelCol="HP")

In [60]:
modelo = linear_reg.fit(carros_treino)

23/07/24 15:36:27 WARN Instrumentation: [036b1a0a] regParam is zero, which might cause numerical instability and overfitting.
23/07/24 15:36:27 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/07/24 15:36:27 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/07/24 15:36:27 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [61]:
previsao = modelo.transform(carros_teste)

In [62]:
previsao.show(5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|             specs|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    104|        8|        460|215| [104.0,8.0,460.0]|219.09385008019643|
|    133|        8|        350|245| [133.0,8.0,350.0]|219.05506336487593|
|    152|        8|        304|150| [152.0,8.0,304.0]|218.69806414989236|
|    178|        6|       1676|123|[178.0,6.0,1676.0]| 136.8272942699164|
|    181|        6|        225|105| [181.0,6.0,225.0]|155.13470410906024|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows



In [65]:
avaliador = RegressionEvaluator(predictionCol="prediction", labelCol="HP", metricName="rmse")
# Root mean squared error
rmse = avaliador.evaluate(previsao)
print(rmse)

37.22133721541322


In [66]:
random_forest_reg = RandomForestRegressor(featuresCol="specs", labelCol="HP")
modelo2 = random_forest_reg.fit(carros_treino)
previsao2 = modelo2.transform(carros_teste)

23/07/24 15:44:53 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 23 (= number of training instances)


In [70]:
previsao.show(5)
previsao2.show(5)

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|             specs|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    104|        8|        460|215| [104.0,8.0,460.0]|219.09385008019643|
|    133|        8|        350|245| [133.0,8.0,350.0]|219.05506336487593|
|    152|        8|        304|150| [152.0,8.0,304.0]|218.69806414989236|
|    178|        6|       1676|123|[178.0,6.0,1676.0]| 136.8272942699164|
|    181|        6|        225|105| [181.0,6.0,225.0]|155.13470410906024|
+-------+---------+-----------+---+------------------+------------------+
only showing top 5 rows

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|             specs|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    104|        8|        460|215| [104.0,8.0,460.0]|         184.02125|
|    133|    

In [74]:
rmse2 = avaliador.evaluate(previsao2)
print(rmse)
print(rmse2)

37.22133721541322
31.98469121444896
