### MLLib 

In [1]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spSession = SparkSession.builder.master("local").appName("MLApp").getOrCreate()

In [3]:
carrosRDD = sc.textFile("data/carros.csv")

In [4]:
carrosRDD.cache()

data/carros.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
carrosRDD.count()

399

In [6]:
carrosRDD.take(5)

['MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME',
 '18,8,307,130,3504,12,70,chevrolet chevelle malibu',
 '15,8,350,165,3693,11.5,70,buick skylark 320',
 '18,8,318,150,3436,11,70,plymouth satellite',
 '16,8,304,150,3433,12,70,amc rebel sst']

In [7]:
# removendo header
carrosRDD2 = carrosRDD.filter(lambda line : "DISPLACEMENT" not in line)

In [8]:
carrosRDD2.count()

398

#### Limpeza de dados

In [9]:
# constant - valor padrão para average HP
mediaHP = sc.broadcast(75.8)

In [10]:
def limpaDados(inputStr):
        global mediaHP
        attList = inputStr.split(",")
        
        hpValue = attList[3]
        if hpValue == "?":
            hpValue = mediaHP.value
         
        linhas = Row(MPG = float(attList[0]), 
                    CYLINDERS =  float(attList[1]), 
                    DISPLACEMENT =  float(attList[2]), 
                    HORSEPOWER =  float(hpValue), 
                    WEIGHT =  float(attList[4]), 
                    ACCELERATION =  float(attList[5]), 
                    MODELYEAR =  float(attList[6]), 
                    NAME =  attList[7])
        return linhas
            

In [11]:
carrosRDD3 = carrosRDD2.map(limpaDados)
carrosRDD3.count()


398

In [12]:
carrosRDD3.take(5)

[Row(ACCELERATION=12.0, CYLINDERS=8.0, DISPLACEMENT=307.0, HORSEPOWER=130.0, MODELYEAR=70.0, MPG=18.0, NAME='chevrolet chevelle malibu', WEIGHT=3504.0),
 Row(ACCELERATION=11.5, CYLINDERS=8.0, DISPLACEMENT=350.0, HORSEPOWER=165.0, MODELYEAR=70.0, MPG=15.0, NAME='buick skylark 320', WEIGHT=3693.0),
 Row(ACCELERATION=11.0, CYLINDERS=8.0, DISPLACEMENT=318.0, HORSEPOWER=150.0, MODELYEAR=70.0, MPG=18.0, NAME='plymouth satellite', WEIGHT=3436.0),
 Row(ACCELERATION=12.0, CYLINDERS=8.0, DISPLACEMENT=304.0, HORSEPOWER=150.0, MODELYEAR=70.0, MPG=16.0, NAME='amc rebel sst', WEIGHT=3433.0),
 Row(ACCELERATION=10.5, CYLINDERS=8.0, DISPLACEMENT=302.0, HORSEPOWER=140.0, MODELYEAR=70.0, MPG=17.0, NAME='ford torino', WEIGHT=3449.0)]

### Análise Exploratória de Dados

In [13]:
carrosDF = spSession.createDataFrame(carrosRDD3)

In [14]:
carrosDF.select("MPG", "CYLINDERS").describe().show()

+-------+-----------------+------------------+
|summary|              MPG|         CYLINDERS|
+-------+-----------------+------------------+
|  count|              398|               398|
|   mean|23.51457286432161| 5.454773869346734|
| stddev|7.815984312565782|1.7010042445332125|
|    min|              9.0|               3.0|
|    max|             46.6|               8.0|
+-------+-----------------+------------------+



In [15]:
for i in carrosDF.columns:
    if not(isinstance(carrosDF.select(i).take(1)[0][0], str)):
        print("Correlação de MPG com", i, carrosDF.stat.corr("MPG", i))

Correlação de MPG com ACCELERATION 0.4202889121016501
Correlação de MPG com CYLINDERS -0.7753962854205548
Correlação de MPG com DISPLACEMENT -0.8042028248058979
Correlação de MPG com HORSEPOWER -0.7747055053130659
Correlação de MPG com MODELYEAR 0.5792671330833091
Correlação de MPG com MPG 1.0
Correlação de MPG com WEIGHT -0.8317409332443347


### Pré Processamento 

In [16]:
#from IPhyton.display import Image
#Image("images/vetores.png")


In [17]:
def transformaVar(row):
    obj = (row["MPG"], Vectors.dense([row["ACCELERATION"],row["DISPLACEMENT"],row["WEIGHT"]]))
    return obj

In [19]:
carrosRDD4 = carrosRDD3.map(transformaVar)

In [23]:
carrosDF = spSession.createDataFrame(carrosRDD4, ["label", "features"])

In [24]:
carrosDF.show(5)

+-----+-------------------+
|label|           features|
+-----+-------------------+
| 18.0|[12.0,307.0,3504.0]|
| 15.0|[11.5,350.0,3693.0]|
| 18.0|[11.0,318.0,3436.0]|
| 16.0|[12.0,304.0,3433.0]|
| 17.0|[10.5,302.0,3449.0]|
+-----+-------------------+
only showing top 5 rows



In [26]:
carrosRDD4.take(5)

[(18.0, DenseVector([12.0, 307.0, 3504.0])),
 (15.0, DenseVector([11.5, 350.0, 3693.0])),
 (18.0, DenseVector([11.0, 318.0, 3436.0])),
 (16.0, DenseVector([12.0, 304.0, 3433.0])),
 (17.0, DenseVector([10.5, 302.0, 3449.0]))]

### Machine Learning

In [28]:
(dados_treino, dados_teste) = carrosDF.randomSplit([0.7, 0.3])

In [31]:
dados_treino.count()

283

In [32]:
dados_teste.count()

115

In [33]:
linearReg = LinearRegression(maxIter = 10)
modelo = linearReg.fit(dados_treino)

In [34]:
print(modelo)

LinearRegression_697b7265fe4c


In [36]:
print("coefic: " + str(modelo.coefficients))
print("intercept: " + str(modelo.intercept))


coefic: [0.16728194142447705,-0.01085036084612367,-0.006218280932126904]
intercept: 41.52388714177826


In [39]:
predictions = modelo.transform(dados_teste)
predictions.show()

+-----+-------------------+------------------+
|label|           features|        prediction|
+-----+-------------------+------------------+
| 10.0|[14.0,360.0,4615.0]|11.262337915350756|
| 11.0|[11.0,350.0,3664.0]|16.782580865991246|
| 11.0|[13.5,318.0,4382.0]|13.083271557361279|
| 12.0|[11.0,455.0,4951.0]| 7.640365417500938|
| 12.0|[11.5,429.0,4952.0]| 7.999897489280265|
| 12.0|[12.5,350.0,4499.0]|11.841239199801997|
| 12.0|[13.5,350.0,4456.0]|12.275907221307932|
| 13.0|[11.0,360.0,3821.0]| 15.69780715118609|
| 13.0|[12.0,302.0,3169.0]| 20.54872918943248|
| 13.0|[13.0,350.0,4100.0]| 14.40597426243287|
| 13.0|[16.0,302.0,4294.0]|14.222290906487618|
| 14.0| [8.5,440.0,4312.0]|11.358397492260693|
| 14.0|[10.0,455.0,4425.0]|10.743899246375214|
| 14.0|[11.5,400.0,4464.0]|11.349079048695778|
| 14.0|[13.0,318.0,4096.0]|14.778058933237336|
| 14.0|[14.0,318.0,4077.0]|15.063488212372224|
| 15.0| [8.5,390.0,3850.0]|14.773761325209502|
| 15.0|[21.0,250.0,3432.0]|20.983077541101828|
| 15.5|[13.9,

In [40]:
avaliador = RegressionEvaluator(predictionCol = "prediction", labelCol="label", metricName ="r2")

In [41]:
avaliador.evaluate(predictions)

0.6972233895747204

In [None]:
# Melhorar!
# incluir outras variaveis
# mudar as variaveis
# mudar o split de treinamento e teste
# mudar a quantidade de iteracoes