# Mini Projeto - 5 

Previsões da força do concreto com base nas características

###### Construção de AutoML

In [2]:
# Runs before the pyspark imports in the next cell.
# NOTE(review): findspark.init() presumably locates the local Spark installation
# and makes the pyspark package importable — confirm against the findspark docs.
import findspark
findspark.init()

In [4]:
# Imports
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.stat import Correlation
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder # Otimização de hiperparâmetros

## Preparando ambiente SPARK

In [5]:
# Create the SparkContext, or reuse the one already running in this kernel.
# Calling SparkContext(appName=...) directly raises
# "Cannot run multiple SparkContexts at once" when this cell is executed a
# second time; getOrCreate() makes the cell safe to re-run.
# The app name is only applied when a new context is actually created.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName('Mini_Projeto5'))

In [7]:
# Obtain the SparkSession for this notebook: the builder hands back the
# already-active session when there is one and creates a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [14]:
# Bare expression as the last line of the cell: the notebook displays the session object.
spark

## Carregando Dataset 

In [10]:
# Load the concrete dataset from CSV: the first line supplies the column
# names (header) and Spark infers each column's type from the data (inferSchema).
dados = (
    spark.read
    .options(inferSchema = True, header = True)
    .csv("C:/formacao_dataScience_DSA_DADOS/01_bigData_RealTime_Python_Spark/cap14_Apache_Spark_MLlib/mini_projeto_5/dataset.csv")
)

In [11]:
# Print the first 5 rows as a text table.
dados.show(5)

+------+-----+------+-----+----------------+---------------+-------------+---+-----+
|cement| slag|flyash|water|superplasticizer|coarseaggregate|fineaggregate|age|csMPa|
+------+-----+------+-----+----------------+---------------+-------------+---+-----+
| 540.0|  0.0|   0.0|162.0|             2.5|         1040.0|        676.0| 28|79.99|
| 540.0|  0.0|   0.0|162.0|             2.5|         1055.0|        676.0| 28|61.89|
| 332.5|142.5|   0.0|228.0|             0.0|          932.0|        594.0|270|40.27|
| 332.5|142.5|   0.0|228.0|             0.0|          932.0|        594.0|365|41.05|
| 198.6|132.4|   0.0|192.0|             0.0|          978.4|        825.5|360| 44.3|
+------+-----+------+-----+----------------+---------------+-------------+---+-----+
only showing top 5 rows



In [13]:
# Total number of rows in the dataset.
dados.count()

1030

In [19]:
# Take the first 10 rows with the DataFrame API and convert them to a pandas
# DataFrame, which the notebook renders as a table.
dados.limit(10).toPandas()

# Utilizando o método '.toPandas()'

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365,43.7
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29


In [20]:
# Print each column's name, type and nullability.
dados.printSchema()

root
 |-- cement: double (nullable = true)
 |-- slag: double (nullable = true)
 |-- flyash: double (nullable = true)
 |-- water: double (nullable = true)
 |-- superplasticizer: double (nullable = true)
 |-- coarseaggregate: double (nullable = true)
 |-- fineaggregate: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- csMPa: double (nullable = true)



# Módulo de Automação da Preparação de Dados


O MLlib requer que todas as colunas de entrada do dataframe sejam vetorizadas. Vamos criar uma função Python que irá automatizar nosso trabalho de preparação dos dados, incluindo a vetorização e todas as demais tarefas necessárias.


Primeiro, vamos listar e remover valores ausentes.

In [21]:
# Drop every row that holds a null value in any column; when the data has
# no missing values the result keeps all rows.
dados_com_linhas_removidas = dados.dropna()

# Report the row count before and after the removal, one message per line.
print(
    f"Número de linhas antes de remover valores ausentes: {dados.count()}",
    f"Número de linhas após de remover valores ausentes: {dados_com_linhas_removidas.count()}",
    sep="\n",
)

Número de linhas antes de remover valores ausentes: 1030
Número de linhas após de remover valores ausentes: 1030


In [None]:
# Função para preparação dos dados

def func_modulo_prep_dados(df,
                          variaveis_entrada,
                          vaiavel_saida,
                          tratar_outliers = True,
                          padronizar_dados = True00):
    
    # Vamos gerar um novo dataframe, renomeando o argumento que representa a saída.
    novo_df = df.withColumnRenamed(variavel_saida, 'label')
    
    # Convertemos a variável alvo para o tipo numérico como float (encoding)
    