# Componente de machine learning do Spark MLlib
* Acesso a dados
* Regressão, classificação e agrupamento com Spark

In [None]:
from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession

# ponto de entrada - sessão spark

In [None]:
# Entry point: create the SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# exemplo de operação com dataframe

In [None]:
# Load the JSON file into a DataFrame and display its first rows.
df = spark.read.format("json").load("pessoas.json")
df.show()

# Algumas operações com dataframe

In [None]:
# Show the schema of the DataFrame.
df.printSchema()

# Project only the "nome" column.
df.select(df.nome).show()

# Every row, with 1 added to "idade".
df.select("nome", df.idade + 1).show()

# Keep only the rows whose "idade" is greater than 21.
df.where(df.idade > 21).show()

# Number of rows for each distinct "idade".
df.groupBy(df.idade).count().show()

# Regressão linear com PySpark
* Preparação de dados

In [None]:
# Load the CSV: comma-separated, first line is the header, column types inferred.
training = (
    spark.read.format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("Auto2.csv")
)

training.show()

# Regressão linear com PySpark
* Indexando para coluna categórica

In [None]:
from pyspark.ml.feature import StringIndexer

# Index the categorical column "origin" into a new column "origin_cat".
indexer = StringIndexer().setInputCol('origin').setOutputCol('origin_cat')
indexer_model = indexer.fit(training)
indexed = indexer_model.transform(training)
indexed.show()

# Regressão linear com PySpark
* Separando entre features e target

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# inputCols names the feature columns that are packed into the single
# "features" vector column; "mpg" stays separate as the regression target.
feature_columns = [
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'year',
    'origin_cat',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

output = assembler.transform(indexed)
output.select(['features', 'mpg']).show(5)

# Regressão linear com PySpark
* Separa em treino e teste

In [None]:

# Keep only the assembled feature vector and the regression target.
final_data = output.select('features', 'mpg')

# Split roughly 70% / 30% into train and test. The seed is fixed so the same
# split (and therefore the same metrics below) is produced on every run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=1)

# Summary statistics of the training split.
train_data.describe().show()

# Regressão linear com PySpark
* Executa o modelo

In [None]:

from pyspark.ml.regression import LinearRegression

# Linear regression of "mpg" on the assembled "features" vector.
lr = LinearRegression(featuresCol='features', labelCol='mpg')

model = lr.fit(train_data)

# R² on the same rows the model was fitted on.
results = model.evaluate(train_data)
print('Rsquared :', results.r2)

# R² on the held-out split, which was not used for fitting; this is the
# figure to look at when judging how well the model generalizes.
test_results = model.evaluate(test_data)
print('Rsquared (test) :', test_results.r2)

In [None]:
# Keep only the feature vectors of the test split (the label column is left out).
unlabeled_data = test_data.select(['features'])
unlabeled_data.show(n=5)

In [None]:
# Score the unlabeled test rows with the fitted model and display the result.
predictions = model.transform(dataset=unlabeled_data)
predictions.show(n=20)

# Classificação com PySpark

In [None]:
# Load the CSV: comma-separated, first line is the header, column types inferred.
diab = (
    spark.read.format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("pima-indians-diabetes.csv")
)

diab.show()

In [None]:
# Feature columns packed into the single "features" vector column;
# "Class" stays separate as the label.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

output = assembler.transform(diab)
output.select(['features', 'Class']).show(5)

In [None]:
# Keep only the assembled feature vector and the label.
final_data = output.select('features', 'Class')

# Split roughly 70% / 30% into train and test. The seed is fixed so the same
# split (and therefore the same metrics below) is produced on every run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=1)

# Summary statistics of the training split.
train_data.describe().show()

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression of "Class" on the assembled "features" vector.
logr = LogisticRegression(featuresCol='features', labelCol='Class')

model = logr.fit(train_data)

# Accuracy on the same rows the model was fitted on. (R² is a regression
# metric and is not part of the logistic-regression summary.)
results = model.evaluate(train_data)
print('Accuracy (train) :', results.accuracy)

# Accuracy on the held-out split, which was not used for fitting.
test_results = model.evaluate(test_data)
print('Accuracy (test) :', test_results.accuracy)

In [None]:
# Score the test split and display the label next to the model outputs.
predictions = model.transform(test_data)
shown_columns = ['Class', 'rawPrediction', 'prediction', 'probability']
predictions.select(shown_columns).show(50)

# K-means com Spark

In [2]:
import numpy as np

import matplotlib.pyplot as plt

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

import datetime

# Create the SparkSession, or reuse one that already exists.
spark = SparkSession.builder.appName('SparkKmeans').getOrCreate()

# Load the CSV: comma-separated, first line is the header, column types inferred.
df2 = spark.read.load("/home/silvio/dataset/minute_weather.csv",
                      format="csv", sep=",", inferSchema="true", header="true")

# Drop the two columns that are not used as clustering features.
df = df2.drop("rowID", "hpwren_timestamp")

# Replace missing values with 0 before the columns are assembled.
df = df.fillna(0)

# Start of the timed section.
B = datetime.datetime.now()

cost = []
vecAssembler = VectorAssembler(inputCols=df.columns, outputCol="features")
# The assembled DataFrame is fitted once per value of k below; caching it
# avoids re-reading the CSV and re-assembling the vectors on every fit.
vector_df = vecAssembler.transform(df).cache()

# NOTE(review): the label says "silhoute", but the loop below records the
# KMeans training cost, not a silhouette score. The string is left unchanged.
print('teste silhoute')

# Fit one model per k in 2..14 and record its training cost.
K = range(2, 15)
for k in K:
    kmeans = KMeans().setK(k).setSeed(1)
    model = kmeans.fit(vector_df)
    cost.append(model.summary.trainingCost)

# End of the timed section: print the elapsed time and the cost per k.
E = datetime.datetime.now()
print(E - B)
print(cost)

teste silhoute
teste silhoute
teste silhoute
teste silhoute
0:01:01.698701
[31674549689.40811, 24087331264.325245, 19794348925.613716, 15439516264.466814, 9412087755.858288, 5057262331.923229, 3956913265.8476677, 3363326457.9555845, 3244683515.5098653, 2985889535.915539, 3432965682.942521, 2582613120.354102, 2266288466.903803]
0:01:01.698701
[31674549689.40811, 24087331264.325245, 19794348925.613716, 15439516264.466814, 9412087755.858288, 5057262331.923229, 3956913265.8476677, 3363326457.9555845, 3244683515.5098653, 2985889535.915539, 3432965682.942521, 2582613120.354102, 2266288466.903803]
0:01:01.698701
[31674549689.40811, 24087331264.325245, 19794348925.613716, 15439516264.466814, 9412087755.858288, 5057262331.923229, 3956913265.8476677, 3363326457.9555845, 3244683515.5098653, 2985889535.915539, 3432965682.942521, 2582613120.354102, 2266288466.903803]
0:01:01.698701
[31674549689.40811, 24087331264.325245, 19794348925.613716, 15439516264.466814, 9412087755.858288, 5057262331.923229, 