In [1]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the local SparkSession used by the rest of the notebook.
spSession = (
    SparkSession.builder
    .master("local")
    .appName("DSA")
    .getOrCreate()
)

In [3]:
# Load the raw CSV as an RDD of text lines. The SparkContext is taken from the
# session created in the previous cell instead of relying on a kernel-injected
# `sc` global, so the cell also works on a fresh, plain Python kernel.
irisRDD = spSession.sparkContext.textFile("data/iris.csv")

In [4]:
# Cache the raw lines: the next cells run several actions on this same RDD.
irisRDD.cache()

data/iris.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
# Number of lines read from the file (the header line is still included here).
irisRDD.count()

151

In [6]:
# Peek at the first lines to check the raw format.
irisRDD.take(5)

['Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species',
 '5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa']

In [7]:
# Drop the CSV header by comparing each line against the actual first line of
# the file, rather than matching the substring "Sepal" (which would also
# discard any data row that happened to contain that text). Blank lines are
# skipped too, because they would make the float() parsing further down fail.
# take(1) is used instead of first() so an empty file yields an empty RDD
# instead of raising.
header_lines = irisRDD.take(1)
irisRDD2 = irisRDD.filter(
    lambda line: line.strip() != "" and line not in header_lines
)
irisRDD2.count()

150

### Limpeza de Dados

In [8]:
# Split each CSV line into its list of string fields.
irisRDD3 = irisRDD2.map(lambda line: line.split(","))

In [9]:
def _to_iris_row(fields):
    """Build a typed Row from the string fields of one CSV record.

    The first four fields are converted to float and the fifth is kept as the
    species string. The field name PEPAL_WIDTH (sic) is kept unchanged because
    later cells read the column under that exact name.
    """
    return Row(
        SEPAL_LENGTH=float(fields[0]),
        SEPAL_WIDTH=float(fields[1]),
        PETAL_LENGTH=float(fields[2]),
        PEPAL_WIDTH=float(fields[3]),
        SPECIES=fields[4],
    )


irisRDD4 = irisRDD3.map(_to_iris_row)

In [10]:
# Inspect a few parsed rows.
irisRDD4.take(4)

[Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.4, SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, SPECIES='setosa'),
 Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.4, SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, SPECIES='setosa'),
 Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.3, SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, SPECIES='setosa'),
 Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.5, SEPAL_LENGTH=4.6, SEPAL_WIDTH=3.1, SPECIES='setosa')]

In [11]:
# Turn the RDD of Rows into a DataFrame and cache it for the cells that follow.
irisDF = spSession.createDataFrame(irisRDD4)
irisDF.cache()

DataFrame[PEPAL_WIDTH: double, PETAL_LENGTH: double, SEPAL_LENGTH: double, SEPAL_WIDTH: double, SPECIES: string]

In [12]:
# Tabular preview of the first three rows.
irisDF.show(3)

+-----------+------------+------------+-----------+-------+
|PEPAL_WIDTH|PETAL_LENGTH|SEPAL_LENGTH|SEPAL_WIDTH|SPECIES|
+-----------+------------+------------+-----------+-------+
|        0.2|         1.4|         5.1|        3.5| setosa|
|        0.2|         1.4|         4.9|        3.0| setosa|
|        0.2|         1.3|         4.7|        3.2| setosa|
+-----------+------------+------------+-----------+-------+
only showing top 3 rows



In [13]:
# The same three rows, returned as Row objects.
irisDF.take(3)

[Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.4, SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, SPECIES='setosa'),
 Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.4, SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, SPECIES='setosa'),
 Row(PEPAL_WIDTH=0.2, PETAL_LENGTH=1.3, SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, SPECIES='setosa')]

In [14]:
# Create a numeric index for the target column (SPECIES -> IDX_SPECIES).
stringIndexer = StringIndexer(inputCol = "SPECIES", outputCol = "IDX_SPECIES")

In [15]:
# Fit the indexer on the DataFrame to obtain the species-to-index model.
si_model = stringIndexer.fit(irisDF)

In [16]:
# Apply the fitted indexer; the result carries the extra IDX_SPECIES column.
irisNormDT = si_model.transform(irisDF)

In [17]:
# Show only the rows whose species index is neither 2 nor 0.
irisNormDT.filter("IDX_SPECIES not in  (2,0)").show()

+-----------+------------+------------+-----------+---------+-----------+
|PEPAL_WIDTH|PETAL_LENGTH|SEPAL_LENGTH|SEPAL_WIDTH|  SPECIES|IDX_SPECIES|
+-----------+------------+------------+-----------+---------+-----------+
|        2.5|         6.0|         6.3|        3.3|virginica|        1.0|
|        1.9|         5.1|         5.8|        2.7|virginica|        1.0|
|        2.1|         5.9|         7.1|        3.0|virginica|        1.0|
|        1.8|         5.6|         6.3|        2.9|virginica|        1.0|
|        2.2|         5.8|         6.5|        3.0|virginica|        1.0|
|        2.1|         6.6|         7.6|        3.0|virginica|        1.0|
|        1.7|         4.5|         4.9|        2.5|virginica|        1.0|
|        1.8|         6.3|         7.3|        2.9|virginica|        1.0|
|        1.8|         5.8|         6.7|        2.5|virginica|        1.0|
|        2.5|         6.1|         7.2|        3.6|virginica|        1.0|
|        2.0|         5.1|         6.5

In [18]:
# List the distinct (species, index) pairs to see which index each species got.
irisNormDT.select("SPECIES", "IDX_SPECIES").distinct().collect()

[Row(SPECIES='versicolor', IDX_SPECIES=0.0),
 Row(SPECIES='setosa', IDX_SPECIES=2.0),
 Row(SPECIES='virginica', IDX_SPECIES=1.0)]

### Análise Exploratória de Dados

In [19]:
# Summary statistics (count, mean, stddev, min, max) for every column.
irisNormDT.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+------------------+
|summary|       PEPAL_WIDTH|      PETAL_LENGTH|      SEPAL_LENGTH|       SEPAL_WIDTH|  SPECIES|       IDX_SPECIES|
+-------+------------------+------------------+------------------+------------------+---------+------------------+
|  count|               150|               150|               150|               150|      150|               150|
|   mean|1.1993333333333331| 3.758000000000001| 5.843333333333332|3.0573333333333337|     null|               1.0|
| stddev|0.7622376689603467|1.7652982332594662|0.8280661279778634|0.4358662849366978|     null|0.8192319205190404|
|    min|               0.1|               1.0|               4.3|               2.0|   setosa|               0.0|
|    max|               2.5|               6.9|               7.9|               4.4|virginica|               2.0|
+-------+------------------+------------------+------------------+--------------

In [20]:
# Correlation between the encoded target and every non-string column.
# String columns are skipped using the DataFrame schema instead of fetching one
# row per column: this avoids a Spark job per column and does not break on an
# empty DataFrame or when the first value of a column happens to be null.
for nome_coluna, tipo_coluna in irisNormDT.dtypes:
    if tipo_coluna != "string":
        print("correlação IDX_SPECIES com ", nome_coluna, irisNormDT.stat.corr("IDX_SPECIES", nome_coluna))

correlação IDX_SPECIES com  PEPAL_WIDTH -0.5803770334306263
correlação IDX_SPECIES com  PETAL_LENGTH -0.649241830764174
correlação IDX_SPECIES com  SEPAL_LENGTH -0.46003915650023686
correlação IDX_SPECIES com  SEPAL_WIDTH 0.6183715308237433
correlação IDX_SPECIES com  IDX_SPECIES 1.0


### Pré-processamento dos dados

In [21]:
def transformaVar(row):
    """Reshape one indexed iris row into a (species, label, features) tuple.

    The species string and its numeric index are passed through unchanged,
    and the four measurements are packed into a dense feature vector in the
    order sepal length, sepal width, petal length, petal width. No column is
    dropped here: all four measurements end up in the vector.
    """
    feature_columns = ("SEPAL_LENGTH", "SEPAL_WIDTH", "PETAL_LENGTH", "PEPAL_WIDTH")
    features = Vectors.dense([row[column] for column in feature_columns])
    return (row["SPECIES"], row["IDX_SPECIES"], features)

In [22]:
# Apply transformaVar to every row, giving (species, index, feature-vector)
# tuples, then preview the first five.
irisRDD5 = irisNormDT.rdd.map(transformaVar)
irisRDD5.take(5)

[('setosa', 2.0, DenseVector([5.1, 3.5, 1.4, 0.2])),
 ('setosa', 2.0, DenseVector([4.9, 3.0, 1.4, 0.2])),
 ('setosa', 2.0, DenseVector([4.7, 3.2, 1.3, 0.2])),
 ('setosa', 2.0, DenseVector([4.6, 3.1, 1.5, 0.2])),
 ('setosa', 2.0, DenseVector([5.0, 3.6, 1.4, 0.2]))]

In [23]:
# Build the modelling DataFrame with the "label" and "features" column names
# that the classifier below is configured to read.
# NOTE: this rebinds irisDF, which until now referred to the DataFrame of raw
# measurements created earlier in the notebook.
irisDF = spSession.createDataFrame(irisRDD5, ["species","label", "features"])
irisDF.show(5)
irisDF.cache()

+-------+-----+-----------------+
|species|label|         features|
+-------+-----+-----------------+
| setosa|  2.0|[5.1,3.5,1.4,0.2]|
| setosa|  2.0|[4.9,3.0,1.4,0.2]|
| setosa|  2.0|[4.7,3.2,1.3,0.2]|
| setosa|  2.0|[4.6,3.1,1.5,0.2]|
| setosa|  2.0|[5.0,3.6,1.4,0.2]|
+-------+-----+-----------------+
only showing top 5 rows



DataFrame[species: string, label: double, features: vector]

### Machine Learning

In [24]:
# 70/30 train/test split. A fixed seed is passed so that the partition, and
# every count and metric computed from it below, is the same on each
# Restart & Run All instead of changing from run to run.
(dados_treino, dados_teste) = irisDF.randomSplit([0.7, 0.3], seed=42)

In [25]:
# Number of rows in the training split.
dados_treino.count()

105

In [26]:
# Number of rows in the test split.
dados_teste.count()

45

In [27]:
# Train the model: a decision tree limited to depth 2, fitted on the
# training split.
classifier = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=2,
)
model = classifier.fit(dados_treino)

In [28]:
# Number of nodes in the fitted tree.
model.numNodes

5

In [29]:
# Depth of the fitted tree.
model.depth

2

In [30]:
# Predict on the test dataset and display the resulting DataFrame.
predictions = model.transform(dados_teste)
predictions.show()

+----------+-----+-----------------+--------------+--------------------+----------+
|   species|label|         features| rawPrediction|         probability|prediction|
+----------+-----+-----------------+--------------+--------------------+----------+
|    setosa|  2.0|[4.3,3.0,1.1,0.1]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.4,3.2,1.3,0.2]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.5,2.3,1.3,0.3]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.6,3.6,1.0,0.2]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.8,3.0,1.4,0.1]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.8,3.0,1.4,0.3]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.8,3.1,1.6,0.2]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.8,3.4,1.6,0.2]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|       2.0|
|    setosa|  2.0|[4.9,3.6,1.4,0.1]|[0.0,0.0,32.0]|       [0.0,0.0,1.0]|    

In [31]:
# First three prediction rows, with every column, as Row objects.
predictions.take(3)

[Row(species='setosa', label=2.0, features=DenseVector([4.3, 3.0, 1.1, 0.1]), rawPrediction=DenseVector([0.0, 0.0, 32.0]), probability=DenseVector([0.0, 0.0, 1.0]), prediction=2.0),
 Row(species='setosa', label=2.0, features=DenseVector([4.4, 3.2, 1.3, 0.2]), rawPrediction=DenseVector([0.0, 0.0, 32.0]), probability=DenseVector([0.0, 0.0, 1.0]), prediction=2.0),
 Row(species='setosa', label=2.0, features=DenseVector([4.5, 2.3, 1.3, 0.3]), rawPrediction=DenseVector([0.0, 0.0, 32.0]), probability=DenseVector([0.0, 0.0, 1.0]), prediction=2.0)]

In [32]:
### Avaliação do modelo

In [33]:
# Evaluator that scores the "prediction" column against "label" using accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [34]:
# Accuracy of the model on the test-set predictions.
evaluator.evaluate(predictions)

0.9111111111111111

In [35]:
# Confusion matrix: number of rows for each (label, prediction) pair.
predictions.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    8|
|  0.0|       1.0|    3|
|  2.0|       2.0|   18|
|  1.0|       0.0|    1|
|  0.0|       0.0|   15|
+-----+----------+-----+



In [36]:
# Prediction for a single instance.
# limit(1) already yields a one-row DataFrame, so the row is no longer
# collected to the driver and rebuilt into a new DataFrame. The result is
# bound to its own name so that `predictions` (the full test-set output read
# by the evaluation cells) is not overwritten, and re-running those cells
# afterwards still gives the same numbers.
df = dados_teste.limit(1)
single_prediction = model.transform(df)
single_prediction.show()

+-------+-----+-----------------+--------------+-------------+----------+
|species|label|         features| rawPrediction|  probability|prediction|
+-------+-----+-----------------+--------------+-------------+----------+
| setosa|  2.0|[4.3,3.0,1.1,0.1]|[0.0,0.0,32.0]|[0.0,0.0,1.0]|       2.0|
+-------+-----+-----------------+--------------+-------------+----------+

