In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import VectorAssembler,MinMaxScaler,StandardScaler
from pyspark.ml.classification import LogisticRegression,OneVsRest, OneVsRestModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Local Spark configuration; "local[*]" runs Spark in-process on this machine.
conf = SparkConf().setMaster("local[*]")
# getOrCreate() reuses a context that is already running, so re-executing this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Obtain the SparkSession for this notebook (created on first use, reused
# afterwards) under the application name "MyApp".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

## Load the dataset

In [5]:
# Reader settings for the CSV source: first row is a header, column types are
# inferred from the data, and fields are separated by ';'.
csv_options = {
    "header": "true",
    "inferschema": "true",
    "delimiter": ";",
}

df = (
    spark.read
    .format("csv")
    .options(**csv_options)
    .load("Datasets/Data-1.csv")
)
# Optional: chain .repartition(3) after load() to redistribute the rows.

## Explore the dataset

In [6]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+----+----+----+---+-----+----+----+------+----+----+----+---+
| x_1| x_2| x_3|x_4|  x_5| x_6| x_7|   x_8| x_9|x_10|x_11|  y|
+----+----+----+---+-----+----+----+------+----+----+----+---+
| 7.4| 0.7| 0.0|1.9|0.076|11.0|34.0|0.9978|3.51|0.56| 9.4|  5|
| 7.8|0.88| 0.0|2.6|0.098|25.0|67.0|0.9968| 3.2|0.68| 9.8|  5|
| 7.8|0.76|0.04|2.3|0.092|15.0|54.0| 0.997|3.26|0.65| 9.8|  5|
|11.2|0.28|0.56|1.9|0.075|17.0|60.0| 0.998|3.16|0.58| 9.8|  6|
| 7.4| 0.7| 0.0|1.9|0.076|11.0|34.0|0.9978|3.51|0.56| 9.4|  5|
+----+----+----+---+-----+----+----+------+----+----+----+---+
only showing top 5 rows



In [7]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- x_1: double (nullable = true)
 |-- x_2: double (nullable = true)
 |-- x_3: double (nullable = true)
 |-- x_4: double (nullable = true)
 |-- x_5: double (nullable = true)
 |-- x_6: double (nullable = true)
 |-- x_7: double (nullable = true)
 |-- x_8: double (nullable = true)
 |-- x_9: double (nullable = true)
 |-- x_10: double (nullable = true)
 |-- x_11: double (nullable = true)
 |-- y: integer (nullable = true)



In [8]:
# Compute the summary statistics in Spark, then convert the small result to
# pandas so the notebook renders it as a table.
summary_stats = df.describe()
summary_stats.toPandas()

Unnamed: 0,summary,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,y
0,count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
1,mean,8.319637273295838,0.5278205128205131,0.2709756097560964,2.538805503439652,0.0874665415884925,15.87492182614134,46.46779237023139,0.9967466791744832,3.311113195747343,0.6581488430268921,10.422983114446502,5.636022514071295
2,stddev,1.7410963181276948,0.1790597041535352,0.1948011374053182,1.40992805950728,0.04706530201009,10.46015696980971,32.89532447829907,0.0018873339538427,0.1543864649035427,0.1695069795901101,1.0656675818473935,0.8075694397347051
3,min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
4,max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
# List the distinct values of the target column "y" in ascending order.
(
    df.select("y")
    .distinct()
    .orderBy("y")
    .show()
)

+---+
|  y|
+---+
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+



In [10]:
# Count the rows for each target value to see how the classes are distributed.
class_counts = df.groupBy("y").count()
class_counts.orderBy("y").show()

+---+-----+
|  y|count|
+---+-----+
|  3|   10|
|  4|   53|
|  5|  681|
|  6|  638|
|  7|  199|
|  8|   18|
+---+-----+



## Apply Vector Assembler

In [11]:
# Every column except the last one is used as an input feature; the last
# column ("y" in this dataset) is the target.
features = df.columns[:-1]
features

['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11']

In [12]:
# Combine the individual feature columns into one vector column, "features".
assembler = VectorAssembler(inputCols=features, outputCol="features")

In [13]:
# Build the feature vector, then keep only that vector and the target column,
# renaming "y" to "label".
assembled = assembler.transform(df)
df_v = assembled.select(
    col("features"),
    col("y").alias("label"),
)

df_v.show(n=10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.4,0.7,0.0,1.9,...|    5|
|[7.8,0.88,0.0,2.6...|    5|
|[7.8,0.76,0.04,2....|    5|
|[11.2,0.28,0.56,1...|    6|
|[7.4,0.7,0.0,1.9,...|    5|
|[7.4,0.66,0.0,1.8...|    5|
|[7.9,0.6,0.06,1.6...|    5|
|[7.3,0.65,0.0,1.2...|    7|
|[7.8,0.58,0.02,2....|    7|
|[7.5,0.5,0.36,6.1...|    5|
+--------------------+-----+
only showing top 10 rows



## Split into train and test sets

In [14]:
# 70/30 train/test split. The fixed seed keeps the split -- and every metric
# derived from it -- the same across kernel restarts; without it each run
# samples different rows and reports a different accuracy.
trainingData, testData = df_v.randomSplit([0.7, 0.3], seed=42)

## Scale the features

In [15]:
# Min-max scaler that reads the "features" vector and writes "scaledFeatures".
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

In [16]:
# Fit the scaler on the training split only, so the test split has no
# influence on the learned scaling parameters.
s = scaler.fit(trainingData)

In [17]:
# Apply the fitted scaler to both splits.
trainingData_s, testData_s = (
    s.transform(split) for split in (trainingData, testData)
)

## Train the model

In [18]:
# Base classifier: logistic regression on the scaled features, capped at
# 100 optimisation iterations.
lr = LogisticRegression(maxIter=100, featuresCol="scaledFeatures")

In [19]:
# Wrap the logistic regression in a one-vs-rest multiclass reduction.
ovr = OneVsRest(classifier=lr)

In [20]:
# Train the one-vs-rest model on the scaled training split.
model = ovr.fit(trainingData_s)

## Make predictions

In [21]:
# Score the scaled test split and preview the first few predictions.
predictions = model.transform(testData_s)
predictions.show(n=5)
# To inspect every prediction as a pandas frame: predictions.toPandas()

+--------------------+-----+--------------------+----------+
|            features|label|      scaledFeatures|prediction|
+--------------------+-----+--------------------+----------+
|[5.0,0.4,0.5,4.3,...|    6|[0.03539823008849...|       7.0|
|[5.0,0.42,0.24,2....|    8|[0.03539823008849...|       7.0|
|[5.0,1.04,0.24,1....|    5|[0.03539823008849...|       5.0|
|[5.1,0.42,0.0,1.8...|    7|[0.04424778761061...|       7.0|
|[5.1,0.585,0.0,1....|    7|[0.04424778761061...|       6.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows



In [22]:
# Contingency table of true label (rows) against predicted class (columns).
confusion = predictions.crosstab("label", "prediction")
confusion.orderBy("label_prediction").show()

+----------------+---+---+---+
|label_prediction|5.0|6.0|7.0|
+----------------+---+---+---+
|               3|  2|  1|  0|
|               4| 10|  1|  0|
|               5|149| 32|  2|
|               6| 72| 84| 21|
|               7|  2| 27| 15|
|               8|  0|  2|  4|
+----------------+---+---+---+



## Evaluate the model

In [23]:
# Evaluator that compares "prediction" against "label" and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [24]:
# Accuracy of the model's predictions on the test split.
accuracy = evaluator.evaluate(predictions)
accuracy

0.5849056603773585

## Save and load the model

In [25]:
# overwrite() keeps this cell re-runnable: a plain save() raises an error when
# the "LR-Model" directory already exists from a previous run.
model.write().overwrite().save("LR-Model")

In [26]:
# Reload the persisted one-vs-rest model from the "LR-Model" directory.
model_2 = OneVsRestModel.load("LR-Model")

In [27]:
# Score the same test split with the reloaded model, so its output can be
# compared with the predictions of the original model.
predictions_2 = model_2.transform(testData_s)
predictions_2.show(n=5)
# To inspect every prediction as a pandas frame: predictions_2.toPandas()

+--------------------+-----+--------------------+----------+
|            features|label|      scaledFeatures|prediction|
+--------------------+-----+--------------------+----------+
|[5.0,0.4,0.5,4.3,...|    6|[0.03539823008849...|       7.0|
|[5.0,0.42,0.24,2....|    8|[0.03539823008849...|       7.0|
|[5.0,1.04,0.24,1....|    5|[0.03539823008849...|       5.0|
|[5.1,0.42,0.0,1.8...|    7|[0.04424778761061...|       7.0|
|[5.1,0.585,0.0,1....|    7|[0.04424778761061...|       6.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows

