## Data preparation (standardizing)

In [1]:
import os
import numpy
import pyspark

In [2]:
from pyspark.sql import SparkSession


In [3]:
# Reuse the active SparkSession if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
os.getcwd()

'/Users/ameyapatankar/Desktop/SparkCourse/Spark Lynda/Spark ML'

In [5]:
# Root folder of the course material (same directory the kernel was started in).
path = "/Users/ameyapatankar/Desktop/SparkCourse/Spark Lynda/Spark ML"

In [6]:
# Load the employee file as CSV using its first line as column names.
# No schema inference is requested, so every column comes back as a string.
employee_csv = os.path.join(path, 'Exercise Files', 'Ch01', '01_04', 'employee.txt')
emp_df = spark.read.csv(employee_csv, header=True)

In [7]:
# Preview the frame; text values still carry the single quotes from the raw file.
emp_df.show()

+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
| id|   last_name|               email|  gender|   department|  start_date|salary|           job_title|region_id|
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
|  1|    'Kelley'|'rkelley0@soundcl...|'Female'|  'Computers'| '10/2/2009'| 67470|'Structural Engin...|        2|
|  2| 'Armstrong'|'sarmstrong1@info...|  'Male'|     'Sports'| '3/31/2008'| 71869| 'Financial Advisor'|        2|
|  3|      'Carr'|'fcarr2@woothemes...|  'Male'| 'Automotive'| '7/12/2009'|101768|'Recruiting Manager'|        3|
|  4|    'Murray'|   'jmurray3@gov.uk'|'Female'|   'Jewelery'|'12/25/2014'| 96897|'Desktop Support ...|        3|
|  5|     'Ellis'|'jellis4@scienced...|'Female'|    'Grocery'| '9/19/2002'| 63702|'Software Enginee...|        7|
|  6|  'Phillips'|'bphillips5@time....|  'Male'|      'Tools'| '8/21/2013'|118497|'Execu

In [10]:
# Every column is typed as string because the CSV was read without inferSchema.
emp_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- department: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- region_id: string (nullable = true)



In [12]:
from pyspark.sql.types import IntegerType
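# Scala reference for casting a string column to an integer column
# (PySpark equivalent: df.withColumn("year", df["year"].cast(IntegerType()))):
# val df2 = df.withColumn("yearTmp", df.year.cast(IntegerType))
#     .drop("year")
#     .withColumnRenamed("yearTmp", "year")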

In [5]:
from pyspark import SparkConf, SparkContext

In [6]:
# Only one SparkContext may be active per process. The SparkSession `spark`
# created earlier already owns one, so SparkContext(conf=conf) would raise
# ValueError on a top-to-bottom run. getOrCreate returns the running context
# when there is one and only builds a new one (with this conf) otherwise.
conf = SparkConf().setMaster("local").setAppName("Employees")
sc = SparkContext.getOrCreate(conf=conf)

In [17]:
def parseData(line):
    """Parse one comma-separated employee record into a typed tuple.

    Args:
        line: One data row of the employee file, with nine comma-separated
            fields in the order id, last_name, email, gender, department,
            start_date, salary, job_title, region_id.

    Returns:
        An 8-tuple ``(id, last_name, email, department, start_date, salary,
        job_title, region_id)``. ``id``, ``salary`` and ``region_id`` are
        converted to ``int``; the other items are the raw field text,
        including any quote characters present in the input.

    Raises:
        ValueError: if id, salary or region_id is not an integer literal
            (this is what happens for the file's header row).
        IndexError: if the line has fewer than nine fields.
    """
    fields = line.split(',')
    # NOTE(review): gender (fields[3]) is left out to keep the existing
    # 8-tuple shape; confirm whether it should be part of the result.
    return (
        int(fields[0]),   # id
        fields[1],        # last_name
        fields[2],        # email
        fields[4],        # department
        fields[5],        # start_date
        int(fields[6]),   # salary
        fields[7],        # job_title
        int(fields[8]),   # region_id
    )

In [13]:
# Read the employee file as plain text: one RDD element per line, header line included.
employee_txt = os.path.join(path, 'Exercise Files', 'Ch01', '01_04', 'employee.txt')
rdd = sc.textFile(employee_txt)

In [14]:
type(rdd)

pyspark.rdd.RDD

In [15]:
rdd.take(5)

['id,last_name,email,gender,department,start_date,salary,job_title,region_id',
 "1,'Kelley','rkelley0@soundcloud.com','Female','Computers','10/2/2009',67470,'Structural Engineer',2",
 "2,'Armstrong','sarmstrong1@infoseek.co.jp','Male','Sports','3/31/2008',71869,'Financial Advisor',2",
 "3,'Carr','fcarr2@woothemes.com','Male','Automotive','7/12/2009',101768,'Recruiting Manager',3",
 "4,'Murray','jmurray3@gov.uk','Female','Jewelery','12/25/2014',96897,'Desktop Support Technician',3"]

In [18]:
# The first line of the file is the column header. parseData would raise
# ValueError on it (int('id')) as soon as an action runs, so drop it first.
header = rdd.first()
data = rdd.filter(lambda line: line != header).map(parseData)

In [20]:
data

PythonRDD[6] at RDD at PythonRDD.scala:53

In [8]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

In [10]:
# Three toy rows whose feature components sit on very different scales,
# used to demonstrate min-max scaling.
feature_rows = [
    (1, Vectors.dense([10, 10000, 1])),
    (2, Vectors.dense([20, 30000, 2])),
    (3, Vectors.dense([30, 40000, 3])),
]
fet_df = spark.createDataFrame(feature_rows, ['id', 'features'])
                                
                             

In [12]:
fet_df.show()

+---+------------------+
| id|          features|
+---+------------------+
|  1|[10.0,10000.0,1.0]|
|  2|[20.0,30000.0,2.0]|
|  3|[30.0,40000.0,3.0]|
+---+------------------+



In [13]:
# Configure a min-max scaler that reads 'features' and writes 'sfeatures',
# then learn the per-component minimum and maximum from the toy frame.
feature_scalar = (
    MinMaxScaler()
    .setInputCol('features')
    .setOutputCol('sfeatures')
)
smodel = feature_scalar.fit(fet_df)

In [16]:
# Apply the fitted scaler; adds an 'sfeatures' column next to the original 'features'.
sfeatures_df = smodel.transform(fet_df)

In [17]:
sfeatures_df.show()

+---+------------------+--------------------+
| id|          features|           sfeatures|
+---+------------------+--------------------+
|  1|[10.0,10000.0,1.0]|       [0.0,0.0,0.0]|
|  2|[20.0,30000.0,2.0]|[0.5,0.6666666666...|
|  3|[30.0,40000.0,3.0]|       [1.0,1.0,1.0]|
+---+------------------+--------------------+



## K-Means Clustering

In [32]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import BisectingKMeans

In [19]:
# Location of the clustering exercise file, relative to the course root in `path`.
kmeans_data = os.path.join(path, 'Exercise Files', 'Ch03', '03_02', 'clustering_dataset.csv')

In [20]:
# Read the clustering CSV with a header row; inferSchema asks Spark to derive
# column types from the data instead of treating every column as a string.
cluster_df = spark.read.csv(kmeans_data, header=True,inferSchema=True)

In [21]:
cluster_df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   7|   4|   1|
|   7|   7|   9|
|   7|   9|   6|
|   1|   6|   5|
|   6|   7|   7|
|   7|   9|   4|
|   7|  10|   6|
|   7|   8|   2|
|   8|   3|   8|
|   4|  10|   5|
|   7|   4|   5|
|   7|   8|   4|
|   2|   5|   1|
|   2|   6|   2|
|   2|   3|   8|
|   3|   9|   1|
|   4|   2|   9|
|   1|   7|   1|
|   6|   2|   3|
|   4|   1|   9|
+----+----+----+
only showing top 20 rows



In [22]:
# Pack col1..col3 into a single vector column named 'features',
# then append that column to the clustering frame.
vectorAssembler = VectorAssembler(
    inputCols=['col1', 'col2', 'col3'],
    outputCol='features',
)
vcluster_df = vectorAssembler.transform(cluster_df)

In [24]:
vcluster_df.show(5)

+----+----+----+-------------+
|col1|col2|col3|     features|
+----+----+----+-------------+
|   7|   4|   1|[7.0,4.0,1.0]|
|   7|   7|   9|[7.0,7.0,9.0]|
|   7|   9|   6|[7.0,9.0,6.0]|
|   1|   6|   5|[1.0,6.0,5.0]|
|   6|   7|   7|[6.0,7.0,7.0]|
+----+----+----+-------------+
only showing top 5 rows



In [26]:
# K-means estimator with three clusters; the fixed seed makes initialisation repeatable.
kmeans = KMeans().setK(3).setSeed(1)

In [27]:
# Fit k-means on the assembled frame (the estimator reads the 'features' column).
kmodel = kmeans.fit(vcluster_df)

In [30]:
# One NumPy array per cluster, each holding the centre's (col1, col2, col3) coordinates.
centers = kmodel.clusterCenters()

In [31]:
centers

[array([35.88461538, 31.46153846, 34.42307692]),
 array([5.12, 5.84, 4.84]),
 array([80.        , 79.20833333, 78.29166667])]

In [36]:
# Bisecting (hierarchical, top-down) k-means with the same k and seed as the KMeans run.
bikMeans = BisectingKMeans(k=3, seed=1)

In [37]:
# Fit bisecting k-means on the same assembled vectors used for KMeans.
bkmodel = bikMeans.fit(vcluster_df)

In [39]:
# On this dataset the centres match the KMeans centres; only their order differs.
bkmodel.clusterCenters()

[array([5.12, 5.84, 4.84]),
 array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667])]

## Classification

In [60]:
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer

In [70]:
# The iris file has no header row, so Spark names the columns _c0 .. _c4;
# inferSchema asks Spark to derive the column types from the data.
iris_path = os.path.join(path, 'iris.txt')
iris_df = spark.read.csv(iris_path, inferSchema=True)

In [71]:
iris_df.show()

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
|5.4|3.7|1.5|0.2|Iris-setosa|
|4.8|3.4|1.6|0.2|Iris-setosa|
|4.8|3.0|1.4|0.1|Iris-setosa|
|4.3|3.0|1.1|0.1|Iris-setosa|
|5.8|4.0|1.2|0.2|Iris-setosa|
|5.7|4.4|1.5|0.4|Iris-setosa|
|5.4|3.9|1.3|0.4|Iris-setosa|
|5.1|3.5|1.4|0.3|Iris-setosa|
|5.7|3.8|1.7|0.3|Iris-setosa|
|5.1|3.8|1.5|0.3|Iris-setosa|
+---+---+---+---+-----------+
only showing top 20 rows



In [72]:
# Give the five positional columns meaningful names. toDF renames by position,
# so re-running this cell is harmless; selecting col("_c0") ... would fail on
# a second run because the _c* names no longer exist after the first rename.
iris_df = iris_df.toDF(
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
)

In [73]:
iris_df.show(4)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 4 rows



In [74]:
# Assemble the four measurements into one 'features' vector column; species stays separate.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
    .setOutputCol('features')
)

In [75]:
# Append the assembled 'features' vector to the renamed iris frame.
viris_df = vectorAssembler.transform(iris_df)

In [76]:
viris_df.show(4)

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
+------------+-----------+------------+-----------+-----------+-----------------+
only showing top 4 rows



In [77]:
# Encode the species string as a numeric 'label' column
# (Iris-setosa becomes 0.0 in the preview below).
indexer = StringIndexer(inputCol='species', outputCol='label')
indexer_model = indexer.fit(viris_df)
iviris_df = indexer_model.transform(viris_df)

In [78]:
iviris_df.show(4)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 4 rows



In [79]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [82]:
# Roughly 60 % train / 40 % test split; the fixed seed keeps it repeatable.
splits = iviris_df.randomSplit([0.6, 0.4], seed=1)
train_df, test_df = splits

In [85]:
train_df.count()

92

In [87]:
# Multinomial naive Bayes trained on the default 'features' / 'label' columns.
# NOTE(review): the multinomial variant is meant for count-like features, while
# these are continuous measurements; accuracy below is about 0.59. Consider
# modelType='gaussian' if the installed Spark version supports it — TODO confirm.
nb = NaiveBayes().setModelType('multinomial')
nbmodel = nb.fit(train_df)

In [88]:
# Score the held-out rows; adds rawPrediction, probability and prediction columns.
pred_df = nbmodel.transform(test_df)

In [92]:
pred_df.take(1)

[Row(sepal_length=4.5, sepal_width=2.3, petal_length=1.3, petal_width=0.3, species='Iris-setosa', features=DenseVector([4.5, 2.3, 1.3, 0.3]), label=0.0, rawPrediction=DenseVector([-10.3605, -11.0141, -11.7112]), probability=DenseVector([0.562, 0.2924, 0.1456]), prediction=0.0)]

In [96]:
# Accuracy evaluator that compares the 'prediction' column with 'label'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [97]:
# Naive Bayes accuracy on the held-out test split.
evaluator.evaluate(pred_df)

0.5862068965517241

In [98]:
iviris_df.show(3)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 3 rows



In [102]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [103]:
# Layer sizes for the perceptron: 4 inputs (one per feature), two hidden
# layers of 5 units each, and 3 outputs (one per species class).
layers = [4,5,5,3]

In [105]:
# Multilayer perceptron with the layer sizes above; seed fixed for repeatable weights.
mlp = (
    MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setSeed(1)
)

In [106]:
# Train on the same training split that was used for naive Bayes.
mlp_model = mlp.fit(train_df)

In [107]:
# Score the same held-out split so the two classifiers can be compared directly.
mlp_predictions = mlp_model.transform(test_df)

In [108]:
mlp_predictions.take(1)

[Row(sepal_length=4.5, sepal_width=2.3, petal_length=1.3, petal_width=0.3, species='Iris-setosa', features=DenseVector([4.5, 2.3, 1.3, 0.3]), label=0.0, rawPrediction=DenseVector([42.0244, 0.4271, -44.8627]), probability=DenseVector([1.0, 0.0, 0.0]), prediction=0.0)]

In [109]:
# Accuracy of the perceptron on the test split, using the evaluator's
# default 'label' and 'prediction' column names.
mlp_evaluator = MulticlassClassificationEvaluator().setMetricName('accuracy')
mlp_accuracy = mlp_evaluator.evaluate(mlp_predictions)

In [110]:
mlp_accuracy

0.9482758620689655

## Linear Regression

In [111]:
# Power-plant dataset: header row present, column types inferred from the data.
power_csv = os.path.join(path, 'CCPP', 'power.csv')
pp_df = spark.read.csv(power_csv, header=True, inferSchema=True)

In [113]:
pp_df.show(4)

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH|    PE|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
+-----+-----+-------+-----+------+
only showing top 4 rows



In [115]:
pp_df.printSchema()

root
 |-- AT: double (nullable = true)
 |-- V: double (nullable = true)
 |-- AP: double (nullable = true)
 |-- RH: double (nullable = true)
 |-- PE: double (nullable = true)



In [118]:
# The four predictor columns become the 'features' vector; PE stays out as the target.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(['AT', 'V', 'AP', 'RH'])
    .setOutputCol('features')
)

In [119]:
# Append the assembled predictor vector to the power-plant frame.
vpp_df = vectorAssembler.transform(pp_df)

In [120]:
vpp_df.show(4)

+-----+-----+-------+-----+------+--------------------+
|   AT|    V|     AP|   RH|    PE|            features|
+-----+-----+-------+-----+------+--------------------+
|14.96|41.76|1024.07|73.17|463.26|[14.96,41.76,1024...|
|25.18|62.96|1020.04|59.08|444.37|[25.18,62.96,1020...|
| 5.11| 39.4|1012.16|92.14|488.56|[5.11,39.4,1012.1...|
|20.86|57.32|1010.24|76.64|446.48|[20.86,57.32,1010...|
+-----+-----+-------+-----+------+--------------------+
only showing top 4 rows



In [122]:
from pyspark.ml.regression import LinearRegression

In [124]:
# Ordinary linear regression of PE on the assembled predictors.
# The model is fitted on the full frame; no train/test split is made here.
lr = (
    LinearRegression()
    .setFeaturesCol('features')
    .setLabelCol('PE')
)
lr_model = lr.fit(vpp_df)

In [125]:
lr_model.coefficients

DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])

In [126]:
lr_model.intercept

454.6092744523414

In [127]:
# RMSE from the training summary, i.e. measured on the same rows the model was fitted on.
lr_model.summary.rootMeanSquaredError

4.557126016749488