In [1]:
from __future__ import print_function
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [2]:
if __name__ == "__main__":
    sc = SparkSession .builder\
        .appName("PCA")\
        .getOrCreate()

VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees.

In [11]:
df = sc.read.csv("Glass.data",header=True)

In [12]:
df.show()

+-----+-------+-----+----+----+-----+----+----+----+----+-----+
|index|     RI|   Na|  Mg|  Al|   Si|   K|  Ca|  Ba|  Fe|Class|
+-----+-------+-----+----+----+-----+----+----+----+----+-----+
|    1|1.52101|13.64|4.49|1.10|71.78|0.06|8.75|0.00|0.00|    1|
|    2|1.51761|13.89|3.60|1.36|72.73|0.48|7.83|0.00|0.00|    1|
|    3|1.51618|13.53|3.55|1.54|72.99|0.39|7.78|0.00|0.00|    1|
|    4|1.51766|13.21|3.69|1.29|72.61|0.57|8.22|0.00|0.00|    1|
|    5|1.51742|13.27|3.62|1.24|73.08|0.55|8.07|0.00|0.00|    1|
|    6|1.51596|12.79|3.61|1.62|72.97|0.64|8.07|0.00|0.26|    1|
|    7|1.51743|13.30|3.60|1.14|73.09|0.58|8.17|0.00|0.00|    1|
|    8|1.51756|13.15|3.61|1.05|73.24|0.57|8.24|0.00|0.00|    1|
|    9|1.51918|14.04|3.58|1.37|72.08|0.56|8.30|0.00|0.00|    1|
|   10|1.51755|13.00|3.60|1.36|72.99|0.57|8.40|0.00|0.11|    1|
|   11|1.51571|12.72|3.46|1.56|73.20|0.67|8.09|0.00|0.24|    1|
|   12|1.51763|12.80|3.66|1.27|73.01|0.60|8.56|0.00|0.00|    1|
|   13|1.51589|12.88|3.43|1.40|73.28|0.6

In [13]:
df.printSchema()

root
 |-- index: string (nullable = true)
 |-- RI: string (nullable = true)
 |-- Na: string (nullable = true)
 |-- Mg: string (nullable = true)
 |-- Al: string (nullable = true)
 |-- Si: string (nullable = true)
 |-- K: string (nullable = true)
 |-- Ca: string (nullable = true)
 |-- Ba: string (nullable = true)
 |-- Fe: string (nullable = true)
 |-- Class: string (nullable = true)



convert datatype of columns from string to float

In [15]:
from pyspark.sql.functions import col
new_data = df.select(*(col(c).cast("float").alias(c) for c in df.columns))

In [16]:
new_data.printSchema()

root
 |-- index: float (nullable = true)
 |-- RI: float (nullable = true)
 |-- Na: float (nullable = true)
 |-- Mg: float (nullable = true)
 |-- Al: float (nullable = true)
 |-- Si: float (nullable = true)
 |-- K: float (nullable = true)
 |-- Ca: float (nullable = true)
 |-- Ba: float (nullable = true)
 |-- Fe: float (nullable = true)
 |-- Class: float (nullable = true)



In [20]:
features = new_data.drop('Class')

In [21]:
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features")

In [22]:
output = assembler.transform(new_data)


In [24]:
output.select("features", "Class").show()

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[1.0,1.5210100412...|  1.0|
|[2.0,1.5176099538...|  1.0|
|[3.0,1.5161800384...|  1.0|
|[4.0,1.5176600217...|  1.0|
|[5.0,1.5174200534...|  1.0|
|[6.0,1.5159599781...|  1.0|
|[7.0,1.5174299478...|  1.0|
|[8.0,1.5175600051...|  1.0|
|[9.0,1.5191800594...|  1.0|
|[10.0,1.517549991...|  1.0|
|[11.0,1.515709996...|  1.0|
|[12.0,1.517629981...|  1.0|
|[13.0,1.515890002...|  1.0|
|[14.0,1.517480015...|  1.0|
|[15.0,1.517629981...|  1.0|
|[16.0,1.517609953...|  1.0|
|[17.0,1.517840027...|  1.0|
|[18.0,1.521960020...|  1.0|
|[19.0,1.519109964...|  1.0|
|[20.0,1.517349958...|  1.0|
+--------------------+-----+
only showing top 20 rows

