# PySpark: Transform Guide

In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Explicit schema for the housing CSV: every column is read as a nullable
# 32-bit float, in the order the columns appear in the file.
# NOTE(review): the assembled feature vectors displayed later in this notebook
# carry float32 rounding noise (e.g. 8.3252 shows up as 8.325200080871582);
# DoubleType would likely avoid that -- confirm before changing column types.
schema = StructType([
    StructField(column_name, FloatType(), nullable=True)
    for column_name in (
        "long",
        "lat",
        "medage",
        "totrooms",
        "totbdrms",
        "pop",
        "houshlds",
        "medinc",
        "medhv",
    )
])

# Location of the uploaded housing CSV.
file_location = "/FileStore/tables/casas.csv"

# Read the CSV with the explicit schema (no schema inference) and mark the
# resulting DataFrame for caching, since later cells reuse it.
df = (
    spark.read
    .schema(schema)
    .csv(file_location)
    .cache()
)
display(df)

long,lat,medage,totrooms,totbdrms,pop,houshlds,medinc,medhv
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0


In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns that get packed into a single ML vector column.
featureCols = [
    "totbdrms",
    "pop",
    "houshlds",
    "medinc",
]

# VectorAssembler appends a "features" vector column built from featureCols;
# the original columns are kept alongside it.
assembler = VectorAssembler(outputCol="features", inputCols=featureCols)
assembled_df = assembler.transform(df)
display(assembled_df)

long,lat,medage,totrooms,totbdrms,pop,houshlds,medinc,medhv,features
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,"Map(vectorType -> dense, length -> 4, values -> List(129.0, 322.0, 126.0, 8.325200080871582))"
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,"Map(vectorType -> dense, length -> 4, values -> List(1106.0, 2401.0, 1138.0, 8.301400184631348))"
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,"Map(vectorType -> dense, length -> 4, values -> List(190.0, 496.0, 177.0, 7.257400035858154))"
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,"Map(vectorType -> dense, length -> 4, values -> List(235.0, 558.0, 219.0, 5.643099784851074))"
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,"Map(vectorType -> dense, length -> 4, values -> List(280.0, 565.0, 259.0, 3.8461999893188477))"
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,"Map(vectorType -> dense, length -> 4, values -> List(213.0, 413.0, 193.0, 4.036799907684326))"
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,"Map(vectorType -> dense, length -> 4, values -> List(489.0, 1094.0, 514.0, 3.65910005569458))"
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,"Map(vectorType -> dense, length -> 4, values -> List(687.0, 1157.0, 647.0, 3.119999885559082))"
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,"Map(vectorType -> dense, length -> 4, values -> List(665.0, 1206.0, 595.0, 2.080399990081787))"
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,"Map(vectorType -> dense, length -> 4, values -> List(707.0, 1551.0, 714.0, 3.691200017929077))"


In [0]:
# Scale each feature to unit standard deviation. The mean is NOT subtracted
# (withMean=False is the StandardScaler default), which is why the scaled
# values shown below stay positive.
standardScaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
    withMean=False,
    withStd=True,
)

# Fit the per-feature statistics on the assembled data, then apply them to the
# same DataFrame to add the "features_scaled" column.
scaler_model = standardScaler.fit(assembled_df)
scaled_df = scaler_model.transform(assembled_df)

# Show raw and scaled vectors side by side.
display(scaled_df.select("features", "features_scaled"))

features,features_scaled
"Map(vectorType -> dense, length -> 4, values -> List(129.0, 322.0, 126.0, 8.325200080871582))","Map(vectorType -> dense, length -> 4, values -> List(0.30623297630686513, 0.2843362208866199, 0.3295584480852433, 4.38209543579743))"
"Map(vectorType -> dense, length -> 4, values -> List(1106.0, 2401.0, 1138.0, 8.301400184631348))","Map(vectorType -> dense, length -> 4, values -> List(2.6255323394991694, 2.1201592122632746, 2.9764882057222772, 4.36956799913841))"
"Map(vectorType -> dense, length -> 4, values -> List(190.0, 496.0, 177.0, 7.257400035858154))","Map(vectorType -> dense, length -> 4, values -> List(0.451040817816313, 0.4379837439744208, 0.4629511532626037, 3.820042673324032))"
"Map(vectorType -> dense, length -> 4, values -> List(235.0, 558.0, 219.0, 5.643099784851074))","Map(vectorType -> dense, length -> 4, values -> List(0.557866274667545, 0.4927317119712234, 0.5728039692910182, 2.970331231769803))"
"Map(vectorType -> dense, length -> 4, values -> List(280.0, 565.0, 259.0, 3.8461999893188477))","Map(vectorType -> dense, length -> 4, values -> List(0.664691731518777, 0.4989129341644108, 0.6774256988418891, 2.024505748166202))"
"Map(vectorType -> dense, length -> 4, values -> List(213.0, 413.0, 193.0, 4.036799907684326))","Map(vectorType -> dense, length -> 4, values -> List(0.5056404957624983, 0.364692109398056, 0.5047998450829521, 2.124830908428931))"
"Map(vectorType -> dense, length -> 4, values -> List(489.0, 1094.0, 514.0, 3.65910005569458))","Map(vectorType -> dense, length -> 4, values -> List(1.1608366311167213, 0.9660367256210006, 1.344389224728691, 1.9260228580003875))"
"Map(vectorType -> dense, length -> 4, values -> List(687.0, 1157.0, 647.0, 3.119999885559082))","Map(vectorType -> dense, length -> 4, values -> List(1.6308686412621423, 1.021667725359687, 1.6922564754853369, 1.6422593001231023))"
"Map(vectorType -> dense, length -> 4, values -> List(665.0, 1206.0, 595.0, 2.080399990081787))","Map(vectorType -> dense, length -> 4, values -> List(1.5786428623570954, 1.0649362807119989, 1.5562482270692046, 1.0950501144251168))"
"Map(vectorType -> dense, length -> 4, values -> List(707.0, 1551.0, 714.0, 3.691200017929077))","Map(vectorType -> dense, length -> 4, values -> List(1.678346622084912, 1.3695822316619488, 1.8674978724830456, 1.9429191603871925))"
