In [17]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler 
from pyspark.ml.linalg import Vectors


# Obtain (or reuse) the SparkSession that drives every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark ML example")
    .getOrCreate()
)

# Read the CSV file; the first line is used as the header and column types
# are inferred from the data rather than all being read as strings.
csv_reader = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
)
ad_data = csv_reader.csv("adult5.csv")

# Register the frame as the temp view "adult", then read it back through the
# catalog so that `dataset` refers to the registered view.
ad_data.createOrReplaceTempView("adult")
dataset = spark.table("adult")

# Column names of the loaded dataset.
cols = dataset.columns

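Spark's StringIndexer can also be applied to a numeric column such as age: the values are cast to strings, and each distinct value is mapped to a numeric index ordered by frequency (the most frequent value gets index 0.0).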

In [20]:
# Index the "age" column: fit learns the distinct values, transform appends
# the resulting index as the new column "age_Index".
indexer_age = StringIndexer(inputCol="age", outputCol="age_Index")
age_index_model = indexer_age.fit(dataset)
indexed = age_index_model.transform(dataset)

# Show the original value next to its index for the first 20 rows.
age_preview = indexed.select("age", "age_Index")
age_preview.show()

+---+---------+
|age|age_Index|
+---+---------+
| 39|     23.0|
| 50|     17.0|
| 38|     26.0|
| 53|      4.0|
| 28|     14.0|
| 37|      5.0|
| 49|      2.0|
| 52|     28.0|
| 31|      3.0|
| 42|     25.0|
| 37|      5.0|
| 30|      0.0|
| 23|     10.0|
| 32|     12.0|
| 40|     11.0|
| 34|     15.0|
| 25|      7.0|
| 32|     12.0|
| 38|     26.0|
| 43|      6.0|
+---+---------+
only showing top 20 rows



StringIndexer converts a string column of labels into a column of label indices.
Example: convert the workclass column (_c1) into workclass_Index

In [21]:
# Index the categorical "workclass" column: fit learns the label set,
# transform appends the index as the new column "workclass_Index".
indexer_wc = StringIndexer(inputCol="workclass", outputCol="workclass_Index")
workclass_index_model = indexer_wc.fit(dataset)
indexed = workclass_index_model.transform(dataset)

# Show each label next to its index for the first 20 rows.
workclass_preview = indexed.select("workclass", "workclass_Index")
workclass_preview.show()

+-----------------+---------------+
|        workclass|workclass_Index|
+-----------------+---------------+
|        State-gov|            5.0|
| Self-emp-not-inc|            1.0|
|          Private|            0.0|
|          Private|            0.0|
|          Private|            0.0|
|          Private|            0.0|
|          Private|            0.0|
| Self-emp-not-inc|            1.0|
|          Private|            0.0|
|          Private|            0.0|
|          Private|            0.0|
|        State-gov|            5.0|
|          Private|            0.0|
|          Private|            0.0|
|          Private|            0.0|
|          Private|            0.0|
| Self-emp-not-inc|            1.0|
|          Private|            0.0|
|          Private|            0.0|
| Self-emp-not-inc|            1.0|
+-----------------+---------------+
only showing top 20 rows



Example: convert the income column (_c2) into income_Index

In [22]:
# Index the "income" label column: fit learns the label set, transform
# appends the index as the new column "income_Index".
indexer_income = StringIndexer(inputCol="income", outputCol="income_Index")
income_index_model = indexer_income.fit(dataset)
indexed = income_index_model.transform(dataset)

# Show each income label next to its index for the first 20 rows.
income_preview = indexed.select("income", "income_Index")
income_preview.show()

+------+------------+
|income|income_Index|
+------+------------+
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
|  >50K|         1.0|
|  >50K|         1.0|
|  >50K|         1.0|
|  >50K|         1.0|
|  >50K|         1.0|
| <=50K|         0.0|
| <=50K|         0.0|
|  >50K|         1.0|
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
| <=50K|         0.0|
|  >50K|         1.0|
+------+------------+
only showing top 20 rows

