In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.Builder().appName('adult').getOrCreate()

[Dataset](http://archive.ics.uci.edu/ml/datasets/Adult)


In [3]:
from pyspark.sql.types import StructType, StructField, DecimalType, StringType, IntegerType

In [4]:
data_schema = StructType([
    StructField('age', IntegerType()),
    StructField('workclass', StringType()),
    StructField('fnlwgt', IntegerType()),
    StructField('education', StringType()),
    StructField('education-num', IntegerType()),
    StructField('marital-status', StringType()),
    StructField('occupation', StringType()),
    StructField('relationship', StringType()),
    StructField('race', StringType()),
    StructField('sex', StringType()),
    StructField('capital-gain', IntegerType()),
    StructField('capital-loss', IntegerType()),
    StructField('hours-per-week', IntegerType()),
    StructField('native-country', StringType()),
    StructField('labels', StringType())
])

In [5]:
df = spark.read.load('adult.data', format='csv', sep=', ', header=False, schema=data_schema)

In [6]:
data = df.replace('?', None) # replace all strings '?' with null value

In [7]:
# data = data.na.drop() #remove all lines containing at least one null value
data = data.na.fill('unknown')

In [8]:
print(df.count())
print(data.count())

32561
32561


In [9]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [10]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = false)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = false)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = false)
 |-- occupation: string (nullable = false)
 |-- relationship: string (nullable = false)
 |-- race: string (nullable = false)
 |-- sex: string (nullable = false)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = false)
 |-- labels: string (nullable = false)



In [11]:
workclass_indexer = StringIndexer(inputCol='workclass', outputCol='workclass_index')
workclass_encoder = OneHotEncoder(inputCol='workclass_index', outputCol='workclass_encoded')

education_indexer = StringIndexer(inputCol='education', outputCol='education_index')
education_encoder = OneHotEncoder(inputCol='education_index', outputCol='education_encoded')

marital_status_indexer = StringIndexer(inputCol='marital-status', outputCol='marital-status_index')
marital_status_encoder = OneHotEncoder(inputCol='marital-status_index', outputCol='marital-status_encoded')

occupation_indexer = StringIndexer(inputCol='occupation', outputCol='occupation_index')
occupation_encoder = OneHotEncoder(inputCol='occupation_index', outputCol='occupation_encoded')

relationship_indexer = StringIndexer(inputCol='relationship', outputCol='relationship_index')
relationship_encoder = OneHotEncoder(inputCol='relationship_index', outputCol='relationship_encoded')

race_indexer = StringIndexer(inputCol='race', outputCol='race_index')
race_encoder = OneHotEncoder(inputCol='race_index', outputCol='race_encoded')

sex_indexer = StringIndexer(inputCol='sex', outputCol='sex_index') #only 2  possibilities

native_country_indexer = StringIndexer(inputCol='native-country', outputCol='native-country_index')
native_country_encoder = OneHotEncoder(inputCol='native-country_index', outputCol='native-country_encoded')

labels_indexer = StringIndexer(inputCol='labels', outputCol='labels_index') # only 2 possibilities

In [12]:
from pyspark.ml.feature import VectorAssembler

In [13]:
assembler = VectorAssembler(inputCols=['age',
 'workclass_encoded',
 'fnlwgt',
 'education_encoded',
 'education-num',
 'marital-status_encoded',
 'occupation_index',
 'relationship_encoded',
 'race_encoded',
 'sex_index',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country_encoded',
 'labels_index'], outputCol='features')

In [14]:
from pyspark.ml import Pipeline

In [15]:
pipeline = Pipeline(stages=[workclass_indexer,
                            workclass_encoder,
                            education_indexer,
                            education_encoder,
                            marital_status_indexer,
                            marital_status_encoder,
                            occupation_indexer,
                            occupation_encoder,
                            relationship_indexer,
                            relationship_encoder,
                            race_indexer,
                            race_encoder,
                            sex_indexer,
                            native_country_indexer,
                            native_country_encoder,
                            labels_indexer,
                            assembler
                            ])

In [82]:
prepared_data_pipeline = pipeline.fit(data)

In [83]:
prepared_data = prepared_data_pipeline.transform(data)

In [84]:
prepared_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = false)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = false)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = false)
 |-- occupation: string (nullable = false)
 |-- relationship: string (nullable = false)
 |-- race: string (nullable = false)
 |-- sex: string (nullable = false)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = false)
 |-- labels: string (nullable = false)
 |-- workclass_index: double (nullable = false)
 |-- workclass_encoded: vector (nullable = true)
 |-- education_index: double (nullable = false)
 |-- education_encoded: vector (nullable = true)
 |-- marital-status_index: double (nullable = false)
 |-- marital-status_encoded: vector (nullable = true)
 |-- occupation_index: double (nullable = false)
 |-- occu

In [86]:
prepared_data = prepared_data.select(['features', 'labels_index'])

In [87]:
prepared_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- labels_index: double (nullable = false)



In [20]:
train_data, test_data = prepared_data.randomSplit([0.7, 0.3])

In [21]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

In [22]:
rf_classifier = RandomForestClassifier(featuresCol='features', labelCol='labels_index', numTrees=15)
# dc_classifier = DecisionTreeClassifier(featuresCol='features', labelCol='labels_index')

In [23]:
model = rf_classifier.fit(train_data)
# model = dc_classifier.fit(train_data)

In [24]:
prediction = model.transform(test_data)

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [26]:
mc_evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='labels_index',
    metricName="accuracy")

In [27]:
results = mc_evaluator.evaluate(prediction)

In [28]:
results

0.9822874493927125

In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [30]:
bc_evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='labels_index', metricName='areaUnderROC')

In [31]:
bc_results = bc_evaluator.evaluate(prediction)

In [32]:
print(bc_results)

0.9999884876230039


In [88]:
df.show(3)

+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+----+------------+------------+--------------+--------------+------+
|age|       workclass|fnlwgt|education|education-num|    marital-status|       occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|native-country|labels|
+---+----------------+------+---------+-------------+------------------+-----------------+-------------+-----+----+------------+------------+--------------+--------------+------+
| 39|       State-gov| 77516|Bachelors|           13|     Never-married|     Adm-clerical|Not-in-family|White|Male|        2174|           0|            40| United-States| <=50K|
| 50|Self-emp-not-inc| 83311|Bachelors|           13|Married-civ-spouse|  Exec-managerial|      Husband|White|Male|           0|           0|            13| United-States| <=50K|
| 38|         Private|215646|  HS-grad|            9|          Divorced|Handlers-cleaners|Not-in-family|W

In [78]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- labels: string (nullable = true)



In [80]:
prepared_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- labels_index: double (nullable = false)

