In [1]:
%fs ls databricks-datasets/adult/adult.data

<h4>Understand Data</h4>
* age: continuous
* workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
* fnlwgt: continuous
* education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc...
* education-num: continuous
* marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent...
* occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners...
* relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried
* race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black
* sex: Female, Male
* capital-gain: continuous
* capital-loss: continuous
* hours-per-week: continuous
* native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany...
* Target/Label: <=50K, >50K

<h3>Load data</h3>

In [4]:
import pyspark.sql.types as typ
# (column name, Spark type) pairs, listed in the order the columns are
# declared in the schema below.
labels = [
    ("age", typ.DoubleType()),
    ("workclass", typ.StringType()),
    ("fnlwgt", typ.DoubleType()),
    ("education", typ.StringType()),
    ("education-num", typ.DoubleType()),
    ("marital-status", typ.StringType()),
    ("occupation", typ.StringType()),
    ("relationship", typ.StringType()),
    ("race", typ.StringType()),
    ("sex", typ.StringType()),
    ("capital-gain", typ.DoubleType()),
    ("capital-loss", typ.DoubleType()),
    ("hours-per-week", typ.DoubleType()),
    ("native-country", typ.StringType()),
    ("income", typ.StringType()),
]

# One StructField per pair; every field is declared non-nullable.
schema = typ.StructType(
    [
        typ.StructField(field_name, field_type, nullable=False)
        for field_name, field_type in labels
    ]
)


In [5]:
# Load the headerless CSV using the explicit schema defined above
# instead of letting Spark infer column names and types.
data = spark.read.csv(
    'databricks-datasets/adult/adult.data',
    schema=schema,
    header=False,
)

In [6]:
# Show the freshly loaded DataFrame in the cell output as a quick check
# that the file was parsed against the schema as expected.
display(data)

<h3>Preprocess Data</h3>
* Convert categorical data into numerical values using StringIndexer and OneHotEncoder.
* Simplification step: because there is more than one stage of feature transformation, we use a Pipeline to tie the stages together.

In [8]:
###One-Hot Encoding
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Pipeline stages are collected here; nothing executes until the
# Pipeline is fitted later on.
stages = []
for categoricalCol in categoricalColumns:
  indexCol = categoricalCol + "Index"
  vecCol = categoricalCol + "classVec"
  # Step 1: map each category string to a numeric index.
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=indexCol)
  # Step 2: one-hot encode that index into a binary vector column.
  encoder = OneHotEncoder(inputCol=indexCol, outputCol=vecCol)
  # Queue both steps, indexer first so the encoder finds its input column.
  stages.extend([stringIndexer, encoder])

In [9]:
# Index the "income" strings into a numeric "label" column, and queue
# that indexer as another pipeline stage.
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages.append(label_stringIdx)

In [10]:
# Transform all features into a single vector column using VectorAssembler.
numericCols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]

# Inputs are the one-hot vector columns (one per categorical column) followed
# by the raw numeric columns.
# A list comprehension is used instead of map(...): on Python 3 map() returns
# an iterator, so `map(...) + numericCols` raises TypeError. The comprehension
# yields a real list on both Python 2 and Python 3.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols

assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [11]:
# Bundle every queued stage into a single Pipeline.
pipeline = Pipeline(stages=stages)

# fit() computes whatever statistics the stages need from the data;
# the resulting model's transform() then applies the feature transformations.
pipelineModel = pipeline.fit(data)

# Keep the two model columns plus every original input column.
selectedcols = ["label", "features"] + data.columns
dataset = pipelineModel.transform(data).select(selectedcols)
display(dataset)

In [12]:
# Randomly split the data 70/30 into training and test sets; the seed is
# fixed for reproducibility.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

# Report the size of each split. print() is called as a function with a
# single argument: the Python 2 `print x` statement is a SyntaxError on
# Python 3, while this form prints the same value on both.
print(trainingData.count())
print(testData.count())

<h3>Fit and Evaluate Models</h3>
* Logistic Regression

In [14]:
from pyspark.ml.classification import LogisticRegression

# Configure the logistic regression estimator: which columns hold the
# features and the label, and an iteration limit of 100.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
)

# Fit on the training split only; the test split is held back for evaluation.
lrModel = lr.fit(trainingData)

In [15]:
# Score the held-out test set with the fitted model via transform().
# The estimator was configured with featuresCol="features", so that column is
# the model input; the other columns of testData are carried through, and the
# model's output columns (e.g. prediction, probability, rawPrediction) are
# what the following cells read from the result.
predictions = lrModel.transform(testData)

In [16]:
# Inspect the true label next to the predicted class and class probabilities.
# Any column of `predictions` could be shown as well; age and occupation are
# included purely as examples.
selected = predictions.select(
    "label",
    "prediction",
    "probability",
    "age",
    "occupation",
)
display(selected)

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test-set predictions with a binary classification evaluator that
# reads the model's raw prediction column. No metricName is passed, so the
# evaluator's default metric is reported (areaUnderROC per the Spark ML docs;
# confirm for the Spark version in use).
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
)

# Left as the final expression so the metric value appears as the cell output.
evaluator.evaluate(predictions)