In [6]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("mlib") \
    .getOrCreate()

1. Data loading

In [8]:
#Read csv file to dataframe
#=====your code here==========
DATA_PATH = "gs://mk4427hw2/adult.csv"

data = spark.read.csv(DATA_PATH,inferSchema=True, header=False)
print(data.count(),len(data.columns))
#===============================
data.show(3)

32561 15
+---+-----------------+--------+----------+----+-------------------+------------------+--------------+------+-----+------+----+----+--------------+------+
|_c0|              _c1|     _c2|       _c3| _c4|                _c5|               _c6|           _c7|   _c8|  _c9|  _c10|_c11|_c12|          _c13|  _c14|
+---+-----------------+--------+----------+----+-------------------+------------------+--------------+------+-----+------+----+----+--------------+------+
| 39|        State-gov| 77516.0| Bachelors|13.0|      Never-married|      Adm-clerical| Not-in-family| White| Male|2174.0| 0.0|40.0| United-States| <=50K|
| 50| Self-emp-not-inc| 83311.0| Bachelors|13.0| Married-civ-spouse|   Exec-managerial|       Husband| White| Male|   0.0| 0.0|13.0| United-States| <=50K|
| 38|          Private|215646.0|   HS-grad| 9.0|           Divorced| Handlers-cleaners| Not-in-family| White| Male|   0.0| 0.0|40.0| United-States| <=50K|
+---+-----------------+--------+----------+----+-------------

In [9]:
from functools import reduce

In [10]:
#change the column names of dataframe
df = data.withColumnRenamed('_c0','age').withColumnRenamed('_c1','workclass').withColumnRenamed('_c2','fnlwgt')\
.withColumnRenamed('_c3', 'education').withColumnRenamed('_c4', 'education_num')\
.withColumnRenamed('_c5','marital_status').withColumnRenamed('_c6', 'occupation').withColumnRenamed('_c7', 'relationship')\
.withColumnRenamed('_c8', 'race').withColumnRenamed('_c9', 'sex').withColumnRenamed('_c10', 'capital_gain')\
.withColumnRenamed('_c11', 'capital_loss').withColumnRenamed('_c12','hours_per_week')\
.withColumnRenamed('_c13', 'native_country').withColumnRenamed('_c14', 'income')

df.printSchema()
df.show(2)

dataset = df

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: double (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)

+---+-----------------+-------+----------+-------------+-------------------+----------------+--------------+------+-----+------------+------------+--------------+--------------+------+
|age|        workclass| fnlwgt| education|education_num|     marital_status|      occupation|  relationship|  race|  sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+-

2. Data preprocessing

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [12]:
#stages in our Pipeline
stages = []
categoricalColumns = ["workclass","education","marital_status","occupation","relationship","race","sex","native_country"]

In [13]:
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

In [14]:
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

In [15]:
# Transform all features into a vector using VectorAssembler
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [16]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)

In [17]:
preppedDataDF.take(3)

[Row(age=39, workclass=' State-gov', fnlwgt=77516.0, education=' Bachelors', education_num=13.0, marital_status=' Never-married', occupation=' Adm-clerical', relationship=' Not-in-family', race=' White', sex=' Male', capital_gain=2174.0, capital_loss=0.0, hours_per_week=40.0, native_country=' United-States', income=' <=50K', workclassIndex=4.0, workclassclassVec=SparseVector(8, {4: 1.0}), educationIndex=2.0, educationclassVec=SparseVector(15, {2: 1.0}), marital_statusIndex=1.0, marital_statusclassVec=SparseVector(6, {1: 1.0}), occupationIndex=3.0, occupationclassVec=SparseVector(14, {3: 1.0}), relationshipIndex=1.0, relationshipclassVec=SparseVector(5, {1: 1.0}), raceIndex=0.0, raceclassVec=SparseVector(4, {0: 1.0}), sexIndex=0.0, sexclassVec=SparseVector(1, {0: 1.0}), native_countryIndex=0.0, native_countryclassVec=SparseVector(41, {0: 1.0}), label=0.0, features=SparseVector(100, {4: 1.0, 10: 1.0, 24: 1.0, 32: 1.0, 44: 1.0, 48: 1.0, 52: 1.0, 53: 1.0, 94: 39.0, 95: 77516.0, 96: 13.0, 9

In [18]:
# Keep relevant columns
cols = dataset.columns
selectedcols = ["label", "features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)

DataFrame[label: double, features: vector, age: int, workclass: string, fnlwgt: double, education: string, education_num: double, marital_status: string, occupation: string, relationship: string, race: string, sex: string, capital_gain: double, capital_loss: double, hours_per_week: double, native_country: string, income: string]

In [19]:
### Randomly split data into training and test sets. set seed for reproducibility
#=====your code here==========
train_split = 0.70
test_split  = 0.30
seed_val    = 100
trainingData, testData = dataset.randomSplit([train_split,
                                test_split],
                               seed = seed_val)
#===============================
print(trainingData.count())
print(testData.count())

22832
9729


3. Modeling

In [20]:
# Fit model to prepped data

#LogisticRegression model, maxIter=10
#=====your code here==========
global_accuracy = {}
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(trainingData)


#===============================


# select example rows to display.
predictions = lrModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Logistic Regression model"] = accuracy

+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|         probability|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[-1.6357510924782...|[0.163044

In [21]:
#Random Forest
from pyspark.ml.classification import RandomForestClassifier

rf =   RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(trainingData)

# select example rows to display.
predictions = rfModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Random Forest model"] = accuracy


+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|         probability|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[10.3314778610465...|[0.516573

In [22]:
#NaiveBayes
#=====your code here==========
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(featuresCol = 'features', labelCol = 'label', modelType="multinomial")
nbModel = nb.fit(trainingData)



#===============================


# select example rows to display.
predictions = nbModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Naive Bayes model"] = accuracy

+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+-----------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|probability|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+-----------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[-21080.497829550...|  [1.0,0.0]|       0.0|
|  0.0|(100,

In [23]:
#Decision Tree

from pyspark.ml.classification import DecisionTreeClassifier

dt =   DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label')
dtModel = dt.fit(trainingData)

# select example rows to display.
predictions = rfModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Decision Tree"] = accuracy


+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|         probability|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[10.3314778610465...|[0.516573

In [24]:
#Gradient Boosting Trees

from pyspark.ml.classification import GBTClassifier

gc = GBTClassifier(featuresCol = 'features', labelCol = 'label', maxIter=10)
gcModel = gc.fit(trainingData)

# select example rows to display.
predictions = gcModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Gradient Boosting Trees"] = accuracy

+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|         probability|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[-0.5281553908031...|[0.258015

In [25]:
# Multi-layer Perceptron

from pyspark.ml.classification import MultilayerPerceptronClassifier
mlp = MultilayerPerceptronClassifier(featuresCol = 'features', labelCol = 'label', maxIter=10, layers=[100,64,2])
mlpModel = mlp.fit(trainingData)

# select example rows to display.
predictions = mlpModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Multi-layer Perceptron"] = accuracy


+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|         probability|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[0.44171721659030...|[0.750405

In [26]:
# Linear Support Vector Machine
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1, featuresCol = 'features', labelCol = 'label')
lsvcModel = lsvc.fit(trainingData)

# select example rows to display.
predictions = lsvcModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["Linear Support Vector Machine"] = accuracy


+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[0.50607114630645...|       0.0|
|  0.0|(100,[0,8,23,29,4...| 32|  Private|131224.0|  HS-grad

In [27]:
# One-vs-Rest

from pyspark.ml.classification import LogisticRegression, OneVsRest

lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

predictions = ovrModel.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
global_accuracy["One-vs-Rest Model"] = accuracy



+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+----------+
|label|            features|age|workclass|  fnlwgt|education|education_num|     marital_status|     occupation|relationship|  race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|       rawPrediction|prediction|
+-----+--------------------+---+---------+--------+---------+-------------+-------------------+---------------+------------+------+-------+------------+------------+--------------+--------------+------+--------------------+----------+
|  0.0|(100,[0,8,23,29,4...| 36|  Private|370767.0|  HS-grad|          9.0| Married-civ-spouse| Prof-specialty|     Husband| White|   Male|         0.0|      2377.0|          60.0| United-States| <=50K|[-1.6357510924782...|       1.0|
|  0.0|(100,[0,8,23,29,4...| 32|  Private|131224.0|  HS-grad

4. Comparison and analysis

In [37]:
# Rank models according to Test set accuracy
#=====your code here==========
for i in (sorted(global_accuracy.items(), key =
             lambda kv:(kv[1], kv[0]),reverse=True)):
  print(i)  
#===============================

('Gradient Boosting Trees', 0.8467468393462843)
('One-vs-Rest Model', 0.8446911296124987)
('Logistic Regression model', 0.8446911296124987)
('Random Forest model', 0.828245451742214)
('Decision Tree', 0.828245451742214)
('Naive Bayes model', 0.7796279165381849)
('Linear Support Vector Machine', 0.7692465823825676)
('Multi-layer Perceptron', 0.7570151094665434)


*your analysis*

### My Analysis

In my result Gradient Boosting Trees have performed the best with an accuracy of 0.8467.


Next best model was the One vs Rest model with an accuracy of 0.8447 which I used with a logistic regression classifier.


After this the next model was the logistic regression model which obtained an accuracy of 0.8447. I observe that a One vs Rest model used with a logistic regression classifier had a better accuracy than a model only using logistic regression classifier.


The next model that performed good was a Random Forest Model  which obtained an accuracy of 0.8282 which I had used with default parameters.


The next model that performed good was the Decision Tree model which performed almost same as the random forest model maybe due to the default parameters being used.

The next model in the list was the linear support vector machine which I used with regParam=0.1 on the data. It did not perform very well since there are a lot of features to be separated by a linear boundary.

The worst performing model was the Multi-layer Perceptron which only obtained an accuracy of 0.7570 which can be attributed to the 3 layers which did not learn well.