# Predicting US Census Income Category with Apache Spark

## CLASSIFICATION

Write the description here.

In [1]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
from pyspark.sql.functions import round, col
import pyspark.sql.functions as F

from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import Bucketizer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sns.set(context='notebook', style='whitegrid')

In [4]:
from pylab import rcParams
rcParams['figure.figsize'] = 20, 4

In [5]:
# setting random seed for notebook replicability
rnd_seed=42
np.random.seed=42

## 1. Understanding the Data Set

## 2. Creating the Spark Session

In [6]:
os.environ['SPARK_HOME']

'/Users/anindyas/work/spark-2.2.0-bin-hadoop2.6'

In [7]:
spark = SparkSession.builder.master("local[2]").appName("predict-us-census-income").getOrCreate()
spark

In [8]:
sc = spark.sparkContext
sc

In [9]:
sqlContext = SQLContext(spark.sparkContext)
sqlContext

<pyspark.sql.context.SQLContext at 0x10f19e940>

## 3. Load The Data From a File Into a Dataframe

In [10]:
ADULT_TRAIN_DATA = 'data/adult-training.csv'
ADULT_TEST_DATA = 'data/adult-test.csv'

In [11]:
# define the schema, corresponding to a line in the csv data file.
schema = StructType([
    StructField("Age", IntegerType(), nullable=True),
    StructField("Workclass", StringType(), nullable=True),
    StructField("fnlgwt", DoubleType(), nullable=True),
    StructField("Education", StringType(), nullable=True),
    StructField("Education Num", DoubleType(), nullable=True),
    StructField("Marital Status", StringType(), nullable=True),
    StructField("Occupation", StringType(), nullable=True),
    StructField("Relationship", StringType(), nullable=True),
    StructField("Race", StringType(), nullable=True),
    StructField("Sex", StringType(), nullable=True),
    StructField("Capital Gain", DoubleType(), nullable=True),
    StructField("Capital Loss", DoubleType(), nullable=True),
    StructField("Hours/Week", DoubleType(), nullable=True),
    StructField("Native Country", StringType(), nullable=True),
    StructField("Income", StringType(), nullable=True)]
)

In [12]:
# Load training data
adult_train_df = (spark
                  .read
                  .csv(path=ADULT_TRAIN_DATA, schema=schema, ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True))
adult_train_df = adult_train_df.withColumn('data_set', F.lit('train')).cache()

In [13]:
adult_train_df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Workclass: string (nullable = true)
 |-- fnlgwt: double (nullable = true)
 |-- Education: string (nullable = true)
 |-- Education Num: double (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Capital Gain: double (nullable = true)
 |-- Capital Loss: double (nullable = true)
 |-- Hours/Week: double (nullable = true)
 |-- Native Country: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- data_set: string (nullable = false)



In [14]:
# Load testing data
adult_test_df = (spark
                  .read
                  .csv(path=ADULT_TEST_DATA, schema=schema, ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True))
adult_test_df = adult_test_df.withColumn('data_set', F.lit('test')).cache()

In [15]:
# sample the training data
adult_train_df.sample(withReplacement=False, fraction=0.01, seed=rnd_seed).toPandas().head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,49,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K,train
1,44,Private,198282.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024.0,0.0,60.0,United-States,>50K,train
2,53,Private,95647.0,9th,5.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,train
3,24,Private,388093.0,Bachelors,13.0,Never-married,Exec-managerial,Not-in-family,Black,Male,0.0,0.0,40.0,United-States,<=50K,train
4,20,?,214635.0,Some-college,10.0,Never-married,?,Own-child,White,Male,0.0,0.0,24.0,United-States,<=50K,train


In [16]:
# sample the test data
adult_test_df.sample(withReplacement=False, fraction=0.01, seed=rnd_seed).toPandas().head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,29,?,227026.0,HS-grad,9.0,Never-married,?,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K.,test
1,33,Private,202191.0,Some-college,10.0,Never-married,Adm-clerical,Unmarried,Black,Female,0.0,0.0,35.0,United-States,<=50K.,test
2,26,Private,206721.0,HS-grad,9.0,Never-married,Handlers-cleaners,Unmarried,White,Male,0.0,0.0,40.0,United-States,<=50K.,test
3,38,?,48976.0,HS-grad,9.0,Married-civ-spouse,?,Wife,White,Female,0.0,1887.0,10.0,United-States,>50K.,test
4,31,Private,213339.0,HS-grad,9.0,Separated,Tech-support,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K.,test


In [17]:
adult_union_df = adult_train_df.union(adult_test_df)

In [18]:
adult_union_df.count(), adult_union_df.filter(col('data_set') == 'train').count(), adult_union_df.filter(col('data_set') == 'test').count()

(48842, 32561, 16281)

In [19]:
# sample the test data
adult_union_df.stat.sampleBy('data_set', fractions={'test':0.7, 'train':0.3}).sample(withReplacement=False, fraction=0.0006, seed=rnd_seed).toPandas()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,29,Private,95465.0,HS-grad,9.0,Married-civ-spouse,Other-service,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K,train
1,28,Self-emp-not-inc,240172.0,Masters,14.0,Never-married,Prof-specialty,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50K,train
2,33,Private,183557.0,Assoc-acdm,12.0,Never-married,Prof-specialty,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K,train
3,48,Private,107373.0,10th,6.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,40.0,United-States,>50K,train
4,53,Private,123092.0,HS-grad,9.0,Widowed,Machine-op-inspct,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K,train
5,17,Private,401198.0,11th,7.0,Never-married,Sales,Own-child,White,Female,0.0,0.0,15.0,United-States,<=50K,train
6,18,?,157131.0,HS-grad,9.0,Never-married,?,Own-child,White,Female,0.0,0.0,12.0,United-States,<=50K,train
7,19,Private,106183.0,HS-grad,9.0,Never-married,Other-service,Own-child,Amer-Indian-Eskimo,Female,0.0,0.0,35.0,United-States,<=50K.,test
8,30,Private,117584.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,Black,Female,0.0,0.0,20.0,United-States,<=50K.,test
9,51,Private,165953.0,HS-grad,9.0,Separated,Handlers-cleaners,Not-in-family,Black,Male,0.0,0.0,45.0,United-States,<=50K.,test


### Treating Missing Values:

In [20]:
adult_union_df.select([F.count(F.when(col(c).contains('?'), c)).alias(c) for c in adult_union_df.columns]).toPandas()
#adult_union_df.select([F.count(F.when(F.isnan(c) | col(c).isNull(), c)).alias(c) for c in adult_union_df.columns]).toPandas()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,0,2799,0,0,0,0,2809,0,0,0,0,0,0,857,0,0


In [21]:
# There are significant missing values and we need to come up with a smart strategy for that, skipping the records for now
adult_union_df = (adult_union_df
                  .filter(~col('Workclass').contains('?') 
                          & ~col('Occupation').contains('?')
                          & ~col('Native Country').contains('?'))
                  .cache())

In [22]:
adult_union_df.count(), adult_union_df.filter(col('data_set') == 'train').count(), adult_union_df.filter(col('data_set') == 'test').count()

(45222, 30162, 15060)

In [23]:
(adult_union_df
 .filter(col('Workclass').contains('?'))
 .filter(col('Occupation').contains('?'))
 .filter(col('Native Country').contains('?'))
).count()

0

### Devise a better Strategy for Missing Values:

In [24]:
# Imputer Strategy, replace by the maximum value
#adult_union_df.describe('Occupation').filter(col('summary') == 'max').select('Occupation').show()

In [25]:
#adult_union_df.describe('Workclass').filter(col('summary') == 'max').select('Workclass').show()

In [26]:
#adult_union_df.describe('Native Country').filter(col('summary') == 'max').select('Native Country').show()

In [27]:
#adult_union_df.select([F.when(col(c).contains('?'), c).alias(c) for c in adult_union_df.columns]).toPandas()

In [28]:
#adult_union_df = adult_union_df.replace(to_replace=["?"], value=["Without-pay"], subset=["Workclass"])
#adult_union_df = adult_union_df.replace(to_replace=["?"], value=["Transport-moving"], subset=["Occupation"])
#adult_union_df = adult_union_df.replace(to_replace=["?"], value=["Yugoslavia"], subset=["Native Country"])

In [29]:
# There is a typo in the test set the values are '<=50K.' & '>50K.' instead of '<=50K' & '>50K'
adult_union_df = adult_union_df.replace(to_replace='<=50K.', value='<=50K')
adult_union_df = adult_union_df.replace(to_replace='>50K.', value='>50K')

In [30]:
# sample the test data
adult_union_df.stat.sampleBy('data_set', fractions={'test':0.7, 'train':0.3}).sample(withReplacement=False, fraction=0.0006, seed=rnd_seed).toPandas()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native Country,Income,data_set
0,38,Private,296125.0,HS-grad,9.0,Separated,Priv-house-serv,Unmarried,Black,Female,0.0,0.0,30.0,United-States,<=50K,train
1,30,Private,175856.0,Masters,14.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,38.0,United-States,<=50K,train
2,26,Private,485117.0,Bachelors,13.0,Never-married,Transport-moving,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K,train
3,21,Private,197918.0,HS-grad,9.0,Never-married,Transport-moving,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50K,train
4,28,Private,163772.0,HS-grad,9.0,Married-civ-spouse,Other-service,Husband,Other,Male,0.0,0.0,40.0,United-States,<=50K,train
5,48,Private,131309.0,Masters,14.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,45.0,United-States,>50K,train
6,30,Private,236543.0,9th,5.0,Married-civ-spouse,Other-service,Husband,White,Male,0.0,0.0,32.0,El-Salvador,>50K,test
7,29,Private,199411.0,Assoc-voc,11.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,40.0,United-States,>50K,test
8,36,Private,52532.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K,test
9,78,Private,163140.0,HS-grad,9.0,Widowed,Other-service,Not-in-family,White,Female,0.0,0.0,12.0,United-States,<=50K,test


In [31]:
income_indexer = StringIndexer(inputCol='Income', outputCol='Label').fit(adult_union_df)

In [32]:
income_indexer.labels

['<=50K', '>50K']

In [33]:
adult_union_df = income_indexer.transform(adult_union_df)

In [34]:
# check the income level and index mapping 
adult_union_df.select(["Income", "Label"]).distinct().show()

+------+-----+
|Income|Label|
+------+-----+
| <=50K|  0.0|
|  >50K|  1.0|
+------+-----+



In [35]:
# check the Education and Education Num 
adult_union_df.select(["Education", "Education Num"]).distinct().show()

+------------+-------------+
|   Education|Education Num|
+------------+-------------+
|   Preschool|          1.0|
|         9th|          5.0|
|   Assoc-voc|         11.0|
|   Bachelors|         13.0|
|     1st-4th|          2.0|
|     7th-8th|          4.0|
|        12th|          8.0|
|     5th-6th|          3.0|
|   Doctorate|         16.0|
| Prof-school|         15.0|
|  Assoc-acdm|         12.0|
|     Masters|         14.0|
|        11th|          7.0|
|     HS-grad|          9.0|
|Some-college|         10.0|
|        10th|          6.0|
+------------+-------------+



There is a one-to-one mapping between Education and Education Num. We will drop Education.

In [36]:
adult_union_df = adult_union_df.drop('Education')

In [37]:
adult_union_df.select(["Relationship"]).distinct().show()

+--------------+
|  Relationship|
+--------------+
|     Own-child|
| Not-in-family|
|     Unmarried|
|          Wife|
|Other-relative|
|       Husband|
+--------------+



In [38]:
adult_union_df.select(["Marital Status"]).distinct().show(truncate=False)

+---------------------+
|Marital Status       |
+---------------------+
|Separated            |
|Never-married        |
|Married-spouse-absent|
|Divorced             |
|Widowed              |
|Married-AF-spouse    |
|Married-civ-spouse   |
+---------------------+



In [39]:
adult_union_df.select(["Race"]).distinct().show()

+------------------+
|              Race|
+------------------+
|             Other|
|Amer-Indian-Eskimo|
|             White|
|Asian-Pac-Islander|
|             Black|
+------------------+



In [40]:
adult_union_df.select(["Sex"]).distinct().show()

+------+
|   Sex|
+------+
|Female|
|  Male|
+------+



In [41]:
adult_union_df.select(["Native Country"]).distinct().show()

+------------------+
|    Native Country|
+------------------+
|       Philippines|
|           Germany|
|          Cambodia|
|            France|
|            Greece|
|            Taiwan|
|           Ecuador|
|         Nicaragua|
|              Hong|
|              Peru|
|             India|
|             China|
|             Italy|
|Holand-Netherlands|
|              Cuba|
|             South|
|              Iran|
|           Ireland|
|          Thailand|
|              Laos|
+------------------+
only showing top 20 rows



In [43]:
adult_union_df.select(["Workclass"]).distinct().show()

+----------------+
|       Workclass|
+----------------+
|Self-emp-not-inc|
|       Local-gov|
|       State-gov|
|         Private|
|     Without-pay|
|     Federal-gov|
|    Self-emp-inc|
+----------------+



In [44]:
colName = "Workclass"

In [45]:
workclass_indexer = StringIndexer(inputCol=colName, outputCol="{0}_indexed".format(colName)).fit(adult_union_df)
workclass_indexed_df = workclass_indexer.transform(adult_union_df)

In [51]:
workclass_indexed_df.select('Workclass_indexed').limit(5).show()

+-----------------+
|Workclass_indexed|
+-----------------+
|              3.0|
|              1.0|
|              0.0|
|              0.0|
|              0.0|
+-----------------+



In [55]:
workclass_encoder = OneHotEncoder(inputCol="{0}_indexed".format(colName), outputCol="{0}_encoded".format(colName), dropLast=False)
workclass_encoded_df = workclass_encoder.transform(workclass_indexed_df)

In [56]:
workclass_encoded_df.select('Workclass_encoded').limit(5).show()

+-----------------+
|Workclass_encoded|
+-----------------+
|    (7,[3],[1.0])|
|    (7,[1],[1.0])|
|    (7,[0],[1.0])|
|    (7,[0],[1.0])|
|    (7,[0],[1.0])|
+-----------------+



In [168]:
# categorical columns
categorical_columns = ["Workclass", "Marital Status", "Occupation", "Relationship", "Race", "Sex", "Native Country"]
numerical_columns = ["Age", "Education Num", "Capital Gain", "Capital Loss", "Hours/Week"]

In [169]:
# String Indexers will encode string categorical columns into a column of numeric indices
string_indexers = [StringIndexer(inputCol=colName, outputCol="{0}_indexed".format(colName)).fit(adult_union_df) for colName in categorical_columns]

In [170]:
# OneHotEncoders map number indices column to column of binary vectors
onehot_encoders = [OneHotEncoder(inputCol="{0}_indexed".format(colName), outputCol="{0}_encoded".format(colName), dropLast=False) for colName in categorical_columns]

In [129]:
standard_scalers = [StandardScaler(inputCol=colName, outputCol="{0}_scaled".format(colName)) for colName in numerical_columns]

In [130]:
# Label the incomes into 0 and 1 based on whether <50K and >=50K
#income_labeler = StringIndexer(inputCol='Income', outputCol='Label').fit(adult_union_df)

In [131]:
["{0}_encoded".format(col) for col in categorical_columns] + ["{0}_scaled".format(col) for col in numerical_columns]

['Workclass_encoded',
 'Marital Status_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Sex_encoded',
 'Native Country_encoded',
 'Age_scaled',
 'Education Num_scaled',
 'Capital Gain_scaled',
 'Capital Loss_scaled',
 'Hours/Week_scaled']

In [132]:
#adult_train_df = adult_union_df.filter(col('data_set') == 'train')

In [133]:
#string_indexers

In [171]:
assembled_train_df = adult_union_df.filter(col('data_set') == 'train')

In [172]:
for string_indexer in (string_indexers):
    assembled_train_df = string_indexer.transform(assembled_train_df)

In [173]:
assembled_train_df.columns

['Age',
 'Workclass',
 'fnlgwt',
 'Education Num',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Capital Gain',
 'Capital Loss',
 'Hours/Week',
 'Native Country',
 'Income',
 'data_set',
 'Label',
 'Workclass_indexed',
 'Marital Status_indexed',
 'Occupation_indexed',
 'Relationship_indexed',
 'Race_indexed',
 'Sex_indexed',
 'Native Country_indexed']

In [174]:
for onehot_encoder in (onehot_encoders):
    assembled_train_df = onehot_encoder.transform(assembled_train_df)

In [175]:
assembled_train_df.columns

['Age',
 'Workclass',
 'fnlgwt',
 'Education Num',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Capital Gain',
 'Capital Loss',
 'Hours/Week',
 'Native Country',
 'Income',
 'data_set',
 'Label',
 'Workclass_indexed',
 'Marital Status_indexed',
 'Occupation_indexed',
 'Relationship_indexed',
 'Race_indexed',
 'Sex_indexed',
 'Native Country_indexed',
 'Workclass_encoded',
 'Marital Status_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Sex_encoded',
 'Native Country_encoded']

In [176]:
#for standard_scaler in (standard_scalers):
#    assembled_train_df = standard_scaler.fit(assembled_train_df)

In [177]:
assembled_train_df.columns

['Age',
 'Workclass',
 'fnlgwt',
 'Education Num',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Capital Gain',
 'Capital Loss',
 'Hours/Week',
 'Native Country',
 'Income',
 'data_set',
 'Label',
 'Workclass_indexed',
 'Marital Status_indexed',
 'Occupation_indexed',
 'Relationship_indexed',
 'Race_indexed',
 'Sex_indexed',
 'Native Country_indexed',
 'Workclass_encoded',
 'Marital Status_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Sex_encoded',
 'Native Country_encoded']

In [178]:
#featureCols = ["{0}_encoded".format(col) for col in categorical_columns]# + ["{0}_scaled".format(col) for col in numerical_columns]
featureCols = ["{0}_encoded".format(col) for col in categorical_columns] + numerical_columns

In [179]:
featureCols

['Workclass_encoded',
 'Marital Status_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Sex_encoded',
 'Native Country_encoded',
 'Age',
 'Education Num',
 'Capital Gain',
 'Capital Loss',
 'Hours/Week']

In [180]:
# The VectorAssembler combines a given list of columns into a single feature vector #column.
assembler = VectorAssembler(inputCols=featureCols, outputCol="Features")

In [181]:
assembled_train_df = assembler.transform(assembled_train_df)

In [182]:
assembled_train_df.columns

['Age',
 'Workclass',
 'fnlgwt',
 'Education Num',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Capital Gain',
 'Capital Loss',
 'Hours/Week',
 'Native Country',
 'Income',
 'data_set',
 'Label',
 'Workclass_indexed',
 'Marital Status_indexed',
 'Occupation_indexed',
 'Relationship_indexed',
 'Race_indexed',
 'Sex_indexed',
 'Native Country_indexed',
 'Workclass_encoded',
 'Marital Status_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Sex_encoded',
 'Native Country_encoded',
 'Features']

In [183]:
assembled_train_df.select('Features', 'Label').show(truncate=False)

+------------------------------------------------------------------------------------------+-----+
|Features                                                                                  |Label|
+------------------------------------------------------------------------------------------+-----+
|(87,[3,8,17,29,34,39,41,82,83,84,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,39.0,13.0,2174.0,40.0]) |0.0  |
|(87,[1,7,16,28,34,39,41,82,83,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,13.0,13.0])           |0.0  |
|(87,[0,9,22,29,34,39,41,82,83,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,38.0,9.0,40.0])            |0.0  |
|(87,[0,7,22,28,35,39,41,82,83,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,53.0,7.0,40.0])            |0.0  |
|(87,[0,7,15,32,35,40,49,82,83,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,28.0,13.0,40.0])           |0.0  |
|(87,[0,7,16,32,34,40,41,82,83,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,37.0,14.0,40.0])           |0.0  |
|(87,[0,12,19,29,35,40,52,82,83,86],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,49.0,5.0,16.0])           |0.0  |
|(87,[1,7,

In [184]:
from pyspark.ml.classification import LogisticRegression

In [185]:
log_reg = LogisticRegression(featuresCol='Features', labelCol='Label', rawPredictionCol='rawPrediction', maxIter=20, regParam=0.3, elasticNetParam=1.0, family='binomial')

In [186]:
model = log_reg.fit(assembled_train_df)

In [187]:
predictions = model.transform(assembled_train_df)

In [188]:
predictions.columns

['Age',
 'Workclass',
 'fnlgwt',
 'Education Num',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Capital Gain',
 'Capital Loss',
 'Hours/Week',
 'Native Country',
 'Income',
 'data_set',
 'Label',
 'Workclass_indexed',
 'Marital Status_indexed',
 'Occupation_indexed',
 'Relationship_indexed',
 'Race_indexed',
 'Sex_indexed',
 'Native Country_indexed',
 'Workclass_encoded',
 'Marital Status_encoded',
 'Occupation_encoded',
 'Relationship_encoded',
 'Race_encoded',
 'Sex_encoded',
 'Native Country_encoded',
 'Features',
 'rawPrediction',
 'probability',
 'prediction']

In [189]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [190]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Label', metricName='areaUnderROC')

In [191]:
evaluator.evaluate(predictions)

0.5

In [192]:
predlbls = predictions.select("prediction", "Label")

In [193]:
predlbls.show()

+----------+-----+
|prediction|Label|
+----------+-----+
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  1.0|
|       0.0|  1.0|
+----------+-----+
only showing top 20 rows



In [194]:
counttotal = predictions.count()
counttotal

30162

In [196]:
correct = predlbls.filter(col('Label') == col("prediction")).count()
correct

22654

In [197]:
wrong = predlbls.filter(col('Label') != col("prediction")).count()
wrong

7508

In [198]:
ratioCorrect = float(correct)/counttotal
print("Accuracy: {0}".format(ratioCorrect))

Accuracy: 0.7510775147536636


In [203]:
predlbls.filter(col("Label") == 0.0).count()

22654

In [201]:
predlbls.filter(col("Label") == 1.0).count()

7508

In [205]:
# NUll Accuracy
max(predlbls.filter(col("Label") == 0.0).count(), predlbls.filter(col("Label") == 1.0).count()) / counttotal

0.7510775147536636

In [87]:
steps = string_indexers + onehot_encoders + standard_scalers + [assembler, log_reg]
steps

[StringIndexer_43fab969dedc192a7534,
 StringIndexer_4424856a24db439bc1db,
 StringIndexer_4ce4b4652eaeec7029c1,
 StringIndexer_4dd0aae2c26fa254069f,
 StringIndexer_4f6db70fc02e15b37597,
 StringIndexer_4b42a2c5f92f3663bfc6,
 StringIndexer_4599a985539e6b70e538,
 OneHotEncoder_4d27b685e5b62f85fb0f,
 OneHotEncoder_45e7b0076b489e6fe698,
 OneHotEncoder_4ba3b342932a42b651ea,
 OneHotEncoder_44aaa0057631c754793f,
 OneHotEncoder_4e9d9a9a136b94c4bb9f,
 OneHotEncoder_40a4aca477b79a19c559,
 OneHotEncoder_4cf4a5e43fa1d8a2a411,
 StandardScaler_49028329173a9cf2fc3a,
 StandardScaler_4b519703cb8765e132f5,
 StandardScaler_46cb8b6b14be6bf1e2d5,
 StandardScaler_4de49455b12ed238278d,
 StandardScaler_4363aa317141b8f2a54a,
 VectorAssembler_409292ee2dd1ff1056e3,
 LogisticRegression_4dc8a66086fe671a48c6]

In [85]:
pipeline = Pipeline(stages=steps)

In [89]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Label', metricName='accuracy')

In [95]:
model = pipeline.fit(adult_union_df.filter(col('data_set') == 'train'))

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/Users/anindyas/anaconda/envs/pyspark/lib/python3.5/site-packages/pyspark/ml/wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
AttributeError: 'BinaryClassificationEvaluator' object has no attribute '_java_obj'


IllegalArgumentException: 'requirement failed: Column Age must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually IntegerType.'

In [92]:
# Set up 3-fold cross validation with paramGrid
crossVal = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=None, numFolds=3)

In [93]:
cvModel = crossVal.fit(adult_union_df.filter(col('data_set') == 'train'))

TypeError: object of type 'NoneType' has no len()

In [42]:
#spark.stop()