In [175]:
from IPython.display import display

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook.
# The master URL is set through builder.master(), which populates the "spark.master"
# property. The previous key "spark-master" is not a Spark property, so the requested
# master was never actually applied.
# "local" runs Spark with a single worker thread; switch to "local[*]" to use all cores.
spark = (
    SparkSession.builder
    .appName("test")
    .master("local")
    .getOrCreate()
)
# Last expression of the cell: renders the session summary.
spark

In [201]:
from pyspark.sql.types import *
# Column names, in the order they are assigned to the CSV columns.
# NOTE: the "miragion_code_within_region" spelling is kept exactly as it was, since the
# column name is part of the resulting DataFrame's schema.
CENSUS_COLUMN_NAMES = [
    "age",
    "class_of_worker",
    "industry_code",
    "occupation_code",
    "education",
    "wage_per_hr",
    "enrolled_in_edu_inst_last_wk",
    "marital_status",
    "major_industry_code",
    "major_occupation_code",
    "race",
    "hispanic_origin",
    "sex",
    "mem_labour_union",
    "unemployment_reason",
    "employment_status",
    "capital_gain",
    "capital_loss",
    "stock_dividends",
    "tax_filer_status",
    "prev_region",
    "prev_state",
    "household_status",
    "household_summary",
    "instance_weight",
    "migration_code_msa",
    "migration_code_region",
    "miragion_code_within_region",
    "live_in_this_house_one_year_ago",
    "migration_prev_res_in_sunbelt",
    "num_persons_for_employer",
    "parent",
    "birth_country_father",
    "birth_country_mother",
    "birth_country_self",
    "citizenship",
    "own_business",
    "veteran_QA",
    "veteran_benefits",
    "weeks_worked_in_yr",
    "year",
    "income",
]

# Every column is declared as a nullable string; numeric columns are cast in a later cell.
struct_fields_list = [
    StructField(column_name, StringType(), True)
    for column_name in CENSUS_COLUMN_NAMES
]

schema = StructType(struct_fields_list)

# Load both splits using the explicit all-string schema (no schema inference).
spark_train = spark.read.schema(schema).csv("census-income.data")
spark_test = spark.read.schema(schema).csv("census-income.test")

# Row counts of each split.
TRAIN_SIZE, TEST_SIZE = spark_train.count(), spark_test.count()

print(f"Train set size: {TRAIN_SIZE}")
print(f"Test set size: {TEST_SIZE}")

Train set size: 199523
Test set size: 99762


In [202]:
# Build the full dataset by appending the test rows to the train rows.
# union() matches columns by position; both frames were read with the same schema.
spark_ds = spark_train.union(spark_test)

In [203]:
# Drop the instance_weight column (as recommended by the dataset description).
# This reassigns spark_ds, so from here on the frame has one column fewer than the schema.
spark_ds = spark_ds.drop("instance_weight")

In [204]:
# Cast the continuous-valued columns to numeric types.
# All columns were read as strings because some entries in the numeric columns are
# empty strings, which caused a NumberFormatException when parsed as numbers at load time.
numeric_columns = ["age", "wage_per_hr", "capital_gain", "capital_loss", "stock_dividends", "num_persons_for_employer", "weeks_worked_in_yr"]

# Target Spark SQL type for each numeric column.
NUMERIC_CAST_TYPES = {
    "age": "int",
    "wage_per_hr": "double",
    "capital_gain": "double",
    "capital_loss": "double",
    "stock_dividends": "double",
    "num_persons_for_employer": "long",
    "weeks_worked_in_yr": "long",
}

# withColumn() with an existing name replaces that column in place, so column order is kept.
for column_name in numeric_columns:
    target_type = NUMERIC_CAST_TYPES[column_name]
    spark_ds = spark_ds.withColumn(column_name, spark_ds[column_name].cast(target_type))

In [205]:
# Preview the first five rows, one field per line, without truncating long values.
spark_ds.show(n=5, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------
 age                             | 73                                        
 class_of_worker                 |  Not in universe                          
 industry_code                   |  0                                        
 occupation_code                 |  0                                        
 education                       |  High school graduate                     
 wage_per_hr                     | 0.0                                       
 enrolled_in_edu_inst_last_wk    |  Not in universe                          
 marital_status                  |  Widowed                                  
 major_industry_code             |  Not in universe or children              
 major_occupation_code           |  Not in universe                          
 race                            |  White                                    
 hispanic_origin                 |  All other                   

In [183]:
# Print the column names and types of spark_ds to check the result of the numeric casts.
spark_ds.printSchema()

root
 |-- age: integer (nullable = true)
 |-- class_of_worker: string (nullable = true)
 |-- industry_code: string (nullable = true)
 |-- occupation_code: string (nullable = true)
 |-- education: string (nullable = true)
 |-- wage_per_hr: double (nullable = true)
 |-- enrolled_in_edu_inst_last_wk: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- major_industry_code: string (nullable = true)
 |-- major_occupation_code: string (nullable = true)
 |-- race: string (nullable = true)
 |-- hispanic_origin: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- mem_labour_union: string (nullable = true)
 |-- unemployment_reason: string (nullable = true)
 |-- employment_status: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- stock_dividends: double (nullable = true)
 |-- tax_filer_status: string (nullable = true)
 |-- prev_region: string (nullable = true)
 |-- prev_state: string (nullable 

In [178]:
# Shape of dataset: number of rows, then number of columns.
# display() is called explicitly because, with IPython's default settings, only the last
# expression of a cell is rendered; bare non-final expressions are evaluated and discarded.
display(spark_ds.count())
display(len(spark_ds.columns))
# Pretty-print the first five rows as a pandas DataFrame.
display(spark_ds.limit(5).toPandas())

299285

41

Unnamed: 0,age,class_of_worker,industry_code,occupation_code,education,wage_per_hr,enrolled_in_edu_inst_last_wk,marital_status,major_industry_code,major_occupation_code,...,birth_country_father,birth_country_mother,birth_country_self,citizenship,own_business,veteran_QA,veteran_benefits,weeks_worked_in_yr,year,income
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [179]:
# Convert the full Spark DataFrame into a pandas DataFrame.
pd_dataset = spark_ds.toPandas()
# TODO: recover the train and test sets from pd_dataset (this step was left unfinished).
# NOTE(review): TRAIN_SIZE / TEST_SIZE are computed in the load cell; presumably the first
# TRAIN_SIZE rows are the training split — confirm row order survives union()/toPandas().
pd_dataset.dtypes

age                                object
class_of_worker                    object
industry_code                      object
occupation_code                    object
education                          object
wage_per_hr                        object
enrolled_in_edu_inst_last_wk       object
marital_status                     object
major_industry_code                object
major_occupation_code              object
race                               object
hispanic_origin                    object
sex                                object
mem_labour_union                   object
unemployment_reason                object
employment_status                  object
capital_gain                       object
capital_loss                       object
stock_dividends                    object
tax_filer_status                   object
prev_region                        object
prev_state                         object
household_status                   object
household_summary                 