In [1]:
from IPython.display import display

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test").config("spark-master", "local").getOrCreate()
spark

### Set up schema and read data from CSV

In [57]:
from pyspark.sql.types import *
struct_fields_list = [
    StructField("age", IntegerType(), True),
    StructField("class_of_worker", StringType(), True),
    StructField("industry_code", StringType(), True),
    StructField("occupation_code", StringType(), True),
    StructField("education", StringType(), True),
    StructField("wage_per_hr", DoubleType(), True),
    StructField("enrolled_in_edu_inst_last_wk", StringType(), True),
    StructField("marital_status", StringType(), True),
    StructField("major_industry_code", StringType(), True),
    StructField("major_occupation_code", StringType(), True),
    StructField("race", StringType(), True),
    StructField("hispanic_origin", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("mem_labour_union", StringType(), True),
    StructField("unemployment_reason", StringType(), True),
    StructField("employment_status", StringType(), True),
    StructField("capital_gain", DoubleType(), True),
    StructField("capital_loss", DoubleType(), True),
    StructField("stock_dividends", DoubleType(), True),
    StructField("tax_filer_status", StringType(), True),
    StructField("prev_region", StringType(), True),
    StructField("prev_state", StringType(), True),
    StructField("household_status", StringType(), True),
    StructField("household_summary", StringType(), True),
    StructField("instance_weight", DoubleType(), True),
    StructField("migration_code_msa", StringType(), True),
    StructField("migration_code_region", StringType(), True),
    StructField("miragion_code_within_region", StringType(), True),
    StructField("live_in_this_house_one_year_ago", StringType(), True),
    StructField("migration_prev_res_in_sunbelt", StringType(), True),
    StructField("num_persons_for_employer", IntegerType(), True),
    StructField("parent", StringType(), True),
    StructField("birth_country_father", StringType(), True),
    StructField("birth_country_mother", StringType(), True),
    StructField("birth_country_self", StringType(), True),
    StructField("citizenship", StringType(), True),
    StructField("own_business", StringType(), True),
    StructField("veteran_QA", StringType(), True),
    StructField("veteran_benefits", StringType(), True),
    StructField("weeks_worked_in_yr", IntegerType(), True),
    StructField("year", StringType(), True),
    StructField("income", StringType(), True),
]

schema = StructType(struct_fields_list)

# read data, remove trailing and leading whitespace, set null value to ?
spark_train = spark.read.csv("census-income.data", 
                             schema=schema, 
                             ignoreLeadingWhiteSpace=True,
                             ignoreTrailingWhiteSpace=True,
                             nullValue="?")
spark_test = spark.read.csv("census-income.test", schema=schema)

TRAIN_SIZE = spark_train.count()
TEST_SIZE = spark_test.count()

print("Train set shape: ({},{})".format(TRAIN_SIZE, len(spark_train.columns)))
print("Test set shape: ({}, {})".format(TEST_SIZE, len(spark_test.columns)))

Train set shape: (199523,42)
Test set shape: (99762, 42)


In [59]:
# Get full dataset 
spark_ds = spark_train.union(spark_test)
# Drop instance_weight column (according to dataset description)
spark_ds = spark_ds.drop("instance_weight")
# Full dataset shape
print("Dataset shape: ({}, {})".format(spark_ds.count(), len(spark_ds.columns)))
print()
# Print first 5 rows
spark_ds.show(5, truncate=False, vertical=True)

Dataset shape: (299285, 41)

-RECORD 0-------------------------------------------------------------------
 age                             | 73                                       
 class_of_worker                 | Not in universe                          
 industry_code                   | 0                                        
 occupation_code                 | 0                                        
 education                       | High school graduate                     
 wage_per_hr                     | 0.0                                      
 enrolled_in_edu_inst_last_wk    | Not in universe                          
 marital_status                  | Widowed                                  
 major_industry_code             | Not in universe or children              
 major_occupation_code           | Not in universe                          
 race                            | White                                    
 hispanic_origin                 | All other   

In [60]:
target_col = "income"
nominal_cols = [x[0] for x in spark_ds.dtypes if x[1] == "string" and x[0] != target_col]
numeric_cols = [x[0] for x in spark_ds.dtypes if x[1] != "string"]

print("Nominal columns:", nominal_cols)
print()
print("Numeric columns:", numeric_cols)

Nominal columns: ['class_of_worker', 'industry_code', 'occupation_code', 'education', 'enrolled_in_edu_inst_last_wk', 'marital_status', 'major_industry_code', 'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'mem_labour_union', 'unemployment_reason', 'employment_status', 'tax_filer_status', 'prev_region', 'prev_state', 'household_status', 'household_summary', 'migration_code_msa', 'migration_code_region', 'miragion_code_within_region', 'live_in_this_house_one_year_ago', 'migration_prev_res_in_sunbelt', 'parent', 'birth_country_father', 'birth_country_mother', 'birth_country_self', 'citizenship', 'own_business', 'veteran_QA', 'veteran_benefits', 'year']

Numeric columns: ['age', 'wage_per_hr', 'capital_gain', 'capital_loss', 'stock_dividends', 'num_persons_for_employer', 'weeks_worked_in_yr']


In [87]:
from pyspark.sql.functions import *

spark_ds.select([count(when(col(c).isNull(), c)).alias(c) for c in numeric_cols]).show()

# df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-----+-----------+------------+------------+---------------+------------------------+------------------+
|  age|wage_per_hr|capital_gain|capital_loss|stock_dividends|num_persons_for_employer|weeks_worked_in_yr|
+-----+-----------+------------+------------+---------------+------------------------+------------------+
|99762|      99762|       99762|       99762|          99762|                   99762|             99762|
+-----+-----------+------------+------------+---------------+------------------------+------------------+



In [89]:
spark_test.count()

99762

### Convert to pandas dataframe

In [82]:
import pandas as pd
pd.set_option('display.max_columns', 50)
# Pandas dataframe
pd_dataset = spark_ds.toPandas()
# To get train and test set
pd_train = pd_dataset.iloc[:TRAIN_SIZE, :]
pd_test = pd_dataset.iloc[TRAIN_SIZE:, :]

pd_train.shape
pd_test.shape
# pd_dataset.dtypes

(199523, 41)

(99762, 41)