# Dataset Processing

Basic dataset processing code for adult classification data.

## Step 0: Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

## Step 1: Import data from https://archive.ics.uci.edu/ml/datasets/Adult & put in a dataframe.

In [2]:
df = pd.read_csv("../data/adult_data.csv")
df.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


Oh no — no columns are set. We set the columns of the dataframe equal to the ones defined by the data publishers.

In [3]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'result']

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,result
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


Remove extra spaces.

In [5]:
df.replace({' ': ''}, regex=True, inplace=True)

https://stackoverflow.com/questions/21720022/find-all-columns-of-dataframe-in-pandas-whose-type-is-float-or-a-particular-typ

## Step 2: Create dataframe of object type columns.

In [6]:
objectColumns = df.loc[:, df.dtypes == object]
objectNames = objectColumns.columns
objectColumns.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,result
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
1,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
4,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K


In [7]:
enc = preprocessing.LabelEncoder()

Using the imported LabelEncoder, encode a number to every row value in each column.

In [8]:
df_object = pd.DataFrame()
for feature in objectNames:
    df_object[feature] = enc.fit_transform(df[feature])

In [9]:
df_object.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,result
0,6,9,2,4,0,4,1,39,0
1,4,11,0,6,1,4,1,39,0
2,4,1,2,6,0,2,1,39,0
3,4,9,2,10,5,2,0,5,0
4,4,12,2,4,5,4,0,39,0


## Step 3: Create dataframe of int type columns.

In [10]:
intColumns = df.loc[:, df.dtypes == int]
intNames = intColumns.columns
intColumns.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,50,83311,13,0,0,13
1,38,215646,9,0,0,40
2,53,234721,7,0,0,40
3,28,338409,13,0,0,40
4,37,284582,14,0,0,40


In [11]:
scaler = preprocessing.StandardScaler()

Using the scaler, convert each column to a standard distribution.

In [12]:
df_int = pd.DataFrame()
for feature in intNames:
    df_int[feature] = np.ravel(scaler.fit_transform(df[feature].values.reshape(-1, 1))) # https://stackoverflow.com/questions/18200052/how-to-convert-ndarray-to-array



In [13]:
df_int.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.837097,-1.008742,1.134779,-0.145914,-0.216663,-2.22212
1,-0.04264,0.245046,-0.420027,-0.145914,-0.216663,-0.03543
2,1.057031,0.42577,-1.197429,-0.145914,-0.216663,-0.03543
3,-0.775755,1.408146,1.134779,-0.145914,-0.216663,-0.03543
4,-0.115952,0.89817,1.52348,-0.145914,-0.216663,-0.03543


## Step 4: Concatenate the int and object dataframes into our final_df

In [14]:
final_df = pd.concat([df_int, df_object], axis=1)

In [15]:
final_df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country,result
0,0.837097,-1.008742,1.134779,-0.145914,-0.216663,-2.22212,6,9,2,4,0,4,1,39,0
1,-0.04264,0.245046,-0.420027,-0.145914,-0.216663,-0.03543,4,11,0,6,1,4,1,39,0
2,1.057031,0.42577,-1.197429,-0.145914,-0.216663,-0.03543,4,1,2,6,0,2,1,39,0
3,-0.775755,1.408146,1.134779,-0.145914,-0.216663,-0.03543,4,9,2,10,5,2,0,5,0
4,-0.115952,0.89817,1.52348,-0.145914,-0.216663,-0.03543,4,12,2,4,5,4,0,39,0
