In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PlotFunctions
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

sns.set()

In [2]:
# List of column names
column_names = ["age","work class","fnlwgt","education","education-num","marital-status","occupation",
                "relationship","race","sex","capital-gain","capital-loss","hours-per-week",
                "native-country", "income class"]

Merge the test and train datasets since I'll probably use cross validation rather than a train-test split

In [3]:
data_path = "Data/"
raw_data1 = pd.read_csv(data_path+"adult.data", names=column_names,skipinitialspace=True)
raw_data2 = pd.read_csv(data_path+"adult.test", names=column_names,skipinitialspace=True)
raw_data_combined = pd.concat([raw_data1, raw_data2])
raw_data_combined.reset_index(drop=True, inplace=True)
raw_data_combined.head()

Unnamed: 0,age,work class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
raw_data_combined.shape

(48842, 15)

In [5]:
data = raw_data_combined.copy()

In [6]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


## Deal with missing data

In [7]:
# Look for how missing values are denoted
for col in data:
    print(np.unique(getattr(data,col).values))

[17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
 89 90]
['?' 'Federal-gov' 'Local-gov' 'Never-worked' 'Private' 'Self-emp-inc'
 'Self-emp-not-inc' 'State-gov' 'Without-pay']
[  12285   13492   13769 ... 1455435 1484705 1490400]
['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college']
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed']
['?' 'Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'Other-service'
 'Priv-house-serv' 'Prof-specialty' 'Protective-serv' 'Sales'
 'Tech-support' 'Transport-moving']
['Husband' 'Not-in-fam

In [8]:
indices_to_drop = []

for col in data:
    for i,val in enumerate(data[col]):
        if(val == '?' or val == ' ?'):
            indices_to_drop.append(i)

indices_to_drop = list(set(indices_to_drop))
print(indices_to_drop)
print(len(indices_to_drop))

[8193, 24579, 24580, 14, 16399, 8207, 24592, 16404, 32789, 32790, 16410, 27, 8222, 24606, 8225, 16417, 40993, 32806, 38, 32814, 8241, 51, 16439, 32827, 24636, 61, 32829, 32835, 24644, 69, 16454, 16456, 77, 24662, 8282, 93, 32866, 16488, 8297, 106, 16490, 32877, 24686, 16500, 24694, 24695, 32890, 32892, 128, 24704, 8322, 16515, 24714, 16523, 32911, 149, 16535, 154, 24735, 160, 8356, 32939, 8364, 8365, 24750, 24752, 16566, 24760, 41145, 187, 32955, 32958, 8387, 32964, 24773, 16583, 41160, 201, 24779, 24781, 32974, 16595, 24787, 41172, 32981, 16602, 24796, 221, 32990, 41182, 226, 32995, 41187, 32998, 24809, 41194, 41196, 8429, 24813, 243, 24821, 245, 41209, 249, 16635, 24828, 8446, 8447, 24832, 16642, 41221, 33031, 16647, 33032, 266, 41228, 16659, 8472, 41242, 8477, 16679, 41255, 297, 33066, 24875, 24876, 41260, 41263, 8499, 33076, 33077, 41271, 312, 41274, 24890, 24891, 24894, 41282, 326, 24902, 24913, 8532, 16725, 346, 347, 16731, 24923, 8543, 354, 41314, 33124, 16743, 41321, 16748, 413

In [9]:
print("Percent to drop:\n", (len(indices_to_drop) / data.shape[0]) * 100, "%")

Percent to drop:
 7.411653904426519 %


This is a large portion of data to drop, but I think it will be worth it. Using cross-validation will help mitigate this loss of data.

In [10]:
data_no_mv = data.copy()

In [11]:
data_no_mv = data_no_mv.drop(indices_to_drop, axis=0)
data_no_mv = data_no_mv.reset_index(drop=True)
data_no_mv

Unnamed: 0,age,work class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
45218,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
45219,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
45220,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [12]:
data_no_mv.shape

(45222, 15)

## Categorical data

In [13]:
categorical_names = ["work class", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

categorical_df = data_no_mv.copy()
category_only = categorical_df.loc[:,categorical_names]

categorical_df.head()

Unnamed: 0,age,work class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# We'll start with income class 
categorical_df["income class"] = categorical_df["income class"].map({"<=50K": 0, "<=50K.": 0, ">50K": 1, ">50K.": 1})
print(np.unique(categorical_df["income class"]))

categorical_df.head()

[0 1]


Unnamed: 0,age,work class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [15]:
# List so we know how each category names are mapped
mappings_list = []

for name in categorical_names: 
    mappings = []
    label_enc = LabelEncoder()
    labels = label_enc.fit_transform(categorical_df[name])
    col_name = name + " labels"
    categorical_df[col_name] = labels
    
    for index, label in enumerate(label_enc.classes_):
        mappings.append({label: index})
    mappings_list.append(mappings)
    
categorical_df.head()

Unnamed: 0,age,work class,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,hours-per-week,native-country,income class,work class labels,marital-status labels,occupation labels,relationship labels,race labels,sex labels,native-country labels
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,40,United-States,0,5,4,0,1,4,1,38
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,13,United-States,0,4,2,3,0,4,1,38
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,40,United-States,0,2,0,5,1,4,1,38
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,40,United-States,0,2,2,5,0,2,1,38
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,40,Cuba,0,2,2,9,5,2,0,4


In [16]:
print(mappings_list)

[[{'Federal-gov': 0}, {'Local-gov': 1}, {'Private': 2}, {'Self-emp-inc': 3}, {'Self-emp-not-inc': 4}, {'State-gov': 5}, {'Without-pay': 6}], [{'Divorced': 0}, {'Married-AF-spouse': 1}, {'Married-civ-spouse': 2}, {'Married-spouse-absent': 3}, {'Never-married': 4}, {'Separated': 5}, {'Widowed': 6}], [{'Adm-clerical': 0}, {'Armed-Forces': 1}, {'Craft-repair': 2}, {'Exec-managerial': 3}, {'Farming-fishing': 4}, {'Handlers-cleaners': 5}, {'Machine-op-inspct': 6}, {'Other-service': 7}, {'Priv-house-serv': 8}, {'Prof-specialty': 9}, {'Protective-serv': 10}, {'Sales': 11}, {'Tech-support': 12}, {'Transport-moving': 13}], [{'Husband': 0}, {'Not-in-family': 1}, {'Other-relative': 2}, {'Own-child': 3}, {'Unmarried': 4}, {'Wife': 5}], [{'Amer-Indian-Eskimo': 0}, {'Asian-Pac-Islander': 1}, {'Black': 2}, {'Other': 3}, {'White': 4}], [{'Female': 0}, {'Male': 1}], [{'Cambodia': 0}, {'Canada': 1}, {'China': 2}, {'Columbia': 3}, {'Cuba': 4}, {'Dominican-Republic': 5}, {'Ecuador': 6}, {'El-Salvador': 7},

In [17]:
# Let's remove the rows that have the actual names
categorical_df.drop(categorical_names, axis=1,inplace=True)
categorical_df.reset_index(drop=True)
categorical_df.head()

Unnamed: 0,age,fnlwgt,education,education-num,capital-gain,capital-loss,hours-per-week,income class,work class labels,marital-status labels,occupation labels,relationship labels,race labels,sex labels,native-country labels
0,39,77516,Bachelors,13,2174,0,40,0,5,4,0,1,4,1,38
1,50,83311,Bachelors,13,0,0,13,0,4,2,3,0,4,1,38
2,38,215646,HS-grad,9,0,0,40,0,2,0,5,1,4,1,38
3,53,234721,11th,7,0,0,40,0,2,2,5,0,2,1,38
4,28,338409,Bachelors,13,0,0,40,0,2,2,9,5,2,0,4


In [18]:
# Rearrange cols. Drop education since we have eduction num
categorical_df = categorical_df[["age","fnlwgt","education-num","capital-gain","capital-loss","hours-per-week",
                                 "work class labels", "marital-status labels", "occupation labels", 
                                 "relationship labels", "race labels", "sex labels", "native-country labels",
                                 "income class"]]
categorical_df.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,work class labels,marital-status labels,occupation labels,relationship labels,race labels,sex labels,native-country labels,income class
0,39,77516,13,2174,0,40,5,4,0,1,4,1,38,0
1,50,83311,13,0,0,13,4,2,3,0,4,1,38,0
2,38,215646,9,0,0,40,2,0,5,1,4,1,38,0
3,53,234721,7,0,0,40,2,2,5,0,2,1,38,0
4,28,338409,13,0,0,40,2,2,9,5,2,0,4,0


## Check Distributions

In [None]:
P

In [19]:
# Save to csv
categorical_df.to_csv(data_path+"preprocessed.csv", index=False)

In [23]:
# Going to save the mappings to a csv as well
pd.DataFrame(mappings_list).to_csv(data_path+"mappings.csv", index=False)