In [23]:
import numpy as np
import csv
from collections import Counter

In [3]:
train_dict = []
with open('train_data.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        train_dict.append(row)

In [4]:
test_dict = []
with open('test_data.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        test_dict.append(row)

In [7]:
headings = []
train_data = []

In [8]:
with open('train_data.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        train_data.append(row)

In [9]:
headings = np.array(train_data[0])
train_data = np.array(train_data[1:])

In [10]:
test_data = []
with open('test_data.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        test_data.append(row)
test_data = np.array(test_data[1:])

In [11]:
headings

array(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'label'], 
      dtype='<U14')

In [12]:
gdp = {'Cambodia': 267.63, "Canada": 19785.68, "China": 469.21, "Columbia": 2274.89, "Cuba": 2615.02,
"Dominican-Republic": 1852.24, "Ecuador": 2050.32, "El-Salvador": 1424.09, "England": 19531.98, 
"France": 23626.09, "Germany": 27087.56, "Greece": 11089.90, "Guatemala": 1330.74, "Haiti": 276.29,
"Honduras": 628.96, "Hong": 22502.58, "Hungary": 4150.06, "India": 354.85, "Iran": 1125.12, 
"Ireland": 15887.05, "Italy": 19280.86, "Jamaica": 2007.37, "Japan": 38814.89, 
"Laos": 324.89, "Mexico": 5637.27, "Nicaragua": 847.72, "Outlying-US(Guam-USVI-etc)": 3500, "Peru": 1859.52, 
"Philippines": 941.72, "Poland": 2819.70, "Portugal": 9978.30, "Puerto-Rico": 10876.42, 
"Scotland": 19531.00, "South": 3546.67, "Taiwan": 12108.8, "Thailand": 2467.18, "Trinadad&Tobago": 3956.76, 
 "United-States": 27776.81, "Vietnam": 229.95, "Yugoslavia": 3549, "Holand-Netherlands": 24236.55}

In [16]:
def onehot(labels, person_dict, category, best):
    person_label = person_dict[category]
    if person_label == "?":
        person_label = best
    onehot = np.zeros((len(labels),))
    onehot[labels.index(person_label)] = 1
    return onehot

In [17]:
def onehot_stack(labels, people_dict, category, best):
    output = np.zeros((len(people_dict), len(labels)))
    for i, person_dict in enumerate(people_dict):
        output[i] = onehot(labels, person_dict, category, best)
    return output

In [228]:
def get_fnlwgt(data_list):
    x = data_list[:,2].astype("int")//10000
    return x.reshape((len(x), 1))
def get_education_num(data_list):
    return data_list[:,4].reshape((len(data_list),1)).astype("int")
def get_capital_gain(data_list):
    return data_list[:,10].reshape((len(data_list),1)).astype("int")
def get_capital_loss(data_list):
    return data_list[:,11].reshape((len(data_list),1)).astype("int")
def get_hours_per_week(data_list):
    return data_list[:,12].reshape((len(data_list),1)).astype("int")

In [229]:
def construct_design_matrix(data_dict, data_list):
    age_stack = get_age(data_dict)
    work_class_onehot_stack = onehot_stack(work_classes_list, data_dict, "workclass", "Private")
    fnlwgt_stack = get_fnlwgt(data_list)
    educations_onehot_stack = onehot_stack(educations_list, data_dict, "education", "HS-grad")
    education_num_stack = get_education_num(data_list)
    marital_status_onehot_stack = onehot_stack(marital_status_list, data_dict, "marital-status", "Married-civ-spouse")
    occupation_onehot_stack = onehot_stack(occupation_list, data_dict, "occupation", "Exec-managerial")
    relationship_onehot_stack = onehot_stack(relationship_list, data_dict, "relationship", "Husband")
    race_onehot_stack = onehot_stack(race_list, data_dict, "race", "White")
    sex_stack = get_sex(data_dict)
    capital_gain_stack = get_capital_gain(data_list)
    capital_loss_stack = get_capital_loss(data_list)
    hours_per_week_stack = get_hours_per_week(data_list)
    gdp_stack = get_gdp(data_dict)
    return np.concatenate((age_stack, work_class_onehot_stack, fnlwgt_stack, educations_onehot_stack, 
                           education_num_stack, marital_status_onehot_stack, occupation_onehot_stack, 
                          relationship_onehot_stack, race_onehot_stack, sex_stack, capital_gain_stack,
                          capital_loss_stack, hours_per_week_stack, gdp_stack), axis=1)

In [230]:
X_train = construct_design_matrix(train_dict, train_data)
X_test = construct_design_matrix(test_dict, test_data)

In [233]:
census_dict = {}
census_dict["training_data"] = X_train
census_dict["test_data"] = X_test
census_dict["training_labels"] = labels_train.astype("int")

In [234]:
import scipy
from scipy import io
io.savemat("census_data.mat", census_dict)

**AGE**

In [111]:
mean_age = np.mean(train_data[:,0].astype("int"))
def get_age(data):
    mean_age = 38.6681945972
    ages = np.zeros(len(data))
    for i, person in enumerate(data):
        age_i = int(person["age"])
        if age_i == "?":
            age_i = mean_age
        diff = abs(age_i-mean_age)
        ages[i] = diff
    return ages.reshape((len(ages),1))

**WORK CLASS**

In [92]:
work_classes = train_data[:,1]
Counter(work_classes)

Counter({'?': 1868,
         'Federal-gov': 936,
         'Local-gov': 2082,
         'Never-worked': 7,
         'Private': 22792,
         'Self-emp-inc': 1140,
         'Self-emp-not-inc': 2593,
         'State-gov': 1293,
         'Without-pay': 13})

In [19]:
work_classes_set = set(work_classes)
work_classes_set.remove("?")
work_classes_list = list(work_classes_set)

In [20]:
work_classes_list

['Without-pay',
 'Never-worked',
 'Local-gov',
 'Federal-gov',
 'Self-emp-not-inc',
 'Private',
 'State-gov',
 'Self-emp-inc']

In [21]:
work_class_onehot_stack = onehot_stack(work_classes_list, train_dict, "workclass", "Private")

In [55]:
work_class_onehot_stack.shape

(32724, 8)

**EDUCATION**

In [26]:
educations = train_data[:,3]
Counter(educations)

Counter({'10th': 959,
         '11th': 1185,
         '12th': 432,
         '1st-4th': 163,
         '5th-6th': 352,
         '7th-8th': 634,
         '9th': 505,
         'Assoc-acdm': 1075,
         'Assoc-voc': 1395,
         'Bachelors': 5366,
         'Doctorate': 391,
         'HS-grad': 10570,
         'Masters': 1779,
         'Preschool': 55,
         'Prof-school': 562,
         'Some-college': 7301})

In [27]:
educations_list = list(set(educations))

In [30]:
educations_onehot_stack = onehot_stack(educations_list, train_dict, "education", "HS-grad")

**MARITAL STATUS**

In [33]:
marital_status = train_data[:,5]
Counter(marital_status)

Counter({'Divorced': 4443,
         'Married-AF-spouse': 25,
         'Married-civ-spouse': 15066,
         'Married-spouse-absent': 422,
         'Never-married': 10733,
         'Separated': 1013,
         'Widowed': 1022})

In [34]:
marital_status_list = list(set(marital_status))
marital_status_onehot_stack = onehot_stack(marital_status_list, train_dict, "marital-status", "Married-civ-spouse")

**OCCUPATION**

In [36]:
occupation = train_data[:,6]
Counter(occupation)

Counter({'?': 1875,
         'Adm-clerical': 3736,
         'Armed-Forces': 12,
         'Craft-repair': 4030,
         'Exec-managerial': 4097,
         'Farming-fishing': 1031,
         'Handlers-cleaners': 1383,
         'Machine-op-inspct': 2060,
         'Other-service': 3320,
         'Priv-house-serv': 164,
         'Prof-specialty': 4087,
         'Protective-serv': 648,
         'Sales': 3740,
         'Tech-support': 965,
         'Transport-moving': 1576})

In [37]:
occupation_set = set(occupation)
occupation_set.remove("?")
occupation_list = list(occupation_set)

In [38]:
occupation_onehot_stack = onehot_stack(occupation_list, train_dict, "occupation", "Exec-managerial")

**RELATIONSHIP**

In [43]:
relationship = train_data[:,7]
Counter(relationship)

Counter({'Husband': 13277,
         'Not-in-family': 8307,
         'Other-relative': 981,
         'Own-child': 5105,
         'Unmarried': 3483,
         'Wife': 1571})

In [44]:
relationship_list = list(set(relationship))
relationship_onehot_stack = onehot_stack(relationship_list, train_dict, "relationship", "Husband")

**RACE**

In [48]:
race = train_data[:,8]
Counter(race)

Counter({'Amer-Indian-Eskimo': 325,
         'Asian-Pac-Islander': 1043,
         'Black': 3147,
         'Other': 283,
         'White': 27926})

In [49]:
race_list = list(set(race))
race_onehot_stack = onehot_stack(race_list, train_dict, "race", "White")

**SEX**

In [51]:
sex = train_data[:,9]
Counter(sex)

Counter({'Female': 10856, 'Male': 21868})

In [78]:
def get_sex(data):
    sex_stack = np.zeros(len(data))
    for i, person in enumerate(data):
        if person["sex"] == "Male":
            sex_stack[i] = 1
    return sex_stack.reshape((len(sex_stack),1))

In [79]:
get_sex(train_dict)

array([[ 1.],
       [ 0.],
       [ 0.],
       ..., 
       [ 1.],
       [ 1.],
       [ 1.]])

**NATIVE COUNTRY**

In [75]:
def get_gdp(data):
    country_gdps = np.zeros(len(data))
    for i, person in enumerate(data):
        country = person["native-country"]
        if country == "?":
            country = "United-States"
        country_gdps[i] = gdp[country]
    return country_gdps.reshape((len(country_gdps),1))

In [77]:
get_gdp(train_dict)

array([[ 27776.81],
       [ 27776.81],
       [ 27776.81],
       ..., 
       [ 27776.81],
       [ 27776.81],
       [ 27776.81]])

In [213]:
countries = train_data[:,-2]

In [218]:
countries_set = set(countries)
countries_set.remove("?")
countries_list = list(countries_set)

In [220]:
countries_onehot_stack = onehot_stack(countries_list, train_dict, "native-country", "United-States")