In [None]:
# # Data set description
# **Data income, classification problem**
# The original data has attributes:
# income (>50K, <=50K), age (continuous), workclass (Private, Self-emp-not-inc, ...), fnlwgt (continuous), education (Bachelors, ...), education-num (continuous),
# marital-status (Married-civ-spouse, ...), occupation (Tech-support, ...), relationship (Wife, ...), race (White, ...), sex (Female, Male), capital-gain (continuous), capital-loss (continuous), hours-per-week (continuous) and native-country.
#
# I removed attributes that contain only minor categories and kept attributes that have large categories. For example, for race, White and Black are the large categories, and for native-country, United-States is the main category.
#
# So my final attributes are:
# income, age, education-num (plus a high-degree flag derived from education), marital-status, race (White, Black), sex, capital-gain, capital-loss, hours-per-week, native-country.
# Here I cleaned the data set so that it contains only numerical variables.
#
# https://archive.ics.uci.edu/ml/datasets/Adult

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
adult_file_path = './Data/adult.csv'
adult_df = pd.read_csv(adult_file_path)

# Normalise headers such as "education.num" to "education_num".
# regex=False makes '.' a literal dot; as a regex it would match every character.
adult_df.columns = (
    adult_df.columns.str.strip().str.lower().str.replace('.', '_', regex=False)
)

# '?' is treated as the missing-value marker; drop every row that contains one.
adult_df = adult_df.replace({'?': np.nan}).dropna()

# Preprocessing
# Every categorical attribute is turned into one or more 0/1 indicator columns
# so that the final frame holds numeric values only.
adult_df['income'] = (adult_df['income'] == '>50K').astype('int64')
adult_df['sex'] = (adult_df['sex'] == 'Male').astype('int64')

adult_df['white'] = (adult_df['race'] == 'White').astype('int64')
adult_df['black'] = (adult_df['race'] == 'Black').astype('int64')

# NOTE: despite its name, this flag means "native_country is United-States".
adult_df['native_american'] = (
    adult_df['native_country'] == 'United-States'
).astype('int64')

# One indicator per marital status of interest. All of them must be derived
# from the marital_status column (the 'single' flag was previously computed
# from the race column by mistake and therefore was always 0).
marital_status_flags = {
    'single': 'Never-married',
    'married': 'Married-civ-spouse',
    'separated': 'Separated',
    'divorced': 'Divorced',
    'widowed': 'Widowed',
}
for flag_name, status_value in marital_status_flags.items():
    adult_df[flag_name] = (
        adult_df['marital_status'] == status_value
    ).astype('int64')

adult_df['high_degree'] = (
    adult_df['education'].isin(['Masters', 'Doctorate'])
).astype('int64')

# Final column selection; 'income' is the target and is kept last.
adult_features = ['age', 'sex', 'education_num', 'hours_per_week', 'native_american', 'white', 'black', 'single', 'married',
                  'separated', 'divorced', 'widowed', 'high_degree', 'capital_gain', 'capital_loss', 'income']
adult_df = adult_df[adult_features]

# Feature matrix (all columns except the target) and target as a column vector.
X = adult_df.drop(['income'], axis=1).values
y = adult_df['income'].values.reshape(-1, 1)

In [5]:
X

array([[  82,    0,    9, ...,    0,    0, 4356],
       [  54,    0,    4, ...,    0,    0, 3900],
       [  41,    0,   10, ...,    0,    0, 3900],
       ...,
       [  40,    1,    9, ...,    0,    0,    0],
       [  58,    0,    9, ...,    0,    0,    0],
       [  22,    1,    9, ...,    0,    0,    0]])

In [6]:
y

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])