# Melek Machine Learning Case Study : 1994 U.S. Census Data

#### author : Baiq Nurul Haqiqi

Description of the dataset :
In this project, you will employ several supervised algorithms of your choice to accurately model individuals' income using data collected from the 1994 U.S. Census. You will then choose the best candidate algorithm from preliminary results and further optimize this algorithm to best model the data. Your goal with this implementation is to construct a model that accurately predicts whether an individual makes more than $50,000. This sort of task can arise in a non-profit setting, where organizations survive on donations. Understanding an individual's income can help a non-profit better understand how large of a donation to request, or whether or not they should reach out to begin with. While it can be difficult to determine an individual's general income bracket directly from public sources, we can (as we will see) infer this value from other publically available features.


In [1]:
# Importing the library
import numpy as np
import pandas as pd

In [2]:
# Loading the dataset
data = pd.read_csv('census.csv')
data.head(10)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
5,37,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
6,49,Private,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
8,31,Private,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
9,42,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [3]:
# Checking missing value
check_missing = pd.concat([data.isnull().sum()],axis = 1, keys = ["Column"])
check_missing[check_missing.sum(axis=1) > 0]

Unnamed: 0,Column


In [4]:
# Preparing the dependent variable
data['income'] = np.where(data.income == '<=50K', 0, 1)
data['income'] = data['income'].apply(str)
labels = data.pop('income')

In [5]:
# Processing the categorical data
for col in data.dtypes[data.dtypes == "object"].index:
    # deleting the columns
    for_dummy = data.pop(col)
    # get_dummies
    data_dummies = pd.get_dummies(for_dummy, prefix = col)
    # concating the data
    data = pd.concat([data, data_dummies], axis = 1)

In [6]:
data.head(10)

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,13.0,2174.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13.0,0.0,0.0,13.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,38,9.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,7.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,37,14.0,0.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
6,49,5.0,0.0,0.0,16.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,52,9.0,0.0,0.0,45.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
8,31,14.0,14084.0,0.0,50.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9,42,13.0,5178.0,0.0,40.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Splitting the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, labels,test_size = 0.25)

In [8]:
x_train.shape

(33916, 103)

In [9]:
x_test.shape

(11306, 103)

In [10]:
y_train.shape

(33916,)

In [11]:
y_test.shape

(11306,)

In [12]:
# Decision Tree Prediction
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
# fitting
dt.fit(x_train,y_train)
# prediction
y_pred = dt.predict(x_test)
y_pred
# Print accuracy
from sklearn.metrics import accuracy_score
print("Accuracy untuk data train: " + str(accuracy_score(y_train, dt.predict(x_train))))
print("Accuracy untuk data test: " + str(accuracy_score(y_test, dt.predict(x_test))))

Accuracy untuk data train: 0.9735228210873924
Accuracy untuk data test: 0.8169113744914205


In [17]:
# Logistic Regression Prediction
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# fitting
lr.fit(x_train,y_train)
# prediction
y_pred = lr.predict(x_test)
y_pred
# Print accuracy
from sklearn.metrics import accuracy_score
print("Accuracy untuk data train: " + str(accuracy_score(y_train, lr.predict(x_train))))
print("Accuracy untuk data test: " + str(accuracy_score(y_test, lr.predict(x_test))))

Accuracy untuk data train: 0.8458839485788419
Accuracy untuk data test: 0.8522023704227843


In [14]:
# K-NN Prediction
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
# fitting
knn.fit(x_train,y_train)
# prediction
y_pred = knn.predict(x_test)
y_pred
# Print accuracy
from sklearn.metrics import accuracy_score
print("Accuracy untuk data train: " + str(accuracy_score(y_train, knn.predict(x_train))))
print("Accuracy untuk data test: " + str(accuracy_score(y_test, knn.predict(x_test))))

Accuracy untuk data train: 0.8898160160396273
Accuracy untuk data test: 0.8427383690076066


In [15]:
# Random Forest Prediction
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
# fitting
rf.fit(x_train,y_train)
# prediction
y_pred = rf.predict(x_test)
y_pred
# Print accuracy
from sklearn.metrics import accuracy_score
print("Accuracy untuk data train: " + str(accuracy_score(y_train, rf.predict(x_train))))
print("Accuracy untuk data test: " + str(accuracy_score(y_test, rf.predict(x_test))))

Accuracy untuk data train: 0.9734933364783583
Accuracy untuk data test: 0.8454802759596675


In [None]:
# SVM Prediction
from sklearn import svm
sv = svm.SVC()
# fitting
sv.fit(x_train,y_train)
# prediction
y_pred = sv.predict(x_test)
y_pred
# Print accuracy
from sklearn.metrics import accuracy_score
print("Accuracy untuk data train: " + str(accuracy_score(y_train, sv.predict(x_train))))
print("Accuracy untuk data test: " + str(accuracy_score(y_test, sv.predict(x_test))))