# Predicting Income with Random Forests

In this project, we will be using a dataset containing census information from UCI’s Machine Learning Repository.

By training a random forest on this census data, we will try to predict whether or not a person makes more than $50,000 a year.

Let’s get started!

In [1]:
import warnings


def warn(*args, **kwargs):
    """Accept any positional or keyword arguments, do nothing, return ``None``."""
    return None


# Install the no-op in place of ``warnings.warn`` so that any later call made
# through the module attribute (``warnings.warn(...)``) produces no output.
warnings.warn = warn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# Load the census data. Fields in income.csv are separated by a comma followed
# by a space, so split on ',' and skip the whitespace after each delimiter.
# A single-character separator lets pandas use its default C parser; the
# multi-character ', ' is interpreted as a regular expression and forces the
# slower Python parsing engine (with a ParserWarning).
income_data = pd.read_csv('income.csv', header=0, sep=',', skipinitialspace=True)

# print the first row in its entirety
print(income_data.iloc[0])

# separate the labels (the 'income' column) from the rest of the data
labels = income_data['income']

# Encode two text columns as 0/1 flags for later use, using vectorised
# comparisons instead of a per-row lambda:
#   'sex-int'     -> 0 for 'Male',          1 for any other value
#   'country-int' -> 0 for 'United-States', 1 for any other value
income_data['sex-int'] = (income_data['sex'] != 'Male').astype(int)
income_data['country-int'] = (income_data['native-country'] != 'United-States').astype(int)

# pick which columns to use for income prediction
# ('country-int' is created above but deliberately not part of this first model)
feature_columns = ['age', 'capital-gain', 'capital-loss', 'hours-per-week', 'sex-int']
data = income_data[feature_columns]

# split our data and labels into a training set and a test set;
# random_state is fixed so the split is the same on every run
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, random_state=1)

# create a RandomForestClassifier (seeded so results are reproducible)
forest = RandomForestClassifier(random_state=1)

# fit the model using the training data and training labels to train the random forest
forest.fit(train_data, train_labels)

# test the model's accuracy on the held-out test set
print(forest.score(test_data, test_labels))
# Uncomment to see how many rows there are for each native country:
#print(income_data['native-country'].value_counts())

age                          39
workclass             State-gov
fnlwgt                    77516
education             Bachelors
education-num                13
marital-status    Never-married
occupation         Adm-clerical
relationship      Not-in-family
race                      White
sex                        Male
capital-gain               2174
capital-loss                  0
hours-per-week               40
native-country    United-States
income                    <=50K
Name: 0, dtype: object
0.8272939442328953


In [2]:
# Same experiment as before, but with 'country-int' added to the feature
# columns. How does this change the accuracy of your model?
feature_columns = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'sex-int',
    'country-int',
]
data = income_data[feature_columns]

# Divide features and labels into training and test portions (seeded split).
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    random_state=1,
)

# Build a seeded random forest and train it on the training portion;
# fit() returns the estimator itself, so `forest` is the trained model.
forest = RandomForestClassifier(random_state=1).fit(train_data, train_labels)

# Report the model's accuracy on the test portion.
print(forest.score(test_data, test_labels))
#print(income_data['native-country'].value_counts())

0.8225033779633951
