# Random Forest — more or less than 50K/year

### Importing
First, we import the libraries needed to read the data, to use a `RandomForestClassifier`, a `LabelEncoder`, and `GridSearchCV`.

In [1]:
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import numpy as np

### Data parsing
Then, we parse the data from `adult_data.csv` and we clarify which columns should be used as training data — we exclude fnlwgt and the result, as those both relate to income, which is what our output is (so we don't want them as an input).

In [2]:
df = pandas.read_csv("adult_data.csv")

In [3]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'result']

In [4]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,result
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
5,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
9,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


In [5]:
test_features = ['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

In [6]:
all_features = df.columns

In [7]:
df[test_features]

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
1,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
2,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
3,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
4,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
5,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica
6,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
7,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States
8,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States
9,37,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States


In [8]:
df.shape

(32560, 15)

In [9]:
num_batches = 10

We then set aside some data for training with and some for testing with.

In [10]:
num_testing = int((1/num_batches)*df.shape[0])

In [11]:
le = preprocessing.LabelEncoder()
df_transformed = pandas.DataFrame()
for feature in all_features:
    df_transformed[feature] = le.fit_transform(df[feature])

In [12]:
df_shuffled = df_transformed.sample(frac=1)
testing_batches = []
training_batches = []

In [13]:
for i in range(0, num_batches):
    testing_start_index = i*num_testing
    testing_stop_index = ((i+1)*num_testing)
    testing_batches.append(df_shuffled.iloc[testing_start_index:testing_stop_index,])
    first_training_batch = df_shuffled.iloc[:testing_start_index,]
    second_training_batch = df_shuffled.iloc[testing_stop_index:,]
    concatenated = pandas.concat([first_training_batch, second_training_batch])
    training_batches.append(concatenated)
#     print(str(first_training_batch.shape[0]) + " size and " + str(second_training_batch.shape[0]))
#     print(str(testing_start_index) + " through " + str(testing_stop_index))

In [14]:
clf = RandomForestClassifier()
accuracies = []
for i in range(0, num_batches):
    X = training_batches[i][test_features]
    Y = training_batches[i]['result']
    clf.fit(X, Y)
    predictions = clf.predict(testing_batches[i][test_features])
    results = testing_batches[i]['result']
    corrects = predictions == results
    accuracies.append(sum(corrects))

The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)
The shape of X is (29304, 13)


In [15]:
accuracies

[2773, 2763, 2771, 2723, 2723, 2754, 2740, 2785, 2714, 2757]