# Playing with Binary Classification problem
dataset - https://s3-ap-southeast-1.amazonaws.com/he-public-data/datafiles19cdaf8.zip

In [2]:
import numpy as np
import pandas as pd

In [4]:
# loading the data
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [5]:
# let's see info? 
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
train.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
age               16281 non-null int64
workclass         15318 non-null object
fnlwgt            16281 non-null int64
education         16281 non-null object
education.num     16281 non-null int64
marital.status    16281 non-null object
occupation        15315 non-null object
relationship      16281 non-null object
race              16281 non-null object
sex               16281 non-null object
capital.gain      16281 non-null int64
capital.loss      16281 non-null int64
hours.per.week    16281 non-null int64
native.country    16007 non-null object
target            16281 non-null object
dtypes: int64(6), object(9)
memory usage: 1.9+ MB


** So, as we see, train has 32561 rows and test has 16281 rows **

Let's have a glimpse at our data.

In [8]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K.


**Missing data values**

As you can notice in test set above, there are NaN values. We can't have missing values and do our analysis properly.'

Let's find out all missing value in train and test data set.
We will use ```dropna()``` for this. [See more details here](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html)

In [10]:
# return total rows
train.shape[0]

32561

In [11]:
# return rows after dropping rows with NaN elements
train.dropna().shape[0]

30162

In [12]:
# number of rows having missing values
nans = train.shape[0] - train.dropna().shape[0]

In [13]:
print str(nans)+" rows have missing values in train data"

2399 rows have missing values in train data


In [14]:
nansTest = test.shape[0] - test.dropna().shape[0]
print str(nansTest)+" rows have missing values in test data"

1221 rows have missing values in test data


** We should find out in which all columns these missing values occur **

Becuase that is a lot of missing values bro. We'll use ```isnull()``` of pandas [described here](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.isnull.html)

In [15]:
train.isnull()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


As we see, it returns True for whenever there is a NaN or None value in a cell, and otherwise False. Let's sum this up to know better results.'

In [16]:
train.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

**So, only 3 columns have NaN values**

Let's select all columns which have data type as Object

In [18]:
cat = train.select_dtypes(include=['O'])
cat

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,target
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica,<=50K
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,>50K
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K


We can see that only a few keywords are there for each column and they keep on repeating. 

**Let's find out how many such unique entries are there for each column**

In [19]:
cat.apply(pd.Series.nunique)

workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

`occupation`, `native.country` and `workclass` had missing values. These have limited set of values here. 

Let's see them and refill the NaNs using them? 

**We will refill our NaNs using mode of each of those columns.**

mode = most frequently occurring one.

In [21]:
# Find the count of each occurrence of value in workclass column
train.workclass.value_counts(sort=True)

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [22]:
# replace NaNs in workclass column with most occuring one - Private
train.workclass.fillna('Private',inplace=True)

In [23]:
# DO SIMILARLY FOR occupation and Native country
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-specialty',inplace=True)

train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)

In [24]:
# Check again if any missing values left
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

**Let's look at the column that nees to be classified. How many possible values are there in it, and how many of each?**

In [26]:
# Total count of all possible values in a column
train.target.value_counts()

 <=50K    24720
 >50K      7841
Name: target, dtype: int64

**Wow, 24000+ rows of <=50K means if I classify everything as <=50K, then also accuracy could be good**

In [27]:
# Find how much percentage of total is <=50K
train.target.value_counts()/train.shape[0]*100

 <=50K    75.919044
 >50K     24.080956
Name: target, dtype: float64

**So, 75% accuracy is a no-brainer in this case.**

------------------

**Now let's see how one column actually influences another.**

We'll make a `crosstabe` of education and target to see that.

In [28]:
pd.crosstab(train.education, train.target, margins=True)

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,871,62,933
11th,1115,60,1175
12th,400,33,433
1st-4th,162,6,168
5th-6th,317,16,333
7th-8th,606,40,646
9th,487,27,514
Assoc-acdm,802,265,1067
Assoc-voc,1021,361,1382
Bachelors,3134,2221,5355


**Observation** : So, basically the above thing tells total 24720 rows of <=50K out of which 871 are in 10th standard and so on...

If You look at this, it makes sense. People who are HS-Grad or less should be having <=50K salary.

In [29]:
# We could see this in percentage as well. 
pd.crosstab(train.education, train.target,margins=True)/train.shape[0]

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,0.02675,0.001904,0.028654
11th,0.034243,0.001843,0.036086
12th,0.012285,0.001013,0.013298
1st-4th,0.004975,0.000184,0.00516
5th-6th,0.009736,0.000491,0.010227
7th-8th,0.018611,0.001228,0.01984
9th,0.014957,0.000829,0.015786
Assoc-acdm,0.024631,0.008139,0.032769
Assoc-voc,0.031357,0.011087,0.042443
Bachelors,0.09625,0.06821,0.164461


**Observation** : We see that out of 75% people with <=50K salary, 27% people are high school graduates, which is correct as people with lower levels of education are expected to earn less. On the other hand, out of 25% people with >=50K salary, 6% are bachelors and 5% are high-school grads.

---------------------

# Random Forrest Using Scikit

One thing to note is, Scikit only accepts data in numeric format. We have a lot of columns which have character data. We need to convert that now. Since we already know we have a limited set of choices for values of each column, this won't be hard. 

For example: 10th could be 0, 11th could be 1, and so on...for Education.

**This sort of encoding can be done automatically by `labelencoder` function**

In [31]:
# load sklearn 
from sklearn import preprocessing

In [34]:
# Encode all columns and their values to numeric data
for col in train.columns:
    if train[col].dtype == 'object':
        labelencoder = preprocessing.LabelEncoder()
        labelencoder.fit(list(train[col].values))
        train[col] = labelencoder.transform(list(train[col].values)) 

In [35]:
# check values again
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [36]:
# AWESOME \m/ 

**All values have been converted to numeric data. Even the target one**

In [37]:
train.target.value_counts()

0    24720
1     7841
Name: target, dtype: int64

### Time to Build Random Forest Model

In [38]:
#imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score



In [39]:
# Assign the classification variable to y
y = train['target']

# Remove target from training data to form the data on which model will be there
del train['target']
X = train

**Now we're gonna split data into training and testing data. We'll use  `train_test_split` function.** 

However, I am passing ***stratify=y*** here. Why?

This **stratify** parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.

For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.

In [42]:
# split into training and testing data - 0.3 for testing. That means 70% to train model, rest to test.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

**Let's train a Random Forest Classifier**

For the purpose of this example, I'm just configuring two parameters:
- `n_estimators` : no. of Trees
- `max_depth` : Maximum depth for any Tree

In [43]:
classifier = RandomForestClassifier(n_estimators= 500,
                                   max_depth = 6)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

** Model is done. Let's check accuracy**

In [44]:
prediction = classifier.predict(X_test)

In [45]:
prediction

array([0, 1, 0, ..., 0, 0, 0])

In [46]:
accuracy = accuracy_score(np.array(y_test),prediction)

In [47]:
print "The accuracy of Random Forest is "+str(accuracy)

The accuracy of Random Forest is 0.852287849319
