# ML Dataset exploration

It is a binary classification problem. We need to predict if the salary of a given person is less than or more than 50K.

In [16]:
import numpy as np
import pandas as pd
import sklearn

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


#### This dataset has 32561 rows (excluding the label names) and 15 columns. Out of 15 columns, 6 columns are integer classes whereas 9 columns are object (or character) classes. 

#### An alternative way is by calling the .shape function

In [10]:
print("The training data has", train.shape)
print("The testing data has", test.shape)

The training data has (32561, 15)
The testing data has (16281, 15)


In [11]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Checking for missing values

In [12]:
nans = train.shape[0] - train.dropna().shape[0]
print("%d rows have missing values in training data" %nans)

nand = test.shape[0] - test.dropna().shape[0]
print("%d rows have missing values in test data" %nand)

2399 rows have missing values in training data
1221 rows have missing values in test data


#### Which columns have missing values

In [29]:
train.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

#### Count number of unique values in classes

In [5]:
category = train.select_dtypes(include=['O']) #O for Object
category.apply(pd.Series.nunique)

workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

#### Impute missing values with their respective modes.

In [6]:
train.workclass.value_counts(sort=True)

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [12]:
train.workclass.fillna('Private', inplace=True)

In [7]:
train.occupation.value_counts(sort=True)

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [13]:
train.occupation.fillna('Prof-specialty', inplace=True)

In [9]:
train['native.country'].value_counts(sort=True)

 United-States                 29170
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 France                           29
 

In [14]:
train['native.country'].fillna('United-States', inplace=True)

In [15]:
train.isnull().sum() #check for missing values

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

#### Check the target variable to investigate if this data is imbalanced or not.

In [20]:
train.target.value_counts()/train.shape[0]

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

#### Create a cross tab of the target variable with education. With this, we'll try to understand the influence of education on the target variable.

In [37]:
pd.crosstab(train.education, train.target, margins=True)/train.shape[0]


target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,0.02675,0.001904,0.028654
11th,0.034243,0.001843,0.036086
12th,0.012285,0.001013,0.013298
1st-4th,0.004975,0.000184,0.00516
5th-6th,0.009736,0.000491,0.010227
7th-8th,0.018611,0.001228,0.01984
9th,0.014957,0.000829,0.015786
Assoc-acdm,0.024631,0.008139,0.032769
Assoc-voc,0.031357,0.011087,0.042443
Bachelors,0.09625,0.06821,0.164461


We see that out of 75% people with <=50K salary, 27% people are high school graduates, which is correct as people with lower levels of education are expected to earn less. On the other hand, out of 25% people with >=50K salary, 6% are bachelors and 5% are high-school grads. Now, this pattern seems to be a matter of concern. That's why we'll have to consider more variables before coming to a conclusion.

In [38]:
pd.crosstab(train['native.country'], train.target, margins=True)/train.shape[0]

target,<=50K,>50K,All
native.country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cambodia,0.000369,0.000215,0.000584
Canada,0.002518,0.001198,0.003716
China,0.001689,0.000614,0.002303
Columbia,0.001751,6.1e-05,0.001812
Cuba,0.00215,0.000768,0.002918
Dominican-Republic,0.002088,6.1e-05,0.00215
Ecuador,0.000737,0.000123,0.00086
El-Salvador,0.002979,0.000276,0.003255
England,0.001843,0.000921,0.002764
France,0.000522,0.000369,0.000891


In [39]:
pd.crosstab(train.sex, train.target, margins=True)/train.shape[0]

target,<=50K,>50K,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,0.294586,0.036209,0.330795
Male,0.464605,0.204601,0.669205
All,0.75919,0.24081,1.0


#### Label encoding for columns with unique values

In [41]:
from sklearn import preprocessing

for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[x].values))
        train[x] = lbl.transform(list(train[x].values))
    

In [42]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [44]:
train.target.value_counts()

0    24720
1     7841
Name: target, dtype: int64

#### Building Random Forest Model

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

In [48]:
y = train['target']
del train['target']

X = train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [49]:
clf = RandomForestClassifier(n_estimators=500, max_depth=6)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
prediction = clf.predict(X_test)

In [52]:
acc = accuracy_score(np.array(y_test), prediction)
print("The accuracy using Random Forest is {}".format(acc))

The accuracy using Random Forest is 0.8523902139420616
