### Dataset - UCI Machine learning 

In [4]:
#importing libraries

import pandas as pd
import numpy as np

In [104]:
#reading datasets file using pandas

train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [105]:
print(train_data.shape)
print(test_data.shape)

(32561, 15)
(16281, 15)


In [106]:
# checking trainig head to see how our data looks like
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [107]:
#info is used to check the dtype w.r.t.columns
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  target          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [108]:
#counting nan values in rows - using dropna function
train_data_nan_rows =  len(train_data) - len(train_data.dropna())
print(train_data_nan_rows)
test_data_nan_rows =  len(test_data) - len(test_data.dropna())
print(test_data_nan_rows)

2399
1221


In [109]:
#checking number of NA values in each columns
train_data.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

### Need to impute the nan values. Techniques which can be use for categorical variables are :
    1. Replace it with maximum occuring enteries.

In [110]:
print(train_data.workclass.value_counts())
print(train_data.occupation.value_counts())
print(train_data['native.country'].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64
 United-States                 29170
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England    

In [111]:
#using fillna to fill Na values for a column.

#Education 
train_data.workclass.value_counts(sort=True)
train_data.workclass.fillna('Private',inplace=True)


#Occupation
train_data.occupation.value_counts(sort=True)
train_data.occupation.fillna('Prof-specialty',inplace=True)


#Native Country
train_data['native.country'].value_counts(sort=True)
train_data['native.country'].fillna('United-States',inplace=True)

In [112]:
#checking nan values one more time just to be sure
train_data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

### Checking for Imbalanced data

In [113]:
train_data.target.value_counts()/train_data.shape[0]

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

In [114]:
train_data.iloc[:,-1]

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: target, Length: 32561, dtype: object

In [115]:
## How education affects the salary using cross tab
pd.crosstab(train_data.education, train_data.target, margins=True)/train_data.shape[0]

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,0.02675,0.001904,0.028654
11th,0.034243,0.001843,0.036086
12th,0.012285,0.001013,0.013298
1st-4th,0.004975,0.000184,0.00516
5th-6th,0.009736,0.000491,0.010227
7th-8th,0.018611,0.001228,0.01984
9th,0.014957,0.000829,0.015786
Assoc-acdm,0.024631,0.008139,0.032769
Assoc-voc,0.031357,0.011087,0.042443
Bachelors,0.09625,0.06821,0.164461


##### Above table that out of 75% people with salary less than 50k, 27% are HS grad as expected

### To apply any machine learning algo, we first need to encode the categorical variable to numeric numbers. Using label encoder in this case.

In [116]:
from sklearn import preprocessing

for x in train_data.columns:
    if train_data[x].dtype == 'object':
        l_encoder = preprocessing.LabelEncoder()
        l_encoder.fit(train_data[x].values)
        train_data[x] = l_encoder.transform(list(train_data[x].values))

In [117]:
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [118]:
train_data.iloc[:,-1].value_counts()/len(train_data)

0    0.75919
1    0.24081
Name: target, dtype: float64

### Checking importance of each feature w.r.t to target variable

In [119]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

best_feat = SelectKBest(score_func=chi2, k=14)
fit = best_feat.fit(train_data.iloc[:,:-1], train_data.iloc[:,-1])
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(train_data.iloc[:,:-1].columns)
feature_score = pd.concat([dfcolumns, dfscores], axis=1)
feature_score.columns = ['Features', 'Score']
feature_score.sort_values(by=['Score'])

Unnamed: 0,Features,Score
6,occupation,11.08499
13,native.country,17.71112
8,race,33.03131
1,workclass,73.34183
3,education,297.9423
9,sex,502.4394
5,marital.status,1123.47
4,education.num,2401.422
7,relationship,3659.143
12,hours.per.week,6476.409


### Above table shows that all feature are important to predict the target value, Lets create our RandomForrest classifier based on all features.

In [120]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

y = train_data['target']
del train_data['target']

x = train_data

In [121]:
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size=0.3, random_state=1, stratify=y)
clf = RandomForestClassifier(n_estimators=500, max_depth=6)
clf.fit(X_train, Y_train)

RandomForestClassifier(max_depth=6, n_estimators=500)

In [126]:
import numpy as np
prediction = clf.predict(X_test)
acc = accuracy_score(np.array(Y_test), prediction)
print("Accuracy is - {}".format(acc))

Accuracy is - 0.8522878493192753


#### Above accuracy can be improved by different values in RandomForrestClassifier like depht, features to select