# Data Preprocessing

## Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load Dataset

In [2]:
dataset = pd.read_csv('adult.csv')
# Drop Education-num column because it corresponds directly to Education Column
dataset = dataset.drop(columns='Education-num')
#X = dataset.iloc[:, :-1].values
#y = dataset.iloc[:, -1].values

In [3]:
print(dataset)

       Age          Workclass  Fnlwgt    Education       Marital-status  \
0       39          State-gov   77516    Bachelors        Never-married   
1       50   Self-emp-not-inc   83311    Bachelors   Married-civ-spouse   
2       38            Private  215646      HS-grad             Divorced   
3       53            Private  234721         11th   Married-civ-spouse   
4       28            Private  338409    Bachelors   Married-civ-spouse   
...    ...                ...     ...          ...                  ...   
32556   27            Private  257302   Assoc-acdm   Married-civ-spouse   
32557   40            Private  154374      HS-grad   Married-civ-spouse   
32558   58            Private  151910      HS-grad              Widowed   
32559   22            Private  201490      HS-grad        Never-married   
32560   52       Self-emp-inc  287927      HS-grad   Married-civ-spouse   

               Occupation    Relationship    Race      Sex  Capital-gain  \
0            Adm-cleric

In [4]:
dataset.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Prediction
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
dataset.shape

(32561, 14)

In [6]:
dataset.dtypes 

Age                int64
Workclass         object
Fnlwgt             int64
Education         object
Marital-status    object
Occupation        object
Relationship      object
Race              object
Sex               object
Capital-gain       int64
Capital-loss       int64
Hours-per-week     int64
Native-country    object
Prediction        object
dtype: object

In [7]:
#Workclass, Education, Marital-status, Occupation,Relationship, Race,Sex,Native-country, Prediction are categorical Data

## Handle Missing Values

In [8]:
dataset.isnull().sum()

Age               0
Workclass         0
Fnlwgt            0
Education         0
Marital-status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Prediction        0
dtype: int64

In [9]:
#Although there are no Null values, there is however '?' values and we need to process that
dataset[dataset ==' ?'] = np.nan

In [10]:
dataset.isnull().sum()

Age                  0
Workclass         1836
Fnlwgt               0
Education            0
Marital-status       0
Occupation        1843
Relationship         0
Race                 0
Sex                  0
Capital-gain         0
Capital-loss         0
Hours-per-week       0
Native-country     583
Prediction           0
dtype: int64

In [11]:
#As a result, there is null values populated in Workclass, Occupation and Native-country

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       30725 non-null  object
 2   Fnlwgt          32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Marital-status  32561 non-null  object
 5   Occupation      30718 non-null  object
 6   Relationship    32561 non-null  object
 7   Race            32561 non-null  object
 8   Sex             32561 non-null  object
 9   Capital-gain    32561 non-null  int64 
 10  Capital-loss    32561 non-null  int64 
 11  Hours-per-week  32561 non-null  int64 
 12  Native-country  31978 non-null  object
 13  Prediction      32561 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


In [13]:
#Replace Null values with Mode
for col in ['Workclass','Occupation', 'Native-country']:
    dataset[col].fillna(dataset[col].mode()[0], inplace =True)

In [14]:
#All values has been populated 
dataset.isnull().sum()

Age               0
Workclass         0
Fnlwgt            0
Education         0
Marital-status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Prediction        0
dtype: int64

## Feature Engineering

In [15]:
#Before
dataset.nunique()

Age                  73
Workclass             8
Fnlwgt            21648
Education            16
Marital-status        7
Occupation           14
Relationship          6
Race                  5
Sex                   2
Capital-gain        119
Capital-loss         92
Hours-per-week       94
Native-country       41
Prediction            2
dtype: int64

In [16]:
#Before
dataset['Workclass'].value_counts()

 Private             24532
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Workclass, dtype: int64

In [17]:
dataset['Workclass'] = dataset['Workclass'].replace([' Local-gov',' State-gov',' Federal-gov'], 'Government')
dataset['Workclass'] = dataset['Workclass'].replace([' Self-emp-not-inc',' Self-emp-inc'], 'Self-Employed')
dataset['Workclass'] = dataset['Workclass'].replace(' Private','Private')
dataset['Workclass'] = dataset['Workclass'].replace([' Without-pay', ' Never-worked'],'Unemployed')

In [18]:
#After
dataset['Workclass'].value_counts()

Private          24532
Government        4351
Self-Employed     3657
Unemployed          21
Name: Workclass, dtype: int64

In [19]:
#Before
dataset['Education'].value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: Education, dtype: int64

In [20]:
dataset['Education'] = dataset['Education'].replace([' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th',' 12th'], 'Grade-school')
dataset['Education'] = dataset['Education'].replace(' HS-grad','High-school')
dataset['Education'] = dataset['Education'].replace([' Assoc-voc',' Assoc-acdm',' Some-college'], 'Associate')
dataset['Education'] = dataset['Education'].replace(' Bachelors','Undergraduate')
dataset['Education'] = dataset['Education'].replace([' Masters',' Prof-school',' Doctorate'], 'Postgraduate')

In [21]:
#After
dataset['Education'].value_counts()

High-school      10501
Associate         9740
Undergraduate     5355
Grade-school      4253
Postgraduate      2712
Name: Education, dtype: int64

In [22]:
#Before
dataset['Marital-status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: Marital-status, dtype: int64

In [23]:
dataset['Marital-status'] = dataset['Marital-status'].replace([' Married-civ-spouse',' Married-AF-spouse'], 'Married')
dataset['Marital-status'] = dataset['Marital-status'].replace([' Divorced',' Separated',' Married-spouse-absent',' Widowed'], 'Separated')
dataset['Marital-status'] = dataset['Marital-status'].replace(' Never-married','Single')

In [24]:
#After
dataset['Marital-status'].value_counts()

Married      14999
Single       10683
Separated     6879
Name: Marital-status, dtype: int64

In [25]:
#After
dataset.nunique()

Age                  73
Workclass             4
Fnlwgt            21648
Education             5
Marital-status        3
Occupation           14
Relationship          6
Race                  5
Sex                   2
Capital-gain        119
Capital-loss         92
Hours-per-week       94
Native-country       41
Prediction            2
dtype: int64

## Label Encoding on Categorical Variables

In [26]:
categorical = dataset.select_dtypes(include = "object").columns
print (categorical)

Index(['Workclass', 'Education', 'Marital-status', 'Occupation',
       'Relationship', 'Race', 'Sex', 'Native-country', 'Prediction'],
      dtype='object')


In [27]:
from sklearn.preprocessing import LabelEncoder
for feature in categorical:
    le = LabelEncoder()
    dataset[feature] = le.fit_transform(dataset[feature])

In [28]:
dataset['Prediction'].value_counts()

0    24720
1     7841
Name: Prediction, dtype: int64

## Splitting the dataset into the Training set and Test set

In [29]:
#Splitting the dataset based on 70% Train and 30% Test, and set random state = 0 so that results for every run will be the same

In [30]:
from sklearn.model_selection import train_test_split
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## Feature Scaling

In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train[:,:] = scaler.fit_transform(X_train[:,:])
X_test[:,:] = scaler.transform(X_test[:,:])

## Dimensionality reduction

In [32]:
#Principal Component Analysis is employed to solve the effect of the Curse of Dimensionality, while restoring about 95% of the Explained Variance
#Feature Extraction
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Classification Models

## Support Vector Machine

### Building Model (Training)

In [33]:
%%time
from sklearn.svm import SVC
#Using RBF function
classifier = SVC(random_state = 0) 
classifier.fit(X_train, y_train)

Wall time: 30.6 s


SVC(random_state=0)

### Evaluate Model (Testing)

In [34]:
%%time
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy='{:.2%}'.format(accuracy_score(y_test, y_pred))
print('Accuracy rate: ' , accuracy)
f1score = '{:.2%}'.format(f1_score(y_test, y_pred, average='weighted'))
print('Weighted F-Measure: ' , f1score)

[[7151  256]
 [1613  749]]
Accuracy rate:  80.87%
Weighted F-Measure:  77.82%
Wall time: 18.2 s


### K-Fold Cross Validation (Testing)

In [35]:
%%time
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(estimator = classifier,X = X_train, y = y_train, cv = 10,scoring = "accuracy" )
f1score = cross_val_score(estimator = classifier,scoring = "f1_weighted" ,X = X_train, y = y_train, cv = 10)
print("Accuracy rate: {:.2%}".format(accuracy.mean()))
print("Weighted F-Measure: {:.2%}".format(f1score.mean()))

Accuracy rate: 80.94%
Weighted F-Measure: 77.86%
Wall time: 9min 37s


## Random Forest

### Building Model (Training)

In [36]:
%%time
from sklearn.ensemble import RandomForestClassifier
#Forest utilizes 100 decision trees to make a decision,uses entropy function to split 
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0) 
classifier.fit(X_train, y_train)

Wall time: 7.81 s


RandomForestClassifier(criterion='entropy', random_state=0)

### Evaluate Model (Testing)

In [37]:
%%time
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy='{:.2%}'.format(accuracy_score(y_test, y_pred))
print('Accuracy rate: ' , accuracy)
f1score = '{:.2%}'.format(f1_score(y_test, y_pred, average='weighted'))
print('Weighted F-Measure: ' , f1score)

[[6883  524]
 [1347 1015]]
Accuracy rate:  80.85%
Weighted F-Measure:  79.33%
Wall time: 388 ms


### K-Fold Cross Validation (Testing)

In [38]:
%%time
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(estimator = classifier,X = X_train, y = y_train, cv = 10,scoring = "accuracy" )
f1score = cross_val_score(estimator = classifier,scoring = "f1_weighted" ,X = X_train, y = y_train, cv = 10)
print("Accuracy rate: {:.2%}".format(accuracy.mean()))
print("Weighted F-Measure: {:.2%}".format(f1score.mean()))

Accuracy rate: 80.66%
Weighted F-Measure: 79.17%
Wall time: 2min 22s
