In [1]:
import pandas as pd
import matplotlib as plt 

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Folder that holds the raw CSV; the exported splits are written here as well.
data_folder = 'data/'

# Read the raw records into a DataFrame.
dataset = pd.read_csv(data_folder + 'data.csv')

# I. Steps to Guide the Implementation
### - Reading the dataset
### - Visualizing the raw (uncleaned) data from the dataset
### - Identifying the label
### - Identifying the features

### - Cleaning the data
### - Building the pipeline that cleans the data from the dataset
### - Visualizing the cleaned data from the dataset

## Features/Labels
### - label: `income`, binary (encoded as 1/0)
### - 5 numerical features
### - 8 categorical features

In [3]:
# Preview the first 50 raw rows.
dataset.head(n=50)

Unnamed: 0,age,workSector,education,educationNum,statusMarriage,career,relationship,race,sex,gainedCapital,lostCapital,hoursPerWeek,country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


# II. Data Preprocessing

In [4]:
# Empty frame with the same column layout as the raw dataset; the
# preprocessing cells below fill it in one column at a time.
cleaned_dataset = pd.DataFrame(data=None, columns=dataset.columns)

In [5]:
# Column groups used by the preprocessing steps.
numerical_features = [
    'age',
    'educationNum',
    'gainedCapital',
    'lostCapital',
    'hoursPerWeek',
]
categorical_features = [
    'workSector',
    'education',
    'statusMarriage',
    'career',
    'relationship',
    'race',
    'sex',
    'country',
]
# Target column, kept as a one-element list so it can be concatenated
# with the feature lists.
label = ['income']
# Registry of fitted encoders, keyed by column name (starts empty).
label_encoders = dict()

In [6]:
# Populate cleaned_dataset from the raw frame, column by column.
#
# * Numerical features are copied over unchanged.
# * Categorical features and the label are integer-coded with LabelEncoder,
#   i.e. each distinct value is replaced by a single integer code. This is
#   label (ordinal) encoding, NOT one-hot encoding: no extra columns are created.
#
# Every fitted encoder is stored in label_encoders under its column name so the
# integer codes can be mapped back to the original values later.

# Pass 1: numerical columns are taken as-is.
for feature in dataset.columns:
    if feature in numerical_features:
        cleaned_dataset[feature] = dataset[feature]

# Pass 2: fit one encoder per categorical/label column and store the codes.
encoded_columns = categorical_features + label
for feature in dataset.columns:
    if feature in encoded_columns:
        # Cast to str once so every value in the column is encoded as text.
        raw_values = dataset[feature].astype(str)
        encoder = LabelEncoder()
        cleaned_dataset[feature] = encoder.fit_transform(raw_values)
        label_encoders[feature] = encoder

In [7]:
# Preview the first 50 rows of the encoded (cleaned) dataset.
cleaned_dataset.head(n=50)

Unnamed: 0,age,workSector,education,educationNum,statusMarriage,career,relationship,race,sex,gainedCapital,lostCapital,hoursPerWeek,country,income
0,39,5,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,9,13,2,9,5,2,0,0,0,40,4,0
5,37,2,12,14,2,3,5,4,0,0,0,40,38,0
6,49,2,6,5,3,7,1,2,0,0,0,16,22,0
7,52,4,11,9,2,3,0,4,1,0,0,45,38,1
8,31,2,12,14,4,9,1,4,0,14084,0,50,38,1
9,42,2,9,13,2,3,0,4,1,5178,0,40,38,1


In [8]:
# Drop the rows that an Isolation Forest flags as anomalies so that the model
# trained later is not fitted on them.
#
# NOTE(review): cleaned_dataset still contains the encoded `income` label at
# this point (it is only popped in the following cell), so the detector also
# sees the target column - confirm this is intended.
# NOTE(review): this cell overwrites cleaned_dataset, so re-running it without
# re-running the cells above filters the already-filtered frame again.

print('Removing Outliers')

clf = IsolationForest(max_samples=100, random_state=0)
clf.fit(cleaned_dataset)

# predict() labels each row 1 (inlier) or -1 (outlier).
y_noano = pd.DataFrame(clf.predict(cleaned_dataset), columns=['Top'])
is_inlier = (y_noano['Top'] == 1).to_numpy()

# Keep the inliers only and renumber the rows from 0.
cleaned_dataset = cleaned_dataset[is_inlier].reset_index(drop=True)

print("Number of Outliers: {}".format(int((y_noano['Top'] == -1).sum())))
print("Number of rows without outliers: {}".format(cleaned_dataset.shape[0]))

Removing Outliers
Number of Outliers: 8474
Number of rows without outliers: 36748


In [9]:
# Separate the target from the features.
# pop() removes the label column from cleaned_dataset in place, so X is the
# same frame object minus that column; re-running this cell on its own fails
# because the column is already gone.
label_column = label[0]
Y = cleaned_dataset.pop(label_column)
X = cleaned_dataset

In [10]:
# Creating Train, Val, Test Datasets
#
# Two-stage split: 30% of the rows are held out first, then that hold-out is
# itself split 70/30. Overall this gives roughly 70% train, 21% validation and
# 9% test.
# NOTE(review): confirm the 21% / 9% validation-to-test ratio is intended.
#
# random_state is pinned (same seed as the IsolationForest above) so that the
# saved splits are reproducible across kernel restarts.
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(
    X, Y, test_size=0.30, random_state=0
)

X_val, X_test, Y_val, Y_test = train_test_split(
    X_val_and_test, Y_val_and_test, test_size=0.30, random_state=0
)

In [11]:
# Persist the train/validation/test splits so that later steps can load the
# outlier-free data directly instead of repeating the preprocessing.

print('Saving Train/Val/Test Datasets')

# Output file name -> split to write, in the order the files are written.
splits_to_save = {
    'X_train.csv': X_train,
    'X_val.csv': X_val,
    'X_test.csv': X_test,
    'Y_train.csv': Y_train,
    'Y_val.csv': Y_val,
    'Y_test.csv': Y_test,
}

for filename, split in splits_to_save.items():
    # Row index is dropped; the header row (column names) is kept.
    split.to_csv(data_folder + filename, index=False, header=True)

Saving Train/Val/Test Datasets
