In [1]:
!nvidia-smi

Tue Aug  2 14:25:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
#A Colab pro environment should have >20Gb of total memory.
from psutil import virtual_memory
colab_pro = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(colab_pro))

if colab_pro < 20:
  print('Not using a high-RAM runtime')
  # train model with lower settings
else:
  print('You are using a high-RAM runtime!')
  # train model with higher settings

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
######## import all necessity functions ##########
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, classification_report, silhouette_score

In [4]:
######## import the dataset #########
df = pd.read_csv('/content/adult.csv', na_values = ['?', '??', '???'])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
###### check the unique value of a target class #######
print(df.loc[:, 'income'].value_counts())
####### Convert the target class in a label encoding #########
df.loc[:, 'income'] = df.loc[:, 'income'].map({'<=50K': 0, '>50K': 1})
####### print the dataframe ########
df.head()

<=50K    37155
>50K     11687
Name: income, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,0


In [6]:
####### Check missing value present in the dataset or not ########
if df.isnull().sum().sum() > 0:
  print("Dataset contains NaN value")
else:
  print("Dataset does not contain NaN value")

Dataset contains NaN value


NaN Value Handle in this Dataset 


In [7]:
####### Check which column contains null value ########
for column_, value_ in zip(df.isnull().sum()[df.isnull().sum() > 0].index, df.isnull().sum()[df.isnull().sum() > 0].values):
  print(f"{column_:15} column NaN value total = {value_}")

workclass       column NaN value total = 2799
occupation      column NaN value total = 2809
native-country  column NaN value total = 857


In [8]:
###### Here, workclass and occupation column will be done by "random sample imputation" #######
def randomSampleImputation(feature_):
  random_value_ = df.loc[:, feature_].dropna().sample(df.loc[:, feature_].isnull().sum(), random_state = 42).values
  NaN_index_    = df.loc[df.loc[:, feature_].isnull(), :].index
  for index_, nan_index_ in enumerate(NaN_index_):
    df.loc[nan_index_, feature_] = random_value_[index_]
  print(feature_, " is done.\n")

###### call the function and do the imputation ######
for feature_ in ['workclass', 'occupation', 'native-country']:
  randomSampleImputation(feature_)

print("----------------------------------------------------\n")
###### Check NaN value exist or not ######
if df.isnull().sum().sum() > 0:
  print("NaN value exist in the dataset.")
else:
  print("There is no NaN value in the dataset.")

workclass  is done.

occupation  is done.

native-country  is done.

----------------------------------------------------

There is no NaN value in the dataset.


Handle Categorical Data in the Dataset


In [9]:
##### Do the One Hot Encoding in marital-status,marital-status, relationship, race, gender ######
df = pd.get_dummies(df, columns = ['marital-status', 'occupation', 'relationship', 'race', 'gender'], drop_first = True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,capital-gain,capital-loss,hours-per-week,native-country,income,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
0,25,Private,226802,11th,7,0,0,40,United-States,0,...,0,0,1,0,0,0,1,0,0,1
1,38,Private,89814,HS-grad,9,0,0,50,United-States,0,...,0,0,0,0,0,0,0,0,1,1
2,28,Local-gov,336951,Assoc-acdm,12,0,0,40,United-States,1,...,0,0,0,0,0,0,0,0,1,1
3,44,Private,160323,Some-college,10,7688,0,40,United-States,1,...,0,0,0,0,0,0,1,0,0,1
4,18,Private,103497,Some-college,10,0,0,30,United-States,0,...,0,0,1,0,0,0,0,0,1,0


In [10]:
###### Target Guided Label Encoding in workclass and education ######
def targetGuidedOrdinalEncoding(feature_):
  target_guided_index_ = df.groupby([feature_])['income'].mean().sort_values(ascending = True).index
  df.loc[:, feature_] = df.loc[:, feature_].map({value_: index_ for index_, value_ in enumerate(target_guided_index_)})
  print(feature_, " is done.\n")

###### Mean Encoding in workclass and education ######
def meanEncoding(feature_):
  df.loc[:, feature_] = df.loc[:, feature_].map(df.groupby(feature_)['income'].mean().sort_values(ascending = True).to_dict())
  print(feature_, " is done.\n")

###### Call the function and do this #######
for feature_ in ['workclass', 'education']:
  targetGuidedOrdinalEncoding(feature_)

###### Call the function and do this #######
for feature_ in ['native-country']:
  meanEncoding(feature_)

###### print the dataset ########
df.head()

workclass  is done.

education  is done.

native-country  is done.



Unnamed: 0,age,workclass,fnlwgt,education,educational-num,capital-gain,capital-loss,hours-per-week,native-country,income,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
0,25,2,226802,2,7,0,0,40,0.244351,0,...,0,0,1,0,0,0,1,0,0,1
1,38,2,89814,8,9,0,0,50,0.244351,0,...,0,0,0,0,0,0,0,0,1,1
2,28,5,336951,11,12,0,0,40,0.244351,1,...,0,0,0,0,0,0,0,0,1,1
3,44,2,160323,9,10,7688,0,40,0.244351,1,...,0,0,0,0,0,0,1,0,0,1
4,18,2,103497,9,10,0,0,30,0.244351,0,...,0,0,1,0,0,0,0,0,1,0


In [11]:
###### Check whether dataset is imbalanced or not #######
df.loc[:, 'income'].value_counts()
###### DataSet is imbalanced ######

0    37155
1    11687
Name: income, dtype: int64

To balanced the dataset we will use ADADYN

In [12]:
###### Split the dataset into X and y #######
y = df.loc[:, 'income']
###### Drop the income and fnlwgt ######
df.drop(columns = ['income','fnlwgt'], axis = 1, inplace = True)
X = df.iloc[:].values
###### Scaling the dataset using MinMax scaler #######
minMaxScaler_ = MinMaxScaler()
X = minMaxScaler_.fit_transform(X)
###### Balanced the dataset #######
ADASYN_ = ADASYN(random_state = 42,  n_jobs = -1)
X, y = ADASYN_.fit_resample(X, y)
print(X.shape, y.shape)

(74345, 37) (74345,)


In [13]:
####### Train and Test split the dataset #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [14]:
###### Import the dataset #######
# df = pd.read_csv('/content/adult_new.csv')
# df.head()

In [15]:
###### Split the dataset into X and y #######
# y = df.loc[:, 'income']
# ###### Drop the income and fnlwgt ######
# df.drop(columns = ['income'], axis = 1, inplace = True)
# X = df.iloc[:].values
# ###### Scaling the dataset using MinMax scaler #######
# minMaxScaler_ = MinMaxScaler()
# X = minMaxScaler_.fit_transform(X)
# ###### Balanced the dataset #######
# ADASYN_ = ADASYN(random_state = 42,  n_jobs = -1)
# X, y = ADASYN_.fit_resample(X, y)
# print(X.shape, y.shape)

In [16]:
###### Split the dataset ######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [17]:
##### Now using Stacking and Blending for prediction ######

##### Stacking and Blending #####

estimators_ = [
    ('Decision_tree', DecisionTreeClassifier(random_state = 42, criterion = 'entropy')),
    ('XgBoost', XGBClassifier()),
    ('KNN', KNeighborsClassifier(n_neighbors = 5)),
    ('Random_forest', RandomForestClassifier())
]

stacking_blending_ = StackingClassifier(estimators = estimators_, final_estimator = XGBClassifier(), cv = 10)
stacking_blending_.fit(X_train, y_train)

StackingClassifier(cv=10,
                   estimators=[('Decision_tree',
                                DecisionTreeClassifier(criterion='entropy',
                                                       random_state=42)),
                               ('XgBoost', XGBClassifier()),
                               ('KNN', KNeighborsClassifier()),
                               ('Random_forest', RandomForestClassifier())],
                   final_estimator=XGBClassifier())

In [18]:
predicted_ = stacking_blending_.predict(X_test)
print("accuracy is   = ", accuracy_score(predicted_, y_test))
print("recall is     = ", recall_score(predicted_, y_test))
print("precision is  = ", precision_score(predicted_, y_test))
print("f1 score is   = ", f1_score(predicted_, y_test))

accuracy is   =  0.8869709469153515
recall is     =  0.8762036956710332
precision is  =  0.9022690727175272
f1 score is   =  0.8890453765239206


Do the Same thing in Deep Learning

In [19]:
import tensorflow as tf
from tensorflow import keras
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, ReLU, LeakyReLU, PReLU, ELU, Dropout
from tensorflow.keras.initializers import GlorotNormal, GlorotUniform, HeNormal, HeUniform
from tensorflow.keras.regularizers import L1, L2, L1L2
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import SGD, Adadelta, Adagrad, RMSprop, Adam, Nadam, Adamax
from tensorflow.keras.activations import relu, sigmoid, tanh, softmax, sigmoid

In [20]:
X.shape

(74345, 37)

In [21]:
model = Sequential()
##### Declare Input Layer #####
model.add(Dense(units = 64, activation = 'relu', input_dim = X.shape[1], kernel_regularizer=L2()))
model.add(Dropout(0.5))
##### First Hidden Layer #####
model.add(Dense(units = 16, activation = 'relu', kernel_initializer = HeNormal(), kernel_regularizer = L2()))
model.add(Dropout(0.5))
##### Output Layer #####
model.add(Dense(units = 1,  activation = 'sigmoid', kernel_initializer = GlorotNormal()))

model.compile(loss = 'binary_crossentropy', optimizer = RMSprop(), metrics = ['accuracy'])

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                2432      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 16)                1040      
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 3,489
Trainable params: 3,489
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.fit(X_train, y_train, batch_size = 32, validation_data = (X_test, y_test), epochs = 10, verbose = 1, shuffle = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb56326f7d0>