In [1]:
# import sys; sys.path.insert(0, '..')
# import lib.functions
#del sys.modules['functions']

import pandas as pd
import numpy as np
from numpy.testing import assert_equal
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

from sklearn.utils import check_X_y
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,StackingClassifier,\
                             VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, \
                            accuracy_score, f1_score, log_loss,\
                            precision_recall_curve, roc_auc_score, auc

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV, LassoCV


In [2]:
salaries = pd.read_csv('../../DataScience/data/salaries_final.csv.gz')

In [3]:
salaries

Unnamed: 0,Age,Education,Occupation,Relationship,Race,Sex,Target
0,39,Bachelors,Adm-clerical,Not-in-family,White,Male,<=50K
1,50,Bachelors,Exec-managerial,Husband,White,Male,<=50K
2,38,HS-grad,Handlers-cleaners,Not-in-family,White,Male,<=50K
3,53,11th,Handlers-cleaners,Husband,Black,Male,<=50K
4,28,Bachelors,Prof-specialty,Wife,Black,Female,<=50K
...,...,...,...,...,...,...,...
32556,27,Assoc-acdm,Tech-support,Wife,White,Female,<=50K
32557,40,HS-grad,Machine-op-inspct,Husband,White,Male,>50K
32558,58,HS-grad,Adm-clerical,Unmarried,White,Female,<=50K
32559,22,HS-grad,Adm-clerical,Own-child,White,Male,<=50K


In [4]:
salaries.Target.unique()

array(['<=50K', '>50K'], dtype=object)

In [5]:
salaries_cat = salaries[['Education', 'Occupation', 'Relationship','Race', 'Sex']]

In [6]:
ohe = OneHotEncoder(handle_unknown='ignore')

encoded = ohe.fit_transform(salaries_cat).toarray()

In [7]:
dummy = pd.DataFrame(encoded, columns=ohe.get_feature_names(salaries_cat.columns))
df = pd.concat([salaries['Age'], dummy, salaries['Target']], axis=1)
del dummy, salaries_cat, salaries

In [8]:
le = LabelEncoder()

target_encoded = le.fit_transform(df.Target)
le.classes_

array(['<=50K', '>50K'], dtype=object)

In [9]:
df.head()

Unnamed: 0,Age,Education_10th,Education_11th,Education_12th,Education_1st-4th,Education_5th-6th,Education_7th-8th,Education_9th,Education_Assoc-acdm,Education_Assoc-voc,...,Relationship_Unmarried,Relationship_Wife,Race_Amer-Indian-Eskimo,Race_Asian-Pac-Islander,Race_Black,Race_Other,Race_White,Sex_Female,Sex_Male,Target
0,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,<=50K
1,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,<=50K
2,38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,<=50K
3,53,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,<=50K
4,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,<=50K


In [10]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,:-1], 
                                                    target_encoded, #df.iloc[:,-1]
                                                    test_size=.3,
                                                    stratify=target_encoded,
                                                    random_state=3103)

In [11]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((22792, 45), (9769, 45), (22792,), (9769,))

In [12]:
check_X_y(x_train, y_train)

(array([[31.,  1.,  0., ...,  1.,  0.,  1.],
        [45.,  0.,  0., ...,  0.,  0.,  1.],
        [19.,  0.,  0., ...,  0.,  0.,  1.],
        ...,
        [50.,  0.,  0., ...,  1.,  1.,  0.],
        [50.,  0.,  0., ...,  1.,  0.,  1.],
        [41.,  0.,  0., ...,  1.,  0.,  1.]]),
 array([0, 1, 0, ..., 0, 1, 0]))

## Voting Classifier using Sklearn

In [13]:
classifiers = list()

In [14]:
classifiers.append(('DTC', DecisionTreeClassifier()))
# classifiers.append(('svm', SVC(probability=True)))
classifiers.append(('GNB', GaussianNB()))
classifiers.append(('KNN', KNeighborsClassifier()))

In [15]:
vote = VotingClassifier(estimators=classifiers, voting='hard')

In [16]:
vote.fit(x_train, y_train)

VotingClassifier(estimators=[('DTC', DecisionTreeClassifier()),
                             ('GNB', GaussianNB()),
                             ('KNN', KNeighborsClassifier())])

In [17]:
y_pred_voting = vote.predict(x_test)

In [18]:
print(f"{'Accuracy is'} {accuracy_score(y_test, y_pred_voting)}")
print(f"{'Log Loss is'} {log_loss(y_test, y_pred_voting)}")
print(f"{'F1 Score is'} {log_loss(y_test, y_pred_voting)}")

Accuracy is 0.8034599242501791
Log Loss is 6.788335333899725
F1 Score is 6.788335333899725


In [19]:
vote.score(x_test, y_pred_voting)

1.0

## Boosting Algorithms

## AdaBoost (Adaptive Boosting)

In [20]:
ada = AdaBoostClassifier(n_estimators=45, random_state=2021)
ada.fit(x_train, y_train)

y_pred_ada = ada.predict(x_test)

print(accuracy_score(y_test, y_pred_ada))

0.8316101955164296


## Gradient Boosting Classifier

In [21]:
learning_rate = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
for rate in learning_rate:
    gbc1 = GradientBoostingClassifier(learning_rate=rate, n_estimators=45, max_features=2, random_state=2021)
    gbc1.fit(x_train, y_train)
    
    print("Learning rate: ", rate)
    print("Accuracy score (training): {0:.3f}".format(gbc1.score(x_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gbc1.score(x_test, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.761
Accuracy score (validation): 0.760
Learning rate:  0.075
Accuracy score (training): 0.787
Accuracy score (validation): 0.787
Learning rate:  0.1
Accuracy score (training): 0.807
Accuracy score (validation): 0.805
Learning rate:  0.25
Accuracy score (training): 0.831
Accuracy score (validation): 0.828
Learning rate:  0.5
Accuracy score (training): 0.836
Accuracy score (validation): 0.829
Learning rate:  0.75
Accuracy score (training): 0.838
Accuracy score (validation): 0.830
Learning rate:  1
Accuracy score (training): 0.838
Accuracy score (validation): 0.828


In [22]:
gbc2 = GradientBoostingClassifier(learning_rate=0.75, n_estimators=45, max_features=2, random_state=2021)

gbc2.fit(x_train, y_train)
y_pred_gbc2 = gbc2.predict(x_test)

print(classification_report(y_test,y_pred_gbc2))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89      7417
           1       0.68      0.56      0.61      2352

    accuracy                           0.83      9769
   macro avg       0.77      0.74      0.75      9769
weighted avg       0.82      0.83      0.82      9769



## XGBoost

## Stacking Classifier

In [32]:
level0 = list()
# level0.append(('lr', LogisticRegression()))
# level0.append(('knn', KNeighborsClassifier()))
# level0.append(('cart', DecisionTreeClassifier()))
level0.append(('svm', SVC()))
level0.append(('bayes', GaussianNB()))

SyntaxError: positional argument follows keyword argument (<ipython-input-32-735afced2491>, line 5)

In [24]:
level1 = LogisticRegression()

In [25]:
sc_model = StackingClassifier(estimators=level0, final_estimator=level1, stack_method='auto', cv=10, n_jobs=3)

In [26]:
sc_model.fit(x_train, y_train)

CPU times: user 148 ms, sys: 117 ms, total: 265 ms
Wall time: 1min 41s


StackingClassifier(cv=10, estimators=[('svm', SVC()), ('bayes', GaussianNB())],
                   final_estimator=LogisticRegression(), n_jobs=3)

In [27]:
y_pred_sc_model = sc_model.predict(x_test)

In [28]:
sc_model.score(x_test, y_pred_sc_model)

1.0

In [29]:
print(classification_report(y_test, y_pred_sc_model))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      7417
           1       0.68      0.53      0.59      2352

    accuracy                           0.83      9769
   macro avg       0.77      0.73      0.74      9769
weighted avg       0.82      0.83      0.82      9769



## Bagging with default parameters

In [58]:
bagging1 = BaggingClassifier(base_estimator=KNeighborsClassifier(), random_state=3103)

In [None]:
bagging1.fit(x_train, y_train)

In [None]:
bagging1_y_pred = bagging1.predict(x_train)

In [59]:
print(accuracy_score(y_train, bagging1_y_pred))

0.8644261144261144


In [84]:
bagging2 = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                            n_estimators=20,
                            max_samples=.67,
                            max_features=.67,
                            bootstrap=True,
                            oob_score=True,
                            n_jobs=4,
                            random_state=3104)

In [85]:
bagging2.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=0.67,
                  max_samples=0.67, n_estimators=20, n_jobs=4, oob_score=True,
                  random_state=3104)

In [86]:
bagging2_y_pred = bagging2.predict(x_train)

In [87]:
print(accuracy_score(y_train, bagging2_y_pred))

0.8858810108810109
