## Simple Ensemble Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

def income_model_data_prep(data):
    data = pd.get_dummies(
        income.assign(
                target = np.where(data["SalStat"]==" less than or equal to 50,000", 0,1),
                nativecountry = data["nativecountry"].str.replace(" Holand-Netherlands",
                                                         " Germany"),
                JobType = data["JobType"].replace({" Never-worked":" Without-pay"}),
                occupation = data["occupation"].str.replace(" Armed-Forces"," ?")
                ).drop("SalStat",axis=1),
        drop_first=True
    )
    X = data.drop(columns=["target"],axis=1)
    y = data["target"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    return X_train, X_test, y_train, y_test

In [3]:
## Split train and test datasets

income = pd.read_csv("../Data/income.csv")

X_train, X_test, y_train, y_test = income_model_data_prep(income)

In [5]:
## Scale the features (needed by KNN and Logistic models)
from sklearn.preprocessing import StandardScaler

sd = StandardScaler()
X_train_std = sd.fit_transform(X_train)
X_test_std = sd.transform(X_test)

In [7]:
## fit the dataset with a KNN model
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_train_std, y_train)

print('KNN Accuracy:', knn.score(X_test_std, y_test))


KNN Accuracy: 0.8338023764853033


In [8]:
## fit the dataset with Logistic Regression model
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=0.6, penalty='l2',solver='saga',max_iter=1000)
lr.fit(X_train_std, y_train)

print('Logistic Accuracy:', lr.score(X_test_std, y_test))

Logistic Accuracy: 0.8558474046278924


In [9]:
## fit the dataset with a Decision tree model
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=25)
dt.fit(X_train_std, y_train)

print('Decision Tree Accuracy:', dt.score(X_test_std, y_test))

Decision Tree Accuracy: 0.8595997498436523


In [15]:
## fit the dataset with Ensemble model
from sklearn.ensemble import VotingClassifier

hard_ensemble = VotingClassifier(
            estimators=[
                ('knn',knn),
                ('lr', lr),
                ('dt', dt)
            ],
    voting="hard"
    )

he = hard_ensemble.fit(X_train_std, y_train)

print('Hard Ensemble Accuracy:', he.score(X_test_std, y_test))

Hard Ensemble Accuracy: 0.8625703564727955


In [16]:
soft_ensemble = VotingClassifier(
            estimators=[
                ('knn',knn),
                ('lr', lr),
                ('dt', dt)
            ],
    voting="soft"
    )

se = hard_ensemble.fit(X_train_std, y_train)

print('Soft Voting Ensemble Accuracy:', se.score(X_test_std, y_test))

Soft Voting Ensemble Accuracy: 0.8625703564727955
