# **Setup**

## Import notebooks

In [1]:
%%capture
# %%capture silences this cell's stdout/stderr.
# NOTE(review): presumably used because importing the notebook below runs its
# cells and would otherwise print their output here -- confirm.
# `reuse` is not a standard-library module; it is expected to be importable
# from the working directory -- TODO confirm where it lives.
import reuse, sys

# Register the finder on sys.meta_path *before* the import below runs.
# NOTE(review): NotebookFinder presumably lets `import` resolve .ipynb files as
# modules -- confirm against the `reuse` module.
sys.meta_path.append(reuse.NotebookFinder())
# Train/test features and labels come from `dataset_balancing`
# (presumably the notebook that balances the dataset -- verify).
from dataset_balancing import X_train, y_train, X_test, y_test

## Import libraries

In [2]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.metrics import classification_report

In [3]:
# Columns removed from both feature matrices before any model is fitted.
# Kept in a single list so the train and test splits cannot drift apart.
DROPPED_COLUMNS = ['fnlwgt', 'sex', 'workclass_selfempl']

# Use the explicit `columns=` keyword: the positional axis form
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and is rejected by pandas 2.0.
# Note: this cell rebinds X_train / X_test, so running it a second time without
# restarting the kernel raises KeyError (the columns are already gone).
X_train = X_train.drop(columns=DROPPED_COLUMNS)
X_test = X_test.drop(columns=DROPPED_COLUMNS)

# **Test models**

## SVM

In [4]:
# Support-vector classifier with scikit-learn's default hyper-parameters.
model = SVC()
model.fit(X_train, y_train)

# Predict the test split and line the predictions up against the true labels.
predictions = model.predict(X_test)
df = pd.DataFrame({'actual': y_test, 'predicted': predictions})

# Per-class precision / recall / F1 report.
result = classification_report(y_true=df['actual'], y_pred=df['predicted'])
print(f"SVM\n{result}")

## KMeans

In [5]:
# KMeans is unsupervised: it ignores y and its cluster ids are arbitrary
# integers, not class labels. With a single cluster every sample was "predicted"
# as 0, so the report only measured the class imbalance. Use one cluster per
# class and translate cluster ids into class labels before scoring.
# Assumes y_train is one-dimensional (Series or 1-D array) -- TODO confirm.
train_labels = pd.Series(y_train).reset_index(drop=True)

# n_init is pinned because its default changed across scikit-learn releases;
# random_state keeps the centroid initialisation reproducible.
model = KMeans(n_clusters=train_labels.nunique(), n_init=10, random_state=0).fit(X_train)

# Map each cluster id to the most frequent training label among its members.
cluster_to_label = train_labels.groupby(pd.Series(model.labels_)).agg(
    lambda labels: labels.mode().iloc[0]
)

# .to_numpy() drops the fresh RangeIndex so the DataFrame below does not try to
# align the predictions with y_test's own index.
predictions = pd.Series(model.predict(X_test)).map(cluster_to_label).to_numpy()
df = pd.DataFrame({'actual':y_test,'predicted':predictions})

# zero_division=0: a class that is never predicted scores 0 instead of an
# inflated 1.0, while still suppressing the undefined-metric warning.
result = classification_report(df['actual'],df['predicted'], zero_division=0)
print(f"KMeans\n{result}")

## KNN

In [6]:
# k-nearest-neighbours classifier voting over the 13 closest training samples.
model = KNeighborsClassifier(n_neighbors=13)
model.fit(X_train, y_train)

# Predict the test split and pair each prediction with its true label.
predictions = model.predict(X_test)
df = pd.DataFrame(dict(actual=y_test, predicted=predictions))

# Per-class precision / recall / F1 report.
result = classification_report(df.actual, df.predicted)
print(f"KNN\n{result}")

## Naive Bayes

In [7]:
# Gaussian Naive Bayes with default settings.
model = GaussianNB()
model.fit(X_train, y_train)

# Predict the test split and keep truth and prediction in one frame.
predictions = model.predict(X_test)
df = pd.DataFrame(data={'actual': y_test, 'predicted': predictions})

# Per-class precision / recall / F1 report.
result = classification_report(y_true=df['actual'], y_pred=df['predicted'])
print(f"Naive Bayes\n{result}")

## AdaBoost

In [8]:
# AdaBoost over 70 boosting rounds of the default base learner.
# random_state is fixed (as in the KMeans and Bagging cells) because the seed is
# forwarded to each base estimator; without it, repeated runs can produce
# slightly different reports.
model = AdaBoostClassifier(n_estimators=70, random_state=0).fit(X_train,y_train)

# Predict the test split and pair each prediction with its true label.
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})

# Per-class precision / recall / F1 report.
result = classification_report(df['actual'],df['predicted'])
print(f"Adaboost\n{result}")

Adaboost
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     11360
           1       0.78      0.58      0.66      3700

    accuracy                           0.86     15060
   macro avg       0.83      0.76      0.79     15060
weighted avg       0.85      0.86      0.85     15060



## Bagging

In [9]:
# Bagging ensemble of two default SVCs, seeded for reproducible bootstraps.
# The estimator is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every version.
model = BaggingClassifier(SVC(), n_estimators=2, random_state=0).fit(X_train,y_train)

# Predict the test split and pair each prediction with its true label.
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})

# Per-class precision / recall / F1 report.
result = classification_report(df['actual'],df['predicted'])
print(f"Bagging\n{result}")

## Stacking

In [10]:
# Stacked ensemble: AdaBoost and Gaussian Naive Bayes as level-0 learners, with
# a logistic regression combining their outputs.
# AdaBoost is seeded so the stacked result is reproducible across runs; the
# other two estimators are left at their defaults.
model = StackingClassifier(
    estimators=[
        ('ab', AdaBoostClassifier(random_state=0)),
        ('nb', GaussianNB()),
    ],
    final_estimator=LogisticRegression(),
).fit(X_train,y_train)

# Predict the test split and pair each prediction with its true label.
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})

# Per-class precision / recall / F1 report.
result = classification_report(df['actual'],df['predicted'])
print(f"Stacking\n{result}")