# **Setup**

## Import notebooks

In [1]:
%%capture
# Note the python import here
import reuse, sys

# This is the Ipython hook
sys.meta_path.append(reuse.NotebookFinder())
from dataset_balancing import X_train, y_train, X_test, y_test

## Import libraries

In [2]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.metrics import classification_report

In [3]:
X_train = X_train.drop(['fnlwgt', 'capital-gain'],1)
X_test = X_test.drop(['fnlwgt', 'capital-gain'],1)

results = []

# SVM

In [4]:
model = svm.SVC().fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'])
results.append(result)
print(f"SVM\n{result}")

SVM
              precision    recall  f1-score   support

           0       0.77      0.99      0.87     11360
           1       0.76      0.09      0.15      3700

    accuracy                           0.77     15060
   macro avg       0.77      0.54      0.51     15060
weighted avg       0.77      0.77      0.69     15060



# KMeans

In [5]:
model = KMeans(n_clusters=1, random_state=0).fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'], zero_division=1)
results.append(result)
print(f"KMeans\n{result}")

KMeans
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     11360
           1       1.00      0.00      0.00      3700

    accuracy                           0.75     15060
   macro avg       0.88      0.50      0.43     15060
weighted avg       0.81      0.75      0.65     15060



# KNN

In [6]:
model = KNeighborsClassifier().fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'])
results.append(result)
print(f"KNN\n{result}")

KNN
              precision    recall  f1-score   support

           0       0.87      0.90      0.88     11360
           1       0.65      0.58      0.61      3700

    accuracy                           0.82     15060
   macro avg       0.76      0.74      0.75     15060
weighted avg       0.81      0.82      0.82     15060



# Naive Bayes

In [7]:
model = GaussianNB().fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'])
results.append(result)
print(f"Naive Bayes\n{result}")

Naive Bayes
              precision    recall  f1-score   support

           0       0.94      0.67      0.78     11360
           1       0.46      0.87      0.60      3700

    accuracy                           0.72     15060
   macro avg       0.70      0.77      0.69     15060
weighted avg       0.82      0.72      0.74     15060



# AdaBoost

In [8]:
model = AdaBoostClassifier().fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'])
results.append(result)
print(f"Adaboost\n{result}")

Adaboost
              precision    recall  f1-score   support

           0       0.87      0.92      0.89     11360
           1       0.70      0.57      0.63      3700

    accuracy                           0.83     15060
   macro avg       0.78      0.75      0.76     15060
weighted avg       0.83      0.83      0.83     15060



# Bagging

In [9]:
model = BaggingClassifier(base_estimator=SVC(),n_estimators=2, random_state=0).fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'])
results.append(result)
print(f"Bagging\n{result}")

Bagging
              precision    recall  f1-score   support

           0       0.77      0.99      0.87     11360
           1       0.76      0.09      0.15      3700

    accuracy                           0.77     15060
   macro avg       0.77      0.54      0.51     15060
weighted avg       0.77      0.77      0.69     15060



# Stacking

In [10]:
model = StackingClassifier([('ab',AdaBoostClassifier()),('nb', GaussianNB())], final_estimator=LogisticRegression()).fit(X_train,y_train)
predictions = model.predict(X_test)
df = pd.DataFrame({'actual':y_test,'predicted':predictions})
result = classification_report(df['actual'],df['predicted'])
results.append(result)
print(f"Stacking\n{result}") 

Stacking
              precision    recall  f1-score   support

           0       0.84      0.96      0.89     11360
           1       0.77      0.44      0.56      3700

    accuracy                           0.83     15060
   macro avg       0.80      0.70      0.72     15060
weighted avg       0.82      0.83      0.81     15060



In [11]:
for res in results:
    print(res.split("\n")[5].strip().split(" ")[27], end=",")

0.77,0.75,0.82,0.72,0.83,0.77,0.83,