# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

# Load train and test dataset

In [2]:
training = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

# Remove unnecessary columns

In [3]:
training = training.drop("education", 1)
test = test.drop("education", 1)
training = training.drop("fnlwgt", 1)
test = test.drop("fnlwgt", 1)
training = training.drop("marital-status", 1)
test = test.drop("marital-status", 1)
training = training.drop("native-country", 1)
test = test.drop("native-country", 1)

# Replace all '?' by NaN and drop rows containing the latter

In [4]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

# Turning non-numerical values into numerical

In [5]:
ord_enc = OrdinalEncoder()
for i in ['workclass','occupation','relationship','race','sex','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [6]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']

## SVM

In [21]:
def svm_classifier(X_train):
    model = svm.SVC().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted']))

In [22]:
svm_classifier(X_train)

SVM -> 0.79614757
              precision    recall  f1-score   support

           0       0.80      0.97      0.88     11543
           1       0.74      0.27      0.39      3772

    accuracy                           0.80     15315
   macro avg       0.77      0.62      0.64     15315
weighted avg       0.79      0.80      0.76     15315



## KMeans

In [23]:
def kmeans_classifier(X_train):
    model = KMeans(n_clusters=1, random_state=0).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted'],zero_division=1))

In [24]:
kmeans_classifier(X_train)

SVM -> 0.75370552
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     11543
           1       1.00      0.00      0.00      3772

    accuracy                           0.75     15315
   macro avg       0.88      0.50      0.43     15315
weighted avg       0.81      0.75      0.65     15315



## KNN

In [11]:
def knn_classifier(X_train):
    model = KNeighborsClassifier().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted']))

In [12]:
knn_classifier(X_train)

              precision    recall  f1-score   support

           0       0.88      0.91      0.89     11543
           1       0.69      0.61      0.64      3772

    accuracy                           0.84     15315
   macro avg       0.78      0.76      0.77     15315
weighted avg       0.83      0.84      0.83     15315



## Naive Bayes

In [13]:
def naivebayes_classifier(X_train):
    model = GaussianNB().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted']))

In [14]:
naivebayes_classifier(X_train)

Naive Bayes -> 0.78994450
              precision    recall  f1-score   support

           0       0.81      0.95      0.87     11543
           1       0.66      0.31      0.42      3772

    accuracy                           0.79     15315
   macro avg       0.73      0.63      0.65     15315
weighted avg       0.77      0.79      0.76     15315



## AdaBoost

In [15]:
def adaboost_classifier(X_train):
    model = AdaBoostClassifier().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted']))

In [16]:
adaboost_classifier(X_train)

AdaBoost -> 0.85510937
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     11543
           1       0.77      0.59      0.67      3772

    accuracy                           0.86     15315
   macro avg       0.82      0.77      0.79     15315
weighted avg       0.85      0.86      0.85     15315



## Bagging

In [17]:
def bagging_classifier(X_train):
    model = BaggingClassifier(base_estimator=SVC(),n_estimators=2, random_state=0).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted']))

In [18]:
bagging_classifier(X_train)

Bagging SVC -> 0.79614757
              precision    recall  f1-score   support

           0       0.80      0.97      0.88     11543
           1       0.74      0.27      0.39      3772

    accuracy                           0.80     15315
   macro avg       0.77      0.62      0.64     15315
weighted avg       0.79      0.80      0.76     15315



## Stacking

In [19]:
def stacking_classifier(X_train):
    model = StackingClassifier([('ab',AdaBoostClassifier()),('nb', GaussianNB())], final_estimator=LogisticRegression()).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    print(classification_report(df['actual'], df['predicted']))

In [20]:
stacking_classifier(X_train)

Stacking -> 0.80652955
              precision    recall  f1-score   support

           0       0.81      0.97      0.88     11543
           1       0.77      0.31      0.44      3772

    accuracy                           0.81     15315
   macro avg       0.79      0.64      0.66     15315
weighted avg       0.80      0.81      0.77     15315

