# Import libraries

In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn import preprocessing

# Load train and test dataset

In [2]:
training = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

# Remove unnecessary columns

In [3]:
training = training.drop("education", 1)
test = test.drop("education", 1)

# Replace all '?' by NaN and drop rows containing the latter

In [4]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

# Turning non-numerical values into numerical

In [5]:
ord_enc = OrdinalEncoder()
for i in ['workclass','marital-status','occupation','relationship','race','sex','native-country','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [6]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']

# SelectKBest

In [7]:
selector = SelectKBest(f_classif, k=13)
selector.fit(X_train,y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])
X_train = X_train[cols_names]
X_test = X_test[cols_names] 

# SVM

In [36]:
def svm_classifier(X_train):
    model = svm.SVC().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("SVM -> %0.8f" % result)

# KMeans

In [11]:
def kmeans_classifier(X_train):
    model = KMeans(n_clusters=1, random_state=0).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("KMeans -> %0.8f" % result)

# KNN

In [12]:
def knn_classifier(X_train):
    model = KNeighborsClassifier().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("KNN -> %0.8f" % result)

# Naive Bayes

In [13]:
def naivebayes_classifier(X_train):
    model = GaussianNB().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("Naive Bayes -> %0.8f" % result)

# AdaBoost

In [17]:
def adaboost_classifier(X_train):
    model = AdaBoostClassifier().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("AdaBoost -> %0.8f" % result)  

# Bagging

In [42]:
def bagging_classifier(X_train):
    model = BaggingClassifier(base_estimator=SVC(),n_estimators=2, random_state=0).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("Bagging SVC -> %0.8f" % result)  

# Stacking

In [54]:
def stacking_classifier(X_train):
    model = StackingClassifier([('ab',AdaBoostClassifier()),('nb', GaussianNB())], final_estimator=LogisticRegression()).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("Stacking -> %0.8f" % result)  

# Normalization

In [67]:
normalizer = preprocessing.Normalizer(norm='l1') 
values_normalized = normalizer.transform(X_train.values)
data_normalized = pd.DataFrame(values_normalized, columns=X_train.columns)
values_normalized = normalizer.transform(X_test.values)
X_test = pd.DataFrame(values_normalized, columns=X_test.columns)
print("---------Normalization l1---------")
svm_classifier(data_normalized)
kmeans_classifier(data_normalized)
knn_classifier(data_normalized)
naivebayes_classifier(data_normalized)
adaboost_classifier(data_normalized)
bagging_classifier(data_normalized)
stacking_classifier(data_normalized)

---------Normalization l1---------
KMeans -> 0.75431607
KNN -> 0.74276228
Naive Bayes -> 0.78984064
AdaBoost -> 0.80670651
SVM -> 0.76102258
Bagging SVC -> 0.78512616
Stacking -> 0.79077025


In [66]:
normalizer = preprocessing.Normalizer(norm='l2') 
values_normalized = normalizer.transform(X_train.values)
data_normalized = pd.DataFrame(values_normalized, columns=X_train.columns)
values_normalized = normalizer.transform(X_test.values)
X_test = pd.DataFrame(values_normalized, columns=X_test.columns)
print("---------Normalization l2---------")
svm_classifier(data_normalized)
kmeans_classifier(data_normalized)
knn_classifier(data_normalized)
naivebayes_classifier(data_normalized)
adaboost_classifier(data_normalized)
bagging_classifier(data_normalized)
stacking_classifier(data_normalized)

---------Normalization---------
KMeans -> 0.75431607
KNN -> 0.74203187
Naive Bayes -> 0.79169987
AdaBoost -> 0.80239044
SVM -> 0.76102258
Bagging SVC -> 0.78187251
Stacking -> 0.79203187


# Standardization

In [56]:
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
values_standardized = scaler.fit_transform(X_train)
X_train = pd.DataFrame(values_standardized, columns=X_train.columns);
values_standardized = scaler.fit_transform(X_test)
X_test = pd.DataFrame(values_standardized, columns=X_test.columns);
print("--------Standardization--------")
svm_classifier(X_train)
kmeans_classifier(X_train)
knn_classifier(X_train)
naivebayes_classifier(X_train)
adaboost_classifier(X_train)
bagging_classifier(X_train)
stacking_classifier(X_train)

--------Standardization--------
KMeans -> 0.75431607
KNN -> 0.76009296
Naive Bayes -> 0.79223108
AdaBoost -> 0.78645418
SVM -> 0.76102258
Bagging SVC -> 0.77470120
Stacking -> 0.78877822
