In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.metrics import classification_report

In [2]:
training = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

In [3]:
test.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-classification
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [4]:
training = training.drop("education", 1)
test = test.drop("education", 1)
training = training.drop("fnlwgt", 1)
test = test.drop("fnlwgt", 1)
training = training.drop("marital-status", 1)
test = test.drop("marital-status", 1)
training = training.drop("native-country", 1)
test = test.drop("native-country", 1)

In [5]:
for data in training:
    training[data].replace(' ?', np.nan, inplace=True)
    
for data in test:
    test[data].replace(' ?', np.nan, inplace=True)

training.dropna(inplace=True)
test.dropna(inplace=True)

In [6]:
ord_enc = OrdinalEncoder()
for i in ['workclass','occupation','relationship','race','sex','salary-classification']:
    training[i] = ord_enc.fit_transform(training[[i]]).astype(int)
    test[i] = ord_enc.fit_transform(test[[i]]).astype(int)

In [7]:
X_train = training.drop('salary-classification',1)
y_train = training['salary-classification']
X_test = test.drop('salary-classification',1) 
y_test = test['salary-classification']

## SelectKBest

In [8]:
selector = SelectKBest(f_classif, k=10)
selector.fit(X_train,y_train)
cols = selector.get_support(indices=True)
cols_names = list(X_train.columns[cols])
X_train = X_train[cols_names]
X_test = X_test[cols_names] 

## SVM

In [9]:
def svm_classifier(X_train):
    model = svm.SVC().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("SVM -> %0.8f" % result)
    print(classification_report(df['actual'], df['predicted']))

In [10]:
svm_classifier(X_train)

SVM -> 0.79614757
              precision    recall  f1-score   support

           0       0.80      0.97      0.88     11543
           1       0.74      0.27      0.39      3772

    accuracy                           0.80     15315
   macro avg       0.77      0.62      0.64     15315
weighted avg       0.79      0.80      0.76     15315



## KMeans

In [11]:
def kmeans_classifier(X_train):
    model = KMeans(n_clusters=1, random_state=0).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("KMeans -> %0.8f" % result)
    print(classification_report(df['actual'], df['predicted'],zero_division=1))

In [12]:
kmeans_classifier(X_train)

KMeans -> 0.75370552
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     11543
           1       1.00      0.00      0.00      3772

    accuracy                           0.75     15315
   macro avg       0.88      0.50      0.43     15315
weighted avg       0.81      0.75      0.65     15315



## KNN

In [13]:
def knn_classifier(X_train):
    model = KNeighborsClassifier().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("KNN -> %0.8f" % result)
    print(classification_report(df['actual'], df['predicted']))

In [14]:
knn_classifier(X_train)

KNN -> 0.83506366
              precision    recall  f1-score   support

           0       0.88      0.91      0.89     11543
           1       0.69      0.61      0.64      3772

    accuracy                           0.84     15315
   macro avg       0.78      0.76      0.77     15315
weighted avg       0.83      0.84      0.83     15315



## Naive Bayes

In [15]:
def naivebayes_classifier(X_train):
    model = GaussianNB().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("Naive Bayes -> %0.8f" % result)
    print(classification_report(df['actual'], df['predicted']))

In [16]:
naivebayes_classifier(X_train)

Naive Bayes -> 0.78994450
              precision    recall  f1-score   support

           0       0.81      0.95      0.87     11543
           1       0.66      0.31      0.42      3772

    accuracy                           0.79     15315
   macro avg       0.73      0.63      0.65     15315
weighted avg       0.77      0.79      0.76     15315



## AdaBoost

In [17]:
def adaboost_classifier(X_train):
    model = AdaBoostClassifier().fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("AdaBoost -> %0.8f" % result) 
    print(classification_report(df['actual'], df['predicted']))

In [18]:
adaboost_classifier(X_train)

AdaBoost -> 0.85510937
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     11543
           1       0.77      0.59      0.67      3772

    accuracy                           0.86     15315
   macro avg       0.82      0.77      0.79     15315
weighted avg       0.85      0.86      0.85     15315



## Bagging

In [19]:
def bagging_classifier(X_train):
    model = BaggingClassifier(base_estimator=SVC(),n_estimators=2, random_state=0).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("Bagging SVC -> %0.8f" % result)
    print(classification_report(df['actual'], df['predicted']))

In [20]:
bagging_classifier(X_train)

Bagging SVC -> 0.79614757
              precision    recall  f1-score   support

           0       0.80      0.97      0.88     11543
           1       0.74      0.27      0.39      3772

    accuracy                           0.80     15315
   macro avg       0.77      0.62      0.64     15315
weighted avg       0.79      0.80      0.76     15315



## Stacking

In [21]:
def stacking_classifier(X_train):
    model = StackingClassifier([('ab',AdaBoostClassifier()),('nb', GaussianNB())], final_estimator=LogisticRegression()).fit(X_train,y_train)
    predictions = model.predict(X_test)
    df = pd.DataFrame({'actual':y_test,'predicted':predictions})
    result = accuracy_score(df['actual'],df['predicted'])
    print("Stacking -> %0.8f" % result)
    print(classification_report(df['actual'], df['predicted']))

In [22]:
stacking_classifier(X_train)

Stacking -> 0.80652955
              precision    recall  f1-score   support

           0       0.81      0.97      0.88     11543
           1       0.77      0.31      0.44      3772

    accuracy                           0.81     15315
   macro avg       0.79      0.64      0.66     15315
weighted avg       0.80      0.81      0.77     15315

