In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from collections import Counter
from copy import deepcopy

from sklearn.preprocessing import normalize, LabelEncoder
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the salary dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/salary-binary-classifier/salary.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Label-encode every categorical column (target included) as integer codes.
# A separate LabelEncoder is fitted per column and stored in `encoders`, in
# the same order as `categorical`, so codes can be mapped back later with
# `inverse_transform`. Re-using a single instance would leave every list
# entry pointing at one encoder fitted on the last column only.
categorical = ["workclass", "education", "marital-status", "occupation", "relationship", "race",
               "sex", "native-country", "salary"]
encoders = []
for column in categorical:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders.append(le)

In [None]:
# Scale every feature column by its max norm; the target (last column) is
# left untouched.
# Assigning by column label replaces the columns outright, so integer columns
# become float cleanly instead of relying on `.iloc` setitem to upcast them
# in place (deprecated in recent pandas releases).
# NOTE(review): the scaling statistics are computed on the full dataset,
# before the train/validation/test split — consider fitting on the training
# rows only.
feature_columns = df.columns[:-1]
df[feature_columns] = normalize(df[feature_columns], norm="max", axis=0)

In [None]:
class CL(Dataset):
    """Dataset over a 2-D array whose final column holds the class label."""

    def __init__(self, data):
        # Keep a reference to the array; rows are turned into tensors on access.
        self.data = data

    def __len__(self):
        """Number of rows (samples) in the wrapped array."""
        return len(self.data)

    def __getitem__(self, x):
        """Return ``(features, label)`` for index ``x`` as float32 / long tensors."""
        features = torch.tensor(self.data[x, :-1], dtype=torch.float32)
        label = torch.tensor(self.data[x, -1], dtype=torch.long)
        return features, label

In [None]:
# 70 % of the rows go to training; the remaining 30 % hold-out is halved into
# validation and test arrays. Both splits use the same fixed seed.
train, testing = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(testing.values, test_size=0.5, random_state=42)

In [None]:
# Balance the training classes with SMOTE, then rebuild a single array with
# the label in the last column.
# A fixed random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state=42)
x = train.iloc[:, :-1].values
y = train.iloc[:, -1].values
print(Counter(y))  # class counts before resampling
x, y = smote.fit_resample(x, y)
dataset = np.hstack((x, y.reshape(-1, 1)))
print(Counter(y))  # class counts after resampling

In [None]:
# Wrap the resampled training array and the validation array for PyTorch.
train_ds, val_ds = CL(dataset), CL(val)

In [None]:
# Hyperparameters and model dimensions for the PyTorch cells below.
BATCH = 16  # intended mini-batch size for the DataLoaders
EPOCHS = 20  # number of passes over the training set
LR = 0.5  # initial SGD learning rate (later decayed by the StepLR scheduler)
IN_FEATURES = x.shape[1]  # feature count, read from the resampled feature matrix `x`
OUT_FEATURES = df["salary"].nunique()  # number of distinct target classes

In [None]:
# Batch the datasets using the shared BATCH constant rather than a repeated
# literal, so the batch size is configured in one place. Only the training
# loader shuffles.
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH, shuffle=False)

In [None]:
def train(model):
    """Fit a scikit-learn style classifier and report its test-set metrics.

    The model is fitted on the module-level ``dataset`` array and evaluated on
    the module-level ``test`` array (label in the last column of both). The
    accuracy and classification report are printed and the confusion matrix
    is shown as a heatmap.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    features, labels = dataset[:, :-1], dataset[:, -1]
    test_features, y_true = test[:, :-1], test[:, -1]

    model.fit(features, labels)
    y_pred = model.predict(test_features)

    # scikit-learn metrics take (y_true, y_pred); passing them the other way
    # round swaps precision/recall and transposes the confusion matrix.
    score = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    print(score)
    print(report)
    # fmt="d" prints the integer counts in full instead of the default ".2g".
    ax = sns.heatmap(cm, annot=True, fmt="d")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

In [None]:
# Baseline scikit-learn ensembles. Fixed seeds keep their scores reproducible
# between runs.
abc = AdaBoostClassifier(learning_rate=0.9, random_state=42)
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Fit and evaluate each baseline in turn (AdaBoost first, then random forest).
for baseline in (abc, rfc):
    train(baseline)

In [None]:
class Classifier(nn.Module):
    """Single-hidden-layer MLP that outputs raw class scores (logits).

    Args:
        in_channels: number of input features per sample.
        out_channels: number of target classes.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_channels, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, out_channels),
        )

    def forward(self, x):
        """Return unnormalised logits of shape ``(batch, out_channels)``.

        No softmax is applied here: ``nn.CrossEntropyLoss`` applies
        log-softmax internally, so feeding it probabilities would apply
        softmax twice and flatten the gradients. ``argmax`` over the logits
        selects the same class; apply ``softmax`` at the call site if
        probabilities are needed.
        """
        return self.layer(x)

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [None]:
# Seed PyTorch so weight initialisation (and DataLoader shuffling, which draws
# from the same global generator) is repeatable across runs.
torch.manual_seed(42)

model = Classifier(IN_FEATURES, OUT_FEATURES)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Multiply the learning rate by 0.2 every 7 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.2)

In [None]:
# Train for EPOCHS epochs, recording per-epoch loss/accuracy for both splits
# and keeping a copy of the weights with the best validation accuracy.
best_model = deepcopy(model)
best_acc = 0

loss_train = []
loss_val = []
acc_train = []
acc_val = []

for i in range(1, EPOCHS+1):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_total = 0
    train_acc = 0.0
    for data, label in train_dl:
        # Move the batch to the same device as the model, so the loop runs
        # on CPU-only machines as well as on GPU.
        data, label = data.to(device), label.to(device)

        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()

        batch_size = out.size(0)
        # criterion returns the batch mean; weight it by the batch size so
        # the division below yields a true per-sample average.
        train_loss += loss.item() * batch_size
        train_total += batch_size
        train_acc += (out.argmax(1) == label).sum().item()
    train_loss /= train_total
    train_acc /= train_total
    loss_train.append(train_loss)
    acc_train.append(train_acc)

    # ---- validation pass (no gradients, BatchNorm in eval mode) ----
    model.eval()
    val_loss = 0.0
    val_acc = 0
    val_total = 0
    with torch.no_grad():
        for data, label in val_dl:
            data, label = data.to(device), label.to(device)

            out = model(data)
            loss = criterion(out, label)

            batch_size = out.size(0)
            val_loss += loss.item() * batch_size
            val_total += batch_size
            val_acc += (out.argmax(1) == label).sum().item()

    val_acc /= val_total
    val_loss /= val_total
    loss_val.append(val_loss)
    acc_val.append(val_acc)

    # Checkpoint whenever validation accuracy improves.
    if val_acc > best_acc:
        best_acc = val_acc
        best_model = deepcopy(model)

    print("Epochs {} train loss {} acc {} val loss {} acc {}".format(i, train_loss, train_acc,
                                                                    val_loss, val_acc))
    scheduler.step()

In [None]:
# 1-based epoch numbers used as the x-axis of the learning curves.
epochs = [*range(1, EPOCHS + 1)]

In [None]:
# Learning curves: loss on the left panel, accuracy on the right. Each panel
# gets a title and axis labels so the figure can be read on its own.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
curves = [("Loss", loss_train, loss_val), ("Accuracy", acc_train, acc_val)]
for ax, (metric, train_curve, val_curve) in zip(axes, curves):
    ax.plot(epochs, train_curve, label="Training")
    ax.plot(epochs, val_curve, label="Validation")
    ax.set_title(metric)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(metric)
    ax.legend()
plt.show()

In [None]:
def predict(value):
    """Predict the class of one row using the best validation checkpoint.

    Args:
        value: 1-D array for a single sample whose last element is the label;
            the label is dropped before inference.

    Returns:
        int: index of the highest-scoring class.
    """
    features = value[:-1].reshape(1, -1)
    features = torch.tensor(features, dtype=torch.float32).to(device)
    best_model.eval()
    with torch.no_grad():
        # Run inference with the checkpoint that scored the best validation
        # accuracy, not the last-epoch `model`.
        out = best_model(features)
    return out.argmax(1).item()

In [None]:
# Ground-truth labels (last column) and network predictions for every test row.
test_vals = [row[-1] for row in test]
predicted = [predict(row) for row in test]

In [None]:
# Test-set metrics for the neural network (arguments are y_true, y_pred).
score = accuracy_score(test_vals, predicted)
report = classification_report(test_vals, predicted)
cm = confusion_matrix(test_vals, predicted)
print(score)
print(report)
# fmt="d" prints the integer counts in full instead of the default ".2g";
# axis labels make the orientation of the matrix explicit.
ax = sns.heatmap(cm, annot=True, fmt="d")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()