In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from models.NeuralNetwork import NeuralNetwork

In [2]:
filename = "adult.csv"
data = read_csv(filename,delimiter=",", na_values="?")

#class_counts = data.groupby('income').size()
#print(class_counts)

"""
Imputation
"""
null_columns =['workclass','occupation','native.country']
for i in null_columns:
    data.fillna(data[i].mode()[0], inplace=True)
    
"""
Factorize
"""  
data['income'] = data['income'].replace({'<=50K':0, '>50K':1})
data['sex'] = data['sex'].replace({'Female':0, 'Male':1})
data['race'] = data['race'].replace({'White':0, 'Black':1, 'Asian-Pac-Islander':2, 'Other':3,'Amer-Indian-Eskimo':4})
data['workclass'] = data['workclass'].replace({'Private':0, 'State-gov':1, 'Federal-gov':2, 'Self-emp-not-inc':3,
       'Self-emp-inc':4, 'Local-gov':5, 'Without-pay':6, 'Never-worked':7})
data['native.country'] = data['native.country'].replace({'United-States':0, 'Private':1, 'Mexico':2, 'Greece':3, 'Vietnam':4, 'China':5,
       'Taiwan':6, 'India':7, 'Philippines':8, 'Trinadad&Tobago':9, 'Canada':10,
       'South':11, 'Holand-Netherlands':12, 'Puerto-Rico':13, 'Poland':14, 'Iran':15,
       'England':16, 'Germany':17, 'Italy':18, 'Japan':19, 'Hong':20, 'Honduras':21, 'Cuba':22,
       'Ireland':23, 'Cambodia':24, 'Peru':25, 'Nicaragua':26, 'Dominican-Republic':27,
       'Haiti':28, 'El-Salvador':29, 'Hungary':30, 'Columbia':31, 'Guatemala':32,
       'Jamaica':33, 'Ecuador':34, 'France':35, 'Yugoslavia':36, 'Scotland':37,
       'Portugal':38, 'Laos':39, 'Thailand':40, 'Outlying-US(Guam-USVI-etc)':41})
data['occupation'] = data['occupation'].replace({'Private':0, 'Exec-managerial':1, 'Machine-op-inspct':2,
       'Prof-specialty':3, 'Other-service':4, 'Adm-clerical':5, 'Craft-repair':6,
       'Transport-moving':7, 'Handlers-cleaners':8, 'Sales':9,
       'Farming-fishing':10, 'Tech-support':11, 'Protective-serv':12,
       'Armed-Forces':13, 'Priv-house-serv':14})
data['relationship'] = data['relationship'].replace({'Not-in-family':0, 'Unmarried':1, 'Own-child':2, 'Other-relative':3,
       'Husband':4, 'Wife':5})
data['education'] = data['education'].replace({'HS-grad':0, 'Some-college':1, '7th-8th':2, '10th':3, 'Doctorate':4,
       'Prof-school':5, 'Bachelors':6, 'Masters':7, '11th':8, 'Assoc-acdm':9,
       'Assoc-voc':10, '1st-4th':11, '5th-6th':12, '12th':13, '9th':14, 'Preschool':15})
data['marital.status'] = data['marital.status'].replace(['Never-married', 'Divorced', 'Separated', 'Widowed'], 'Single')
data['marital.status'] = data['marital.status'].replace(['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'], 'Married')
data['marital.status'] = data['marital.status'].map({'Married':1, 'Single':0})


X = data.iloc[:,:14]
y = data.iloc[:,14]

"""
Undersample
"""  
undersample = RandomUnderSampler(sampling_strategy='majority')
X, y = undersample.fit_resample(X,y)

"""
Scaling
""" 
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX = scaler.fit_transform(X)

"""
Splitting
"""
X_train, X_test, y_train, y_test = train_test_split(rescaledX,y,test_size=0.2, random_state=1)


In [4]:
def sample_batch(data,batch_size):
    st_batch = data.shape[0] // batch_size
    idx_end = st_batch * batch_size
    batch = np.split(data[:idx_end], st_batch)
    
    if data.shape[0] % batch_size != 0:
        batch += [data[idx_end:]]
    
    return batch

In [33]:
nn = NeuralNetwork(x_no = X.shape[1], y_no=1, h_no=3)

cost_log = []
epochs = 500
batch_size = 100
learning_rate = 1

X_batch = sample_batch(X_train, batch_size)
Y_batch = sample_batch(y_train.to_numpy(),batch_size)

for t in range(epochs):
    for xs,ys in zip(X_batch,Y_batch):
        
        nn.forward(xs)
        nn.backward(ys, nn.db_cross_entropy)
        
        nn.gd(lr=1)
        
    _, prediction, accuracy = nn.predict(X_test,y_test.to_numpy())
    val_loss = np.mean(nn.b_cross_entropy(y_test.to_numpy(),_))
    #cost = np.mean(nn.b_cross_entropy(ys, nn.A3))
    cost_log.append([t,val_loss])
    if t % 100 == 0:
        #print(f"epoch: {t} - loss: {round(cost,5)}", end=" ")
        print(f"epoch: {t} - val_acc: {round(accuracy,5)}, val_loss: {round(val_loss,5)}")
        
cost_log = pd.DataFrame(cost_log, columns=["epochs","loss"])

epoch: 0 - val_acc: 50.27096, val_loss: nan
epoch: 100 - val_acc: 50.27096, val_loss: nan
epoch: 200 - val_acc: 50.27096, val_loss: nan


KeyboardInterrupt: 

In [None]:
fig, ax = plt.subplots(figsize=(12,12))
ax.plot(cost_log["epochs"],cost_log["loss"], c="b")

In [713]:
_, prediction, accuracy = nn.predict(X_test,y_test.to_numpy())
print(prediction)
print(accuracy)

[1. 1. 0. ... 0. 1. 1.]
80.29964934650941
