In [5]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import json

# Load the dataset
df = pd.read_csv('adult.csv')

# Replace '?' with NaN and drop rows with NaN values
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Encode the target variable
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

# Identify categorical columns
categorical_columns = ['workclass', 'education', 'marital.status', 'occupation', 
                       'relationship', 'race', 'sex', 'native.country']

# Initialize dictionary to store label encodings
label_encodings = {}

# Encode categorical variables with Label Encoding and save encodings
for col in categorical_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    # Convert values to int for JSON serialization
    label_encodings[col] = {key: int(value) for key, value in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}

# Save label encodings to a file
with open('label_encodings.json', 'w') as f:
    json.dump(label_encodings, f)

# Split features and target variable
X = df.drop('income', axis=1)
y = df['income']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define the neural network model
class IncomePredictionModel(nn.Module):
    def __init__(self, input_dim):
        super(IncomePredictionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Check if CUDA is available and use the GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize the model, loss function, and optimizer
input_dim = X_train.shape[1]
model = IncomePredictionModel(input_dim).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 20
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Evaluating the model
model.eval()
with torch.no_grad():
    y_pred = []
    y_true = []
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        y_pred.extend(outputs.cpu().numpy().flatten())
        y_true.extend(labels.cpu().numpy().flatten())

# Convert predictions and true values to binary labels
y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
y_true = [int(x) for x in y_true if not np.isnan(x)]

# Calculate accuracy
accuracy = sum([1 for pred, true in zip(y_pred, y_true) if pred == true]) / len(y_true)
print(f"Accuracy: {accuracy:.4f}")


Using device: cuda
Epoch [1/20], Loss: 0.3786
Epoch [2/20], Loss: 0.3324
Epoch [3/20], Loss: 0.3273
Epoch [4/20], Loss: 0.3245
Epoch [5/20], Loss: 0.3226
Epoch [6/20], Loss: 0.3204
Epoch [7/20], Loss: 0.3189
Epoch [8/20], Loss: 0.3188
Epoch [9/20], Loss: 0.3165
Epoch [10/20], Loss: 0.3164
Epoch [11/20], Loss: 0.3150
Epoch [12/20], Loss: 0.3122
Epoch [13/20], Loss: 0.3130
Epoch [14/20], Loss: 0.3129
Epoch [15/20], Loss: 0.3083
Epoch [16/20], Loss: 0.3063
Epoch [17/20], Loss: 0.3070
Epoch [18/20], Loss: 0.3038
Epoch [19/20], Loss: 0.3030
Epoch [20/20], Loss: 0.3013
Accuracy: 0.8351


In [4]:
import joblib

# Save model
torch.save(model.state_dict(), 'model.pth')

# Save scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']