In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder,StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('Clean 2022 Economics.csv')

df.columns = df.columns.str.strip()
df_clean = df[["Income Level", "Reported registered", "Reported not registered"]].copy()
df_clean["Reported registered"] = df_clean["Reported registered"].str.replace(",", "").astype(int)
df_clean["Reported not registered"] = df_clean["Reported not registered"].str.replace(",", "").astype(int)


# Melt the Data 
df_melted = df_clean.melt(id_vars="Income Level", 
                          var_name="Registration Status", 
                          value_name="Count")
print(df_melted)

# # Step 4: Plot Grouped Bar Chart
# plt.figure(figsize=(12, 6))
# sns.barplot(x="Income Level", y="Count", hue="Registration Status", data=df_melted)

# plt.title("2022 Registration Status by Income Level", fontsize=16)
# plt.xlabel("Income Level", fontsize=12)
# plt.ylabel("Number of People", fontsize=12)
# plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for readability
# plt.legend(title="Registration Status")
# plt.tight_layout()
# plt.show()


             Income Level      Registration Status  Count
0           Under $10,000      Reported registered   1399
1     $10,000  -  $14,999      Reported registered   1397
2     $15,000  -  $19,999      Reported registered   1251
3     $20,000  -  $29,999      Reported registered   4359
4     $30,000  -  $39,999      Reported registered   6397
5     $40,000  -  $49,999      Reported registered   5936
6     $50,000  -  $74,999      Reported registered  17333
7     $75,000  -  $99,999      Reported registered  15009
8   $100,000  -  $149,999      Reported registered  21400
9               $150,000+      Reported registered  28947
10    Income not reported      Reported registered  14884
11          Under $10,000  Reported not registered    840
12    $10,000  -  $14,999  Reported not registered    690
13    $15,000  -  $19,999  Reported not registered    607
14    $20,000  -  $29,999  Reported not registered   1770
15    $30,000  -  $39,999  Reported not registered   2150
16    $40,000 

In [3]:
# Step 1: Prepare Data
def prepare_data(data_path):
    # Load data
    df = pd.read_csv(data_path)
    df.columns = df.columns.str.strip()
    
    # Select and clean relevant columns
    df_clean = df[["Income Level", "Reported registered"]].copy()
    df_clean["Reported registered"] = df_clean["Reported registered"].str.replace(",", "").astype(int)
    
    # OneHotEncode Income Level
    encoder = OneHotEncoder(sparse_output=False)  # Corrected argument
    X_encoded = encoder.fit_transform(df_clean[["Income Level"]])
    
    # Target Variable
    y = df_clean["Reported registered"]
    
    # Normalize features
    scaler = StandardScaler()
    X_encoded = scaler.fit_transform(X_encoded)
    
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
    
    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)
    
    return X_train, X_test, y_train, y_test

# Step 2: Define Neural Network
class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.BatchNorm1d(64),  # Batch normalization
            nn.ReLU(),
            nn.Dropout(0.3)      # Dropout for regularization
        )
        self.fc2 = nn.Sequential(
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.output = nn.Linear(32, 1)  # Output layer
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.output(x)
        return x

# Step 3: Train the Model
def train_model(X_train, y_train, X_test, y_test, epochs=50, learning_rate=0.001):
    model = NeuralNetwork(X_train.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()  # Loss function
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # Learning rate scheduler
    
    for epoch in range(epochs):
        # Forward pass
        y_pred = model(X_train)
        loss = criterion(y_pred, y_train)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate
        
        if (epoch+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")
    
    # Evaluate on test data
    with torch.no_grad():
        y_test_pred = model(X_test).detach().numpy()
    
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    
    print("Mean Squared Error (MSE):", mse)
    print("R2 Score:", r2)
    
    return model

# Step 4: Run the Workflow
data_path = "Clean 2020 Economics.csv"
X_train, X_test, y_train, y_test = prepare_data(data_path)
trained_model = train_model(X_train, y_train, X_test, y_test)


Epoch [10/50], Loss: 129417584.0000
Epoch [20/50], Loss: 129409392.0000
Epoch [30/50], Loss: 129410120.0000
Epoch [40/50], Loss: 129406064.0000
Epoch [50/50], Loss: 129410704.0000
Mean Squared Error (MSE): 5279742000.0
R2 Score: -1.336122751235962
