In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler
)

import torch
from torch.utils.data import DataLoader

from utils import *
from model import *
from train import *

%load_ext autoreload
%autoreload 2

SEED = 42

device = torch.device(
    "cuda" if torch.cuda.is_available() else
    'mps' if torch.backends.mps.is_available() else
    "cpu"
)

# Using only `train.csv`

In [3]:
train_csv_path = "../../ucla-stats-101-c-2024-su-regression/train.csv"
train_csv = pd.read_csv(train_csv_path)

train_csv = train_csv.drop(['order_totals'], axis=1)
train_csv.columns

Index(['q_demos_state', 'year', 'month', 'log_total', 'count', 'count_female',
       'count_male', 'count_less5', 'count_5to10', 'count_over10', 'count_hh1',
       'count_hh2', 'count_hh3', 'count_hh4', 'count_howmany1',
       'count_howmany2', 'count_howmany3', 'count_howmany4', 'count_1824',
       'count_2534', 'count_3544', 'count_4554', 'count_5564', 'count_65up',
       'count_und25k', 'count_2549k', 'count_5074k', 'count_7599k',
       'count_100149k', 'count_150kup', 'count_lessHS', 'count_HS', 'count_B',
       'count_G'],
      dtype='object')

In [4]:
_X = train_csv.drop('log_total', axis=1)
_y = train_csv.log_total.to_numpy()

# initial split
_X, _X_test, y, y_test = train_test_split(
    _X, _y, test_size=0.2, stratify=_X.q_demos_state, random_state=SEED
)

# train set and eval set
_X_train, _X_eval, y_train, y_eval = train_test_split(
    _X, y, test_size=0.1, stratify=_X.q_demos_state, random_state=SEED
)

print('Feature Shapes:')
print(f'\traw train: {_X_train.shape}')
print(f'\traw eval: {_X_eval.shape}')
print(f'\traw test: {_X_test.shape}')

Feature Shapes:
	raw train: (2117, 33)
	raw eval: (236, 33)
	raw test: (589, 33)


In [5]:
categorical_columns = [
    'q_demos_state',
    'year',
    'month'
]

one_hot_enc = OneHotEncoder()

categorical_pipeline = Pipeline(
    [
        ('encoder', one_hot_enc)
    ]
)

numerical_columns = [
    'count', 'count_female', 'count_male', 
    'count_less5', 'count_5to10', 'count_over10', 
    'count_hh1', 'count_hh2', 'count_hh3', 'count_hh4', 
    'count_howmany1', 'count_howmany2', 'count_howmany3', 'count_howmany4',
    'count_1824', 'count_2534', 'count_3544', 'count_4554', 'count_5564', 'count_65up',
    'count_und25k', 'count_2549k', 'count_5074k', 'count_7599k', 'count_100149k', 'count_150kup', 
    'count_lessHS', 'count_HS', 'count_B', 'count_G'
]

scaler = StandardScaler()

numerical_pipeline = Pipeline(
    [
        ('std_scaler', scaler)
    ]
)

pipeline = ColumnTransformer(
    [
        ('numerical', numerical_pipeline, numerical_columns),
        ('categorical', categorical_pipeline, categorical_columns)
    ]
)

pipeline.fit(_X_train)

# train[numerical_columns].hist(
#     bins=100, figsize=(20, 15)
# )
# plt.show()

In [6]:
X_train = pipeline.transform(_X_train)
X_eval = pipeline.transform(_X_eval)
X_test = pipeline.transform(_X_test)

print('Transformed Feature Shapes:')
print(f'\ttrain: {X_train.shape}')
print(f'\teval: {X_eval.shape}')
print(f'\ttest: {X_test.shape}')

Transformed Feature Shapes:
	train: (2117, 98)
	eval: (236, 98)
	test: (589, 98)


In [7]:
# configs
BATCH_SIZE = 32
LR = 1e-4
EPOCHS = 100

In [8]:
train_dataset = CustomDataset(X_train, y_train)
eval_dataset = CustomDataset(X_eval, y_eval)

train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=BATCH_SIZE
)

In [9]:
input_size = X_train.shape[1]
hidden_1_size = 60
hidden_2_size = 7 + 2 # num categorical + num numeric
input_dropout = 0.1
hidden_dropout = 0.5

model_1 = ThreeLayerNet(
    input_size, 
    hidden_1_size, 
    hidden_2_size, 
    input_dropout,
    hidden_dropout
)

model_1

ThreeLayerNet(
  (input_dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=98, out_features=60, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=60, out_features=9, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=9, out_features=1, bias=True)
)

In [10]:
train_losses, eval_losses = solver(
    model_1, 
    train_dataloader, 
    eval_dataloader,
    LR,
    EPOCHS,
    True,
    100
)

Training ThreeLayerNet on mps...
Epoch: 1/100, Step: 1/67, Validation Loss: 9.92822
Epoch: 2/100, Step: 1/67, Validation Loss: 9.29636
Epoch: 3/100, Step: 1/67, Validation Loss: 8.59097
Epoch: 4/100, Step: 1/67, Validation Loss: 7.36471
Epoch: 5/100, Step: 1/67, Validation Loss: 6.74825
Epoch: 6/100, Step: 1/67, Validation Loss: 4.38642
Epoch: 7/100, Step: 1/67, Validation Loss: 7.18181
Epoch: 8/100, Step: 1/67, Validation Loss: 4.69077
Epoch: 9/100, Step: 1/67, Validation Loss: 5.25376
Epoch: 10/100, Step: 1/67, Validation Loss: 4.13685
Epoch: 11/100, Step: 1/67, Validation Loss: 4.36089
Epoch: 12/100, Step: 1/67, Validation Loss: 4.69684
Epoch: 13/100, Step: 1/67, Validation Loss: 4.51704
Epoch: 14/100, Step: 1/67, Validation Loss: 6.53942
Epoch: 15/100, Step: 1/67, Validation Loss: 4.72749
Epoch: 16/100, Step: 1/67, Validation Loss: 3.67082
Epoch: 17/100, Step: 1/67, Validation Loss: 4.82076
Epoch: 18/100, Step: 1/67, Validation Loss: 3.15789
Epoch: 19/100, Step: 1/67, Validation Lo

In [11]:
with torch.no_grad():
    model_1.cpu()
    preds = model_1(torch.from_numpy(X_test).to(torch.float32)).squeeze()

mse = nn.MSELoss()
mse(preds, torch.from_numpy(y_test).to(torch.float32)).item()

0.022130941972136497

# Predict on `test.csv`

In [12]:
test_csv_path = "../../ucla-stats-101-c-2024-su-regression/test.csv"
test_csv = pd.read_csv(test_csv_path)

test_ids = test_csv.id
test_csv = test_csv.drop(['id'], axis=1)

_test = pipeline.transform(test_csv)

print('Transformed Test Feature Shapes:')
print(f'\ttrain: {_test.shape}')

Transformed Test Feature Shapes:
	train: (2952, 98)


In [13]:
with torch.no_grad():
    model_1.cpu()
    preds = model_1(torch.from_numpy(_test).to(torch.float32)).squeeze()

model_1_preds = pd.DataFrame(
    {"id": test_ids,
     "log_total": preds
    }
)

save_path = "../preds/preds_3LNN_train_andy_0726.csv"

model_1_preds.to_csv(
    save_path, index=False
)

model_1_preds

Unnamed: 0,id,log_total
0,1,3.186365
1,2,3.187080
2,3,3.247694
3,4,3.235007
4,5,3.226147
...,...,...
2947,2948,2.437182
2948,2949,2.504473
2949,2950,2.582794
2950,2951,2.646356
