In [343]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler
)

import torch
from torch.utils.data import DataLoader

from utils import *
from model import *
from train import *

%load_ext autoreload
%autoreload 2

SEED = 42

device = torch.device(
    "cuda" if torch.cuda.is_available() else
    'mps' if torch.backends.mps.is_available() else
    "cpu"
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Using fine-grained data sets

In [323]:
customers_path = "../../ucla-stats-101-c-2024-su-regression/customer_info_train.csv"
customers = pd.read_csv(customers_path)

amazon_path = "../../ucla-stats-101-c-2024-su-regression/amazon_order_details_train.csv"
amazon = pd.read_csv(amazon_path)

In [324]:
# orders = amazon[
#     ~amazon.shipping_address_state.isnull()
# ].groupby(
#     ['order_date', 'survey_response_id'], 
#     as_index=False
# ).agg(
#     total_cost = ('item_cost', 'sum'),
#     item_count = ('quantity', 'count')
# ).set_index(
#     'survey_response_id'
# ).join(
#     customers.set_index('survey_response_id'), 
#     how='inner'
# ).reset_index()

# orders['year'] = orders.order_date.apply(get_year)
# orders['month'] = orders.order_date.apply(get_month)
# # orders['log_total'] = orders.total_cost.apply(np.log10)

# orders = orders.drop(['order_date'], axis=1)
# orders = orders.drop(['q_life_changes'], axis=1)
# # orders = orders.drop(['total_cost'], axis=1)

# # response_ids = orders.survey_response_id
# # orders = orders.drop(['survey_response_id'], axis=1)


orders = join_and_prep(amazon, customers)

orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389130 entries, 0 to 389129
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   survey_response_id          389130 non-null  object 
 1   total_cost                  389130 non-null  float64
 2   item_count                  389130 non-null  int64  
 3   q_demos_age                 389130 non-null  object 
 4   q_demos_hispanic            389130 non-null  object 
 5   q_demos_race                389130 non-null  object 
 6   q_demos_education           389130 non-null  object 
 7   q_demos_income              389130 non-null  object 
 8   q_demos_gender              389130 non-null  object 
 9   q_sexual_orientation        389130 non-null  object 
 10  q_demos_state               389130 non-null  object 
 11  q_amazon_use_howmany        389130 non-null  object 
 12  q_amazon_use_hh_size        389130 non-null  object 
 13  q_amazon_use_h

`orders` is now our main dataframe. The target variable is `total_cost`.

## Train-Test Split

In [325]:
_X = orders.drop('total_cost', axis=1)
_y = orders.total_cost.to_numpy()

# initial split
_X, _X_test, y, y_test = train_test_split(
    _X, _y, test_size=0.2, stratify=_X.q_demos_state, random_state=SEED
)

# train set and eval set
_X_train, _X_eval, y_train, y_eval = train_test_split(
    _X, y, test_size=0.1, stratify=_X.q_demos_state, random_state=SEED
)

response_ids = _X_test.survey_response_id

_X_train = _X_train.drop(['survey_response_id'], axis=1)
_X_eval = _X_eval.drop(['survey_response_id'], axis=1)
_X_test = _X_test.drop(['survey_response_id'], axis=1)

print('Feature Shapes:')
print(f'\traw train: {_X_train.shape}')
print(f'\traw eval: {_X_eval.shape}')
print(f'\traw test: {_X_test.shape}')

Feature Shapes:
	raw train: (280173, 24)
	raw eval: (31131, 24)
	raw test: (77826, 24)


## Feature Engineering

In [326]:
categorical_columns = [
    'q_demos_age', 'q_demos_hispanic', 'q_demos_race',
    'q_demos_education', 'q_demos_income', 'q_demos_gender',
    'q_sexual_orientation', 'q_demos_state', 'q_amazon_use_howmany',
    'q_amazon_use_hh_size', 'q_amazon_use_how_oft',
    'q_substance_use_cigarettes', 'q_substance_use_marijuana',
    'q_substance_use_alcohol', 'q_personal_diabetes',
    'q_personal_wheelchair', 'q_sell_your_data', 'q_sell_consumer_data',
    'q_small_biz_use', 'q_census_use', 'q_research_society'
]

one_hot_enc = OneHotEncoder(handle_unknown='ignore')

categorical_pipeline = Pipeline(
    [
        ('encoder', one_hot_enc)
    ]
)

numerical_columns = ['item_count', 'year', 'month']

scaler = StandardScaler()

numerical_pipeline = Pipeline(
    [
        ('std_scaler', scaler)
    ]
)

pipeline = ColumnTransformer(
    [
        ('numerical', numerical_pipeline, numerical_columns),
        ('categorical', categorical_pipeline, categorical_columns)
    ]
)

pipeline.fit(_X_train)

In [327]:
X_train = pipeline.transform(_X_train)
X_eval = pipeline.transform(_X_eval)
X_test = pipeline.transform(_X_test)

print('Transformed Feature Shapes:')
print(f'\ttrain: {X_train.shape}')
print(f'\teval: {X_eval.shape}')
print(f'\ttest: {X_test.shape}')

Transformed Feature Shapes:
	train: (280173, 148)
	eval: (31131, 148)
	test: (77826, 148)


## Model Training

In [328]:
# configs
BATCH_SIZE = 32
LR = 1e-4
EPOCHS = 2

In [329]:
train_dataset = TrainDotCsv(X_train.toarray(), y_train)
eval_dataset = TrainDotCsv(X_eval.toarray(), y_eval)

train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=BATCH_SIZE
)

In [330]:
input_size = X_train.shape[1]
hidden_sizes = [100, 100, 100, 100]
input_dropout = 0.1
hidden_dropout = 0.7

model_1 = ThreeLayerNet(
    input_size, 
    hidden_1_size, 
    hidden_2_size, 
    input_dropout,
    hidden_dropout
)

model_1

ThreeLayerNet(
  (input_dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=148, out_features=100, bias=True)
  (dropout1): Dropout(p=0.7, inplace=False)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (dropout2): Dropout(p=0.7, inplace=False)
  (fc3): Linear(in_features=50, out_features=1, bias=True)
)

In [331]:
train_losses, eval_losses = solver(
    model_1, 
    train_dataloader, 
    eval_dataloader,
    LR,
    EPOCHS,
    100
)

Training ThreeLayerNet on mps...
Epoch: 1/2, Step: 1/8756, Validation Loss: 6431.16943
Epoch: 1/2, Step: 101/8756, Validation Loss: 5504.80859
Epoch: 1/2, Step: 201/8756, Validation Loss: 6184.95215
Epoch: 1/2, Step: 301/8756, Validation Loss: 5056.18799
Epoch: 1/2, Step: 401/8756, Validation Loss: 41522.05469
Epoch: 1/2, Step: 501/8756, Validation Loss: 18182.21289
Epoch: 1/2, Step: 601/8756, Validation Loss: 13175.59863
Epoch: 1/2, Step: 701/8756, Validation Loss: 4316.14600
Epoch: 1/2, Step: 801/8756, Validation Loss: 1747.19458
Epoch: 1/2, Step: 901/8756, Validation Loss: 4453.02148
Epoch: 1/2, Step: 1001/8756, Validation Loss: 7123.35010
Epoch: 1/2, Step: 1101/8756, Validation Loss: 2763.11011
Epoch: 1/2, Step: 1201/8756, Validation Loss: 1602.62109
Epoch: 1/2, Step: 1301/8756, Validation Loss: 2543.29590
Epoch: 1/2, Step: 1401/8756, Validation Loss: 1502.96631
Epoch: 1/2, Step: 1501/8756, Validation Loss: 7297.59766
Epoch: 1/2, Step: 1601/8756, Validation Loss: 1723.76294
Epoch: 

In [344]:
with torch.no_grad():
    model_1.cpu()
    preds = model_1(
        torch.from_numpy(X_test.toarray()).to(torch.float32)
    ).squeeze()

_agg = aggregate_finegrained_preds(preds, _X_test)

_agg

Unnamed: 0,q_demos_state,year,month,log_total
0,Alabama,2018,1,2.492549
1,Alabama,2018,2,2.285269
2,Alabama,2018,3,2.629877
3,Alabama,2018,4,2.411190
4,Alabama,2018,5,2.841847
...,...,...,...,...
2882,Wyoming,2021,11,2.062529
2883,Wyoming,2022,1,1.905234
2884,Wyoming,2022,5,2.294188
2885,Wyoming,2022,10,1.854005


# Prediction on `test.csv`

In [345]:
customers_test_path = "../../ucla-stats-101-c-2024-su-regression/customer_info_test.csv"
customers_test = pd.read_csv(customers_test_path)

amazon_test_path = "../../ucla-stats-101-c-2024-su-regression/amazon_order_details_test.csv"
amazon_test = pd.read_csv(amazon_test_path)

In [349]:
orders_test = join_and_prep(amazon_test, customers_test)

orders_test

Unnamed: 0,survey_response_id,item_count,q_demos_age,q_demos_hispanic,q_demos_race,q_demos_education,q_demos_income,q_demos_gender,q_sexual_orientation,q_demos_state,...,q_substance_use_alcohol,q_personal_diabetes,q_personal_wheelchair,q_sell_your_data,q_sell_consumer_data,q_small_biz_use,q_census_use,q_research_society,year,month
0,R_10uU7A3PqkL1i0o,3,35 - 44 years,No,White or Caucasian,Bachelor's degree,"$50,000 - $74,999",Female,heterosexual (straight),Louisiana,...,No,No,No,Yes if I get part of the profit,Yes if consumers get part of the profit,No,Yes,Yes,2018,1
1,R_12mMk3Dq3Oo4HJb,1,45 - 54 years,No,White or Caucasian,High school diploma or GED,"$25,000 - $49,999",Male,heterosexual (straight),Ohio,...,No,No,No,No,No,No,Yes,Yes,2018,1
2,R_1C4nKyrv0w2thJn,1,55 - 64 years,No,White or Caucasian,"Graduate or professional degree (MA, MS, MBA, ...","$100,000 - $149,999",Female,heterosexual (straight),Georgia,...,No,No,No,Yes if I get part of the profit,Yes if consumers get part of the profit,I don't know,No,Yes,2018,1
3,R_1C4tcyEfmcsKI9X,1,55 - 64 years,No,White or Caucasian,High school diploma or GED,"$75,000 - $99,999",Female,LGBTQ+,Oregon,...,Yes,No,No,No,No,Yes,I don't know,I don't know,2018,1
4,R_1CIOVoUy3w1iF6O,1,18 - 24 years,No,"White or Caucasian,Asian",High school diploma or GED,"Less than $25,000",Other,LGBTQ+,Tennessee,...,Yes,No,No,No,No,I don't know,I don't know,Yes,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379378,R_sqjjAsz7GATggo1,3,35 - 44 years,Yes,White or Caucasian,Bachelor's degree,"$50,000 - $74,999",Male,heterosexual (straight),California,...,I stopped in the recent past,No,No,Yes if I get part of the profit,Yes if consumers get part of the profit,No,No,Yes,2022,12
379379,R_vIAsy23dmSRObGF,2,45 - 54 years,No,White or Caucasian,High school diploma or GED,"$75,000 - $99,999",Female,heterosexual (straight),Georgia,...,No,No,No,No,Yes if consumers get part of the profit,No,No,No,2022,12
379380,R_xsyCKTazvY4vHbz,1,35 - 44 years,No,White or Caucasian,Bachelor's degree,"$25,000 - $49,999",Female,heterosexual (straight),Michigan,...,No,No,No,Yes if I get part of the profit,Yes if consumers get part of the profit,No,I don't know,Yes,2022,12
379381,R_yILeYEcIeCBmy2t,1,25 - 34 years,No,White or Caucasian,"Graduate or professional degree (MA, MS, MBA, ...","$25,000 - $49,999",Female,heterosexual (straight),Pennsylvania,...,No,No,No,Yes if I get part of the profit,Yes if consumers get part of the profit,I don't know,I don't know,I don't know,2022,12


In [350]:
_TEST = orders_test.drop(['survey_response_id'], axis=1)
_TEST = pipeline.transform(_TEST)

In [352]:
with torch.no_grad():
    model_1.cpu()
    preds = model_1(torch.from_numpy(_TEST.toarray()).to(torch.float32)).squeeze()

_agg = aggregate_finegrained_preds(preds, orders_test)

_agg

Unnamed: 0,q_demos_state,year,month,log_total
0,Alabama,2018,1,3.234815
1,Alabama,2018,2,3.207967
2,Alabama,2018,3,3.271197
3,Alabama,2018,4,3.314300
4,Alabama,2018,5,3.318471
...,...,...,...,...
3025,Wyoming,2022,3,1.945483
3026,Wyoming,2022,4,2.356288
3027,Wyoming,2022,5,1.737859
3028,Wyoming,2022,6,1.999326


In [353]:
test_csv_path = "../../ucla-stats-101-c-2024-su-regression/test.csv"
test_csv = pd.read_csv(test_csv_path)

get_submission_preds(
    _agg,
    test_csv,
    save_path = "../preds/preds_5LNN_finegrained_2_andy_0727.csv"
)

Unnamed: 0,id,log_total
0,1,3.234815
1,2,3.207967
2,3,3.271197
3,4,3.314300
4,5,3.318471
...,...,...
2947,2948,2.396888
2948,2949,2.196826
2949,2950,2.703267
2950,2951,2.446807
