In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Choose the compute device once: CUDA when PyTorch reports a usable GPU, CPU otherwise.
# `gpu` keeps the boolean result and `device` is the matching torch.device.
gpu = torch.cuda.is_available()
device = torch.device("cuda" if gpu else "cpu")
print("GPU is available" if gpu else "GPU is not available")

GPU is available


In [3]:
def process_data(df):
    """One-hot encode the text columns of ``df`` and mean-impute numeric gaps.

    The first column is dropped by position before anything else happens
    (presumably a row identifier -- confirm against the CSV schema).

    Args:
        df: raw ``pandas.DataFrame``; it is not modified.

    Returns:
        A new DataFrame in which every object/string column has been replaced
        by 0/1 indicator columns (including a ``<col>_nan`` indicator for
        missing values), boolean columns are cast to int, and remaining NaNs
        in numeric columns are filled with that column's mean.
    """
    features = df.iloc[:, 1:]
    # Detect text columns on the frame that is actually encoded. Looking at the
    # full ``df`` would also pick up a text-typed first column, which is no
    # longer present in ``features`` and makes get_dummies raise KeyError.
    string_columns = features.select_dtypes(include=['object', 'string']).columns
    df_encoded = pd.get_dummies(features, columns=string_columns, dummy_na=True)

    # Depending on the pandas version the indicators come back as bool; cast
    # them (and any boolean input columns) to integers.
    bool_columns = df_encoded.select_dtypes(include=[bool]).columns
    if len(bool_columns) > 0:
        df_encoded[bool_columns] = df_encoded[bool_columns].astype(int)

    # Column-wise mean imputation; non-numeric columns have no entry in the
    # mean Series and are therefore left untouched.
    return df_encoded.fillna(df_encoded.mean(numeric_only=True))

In [4]:
# Read the training CSV from data/train.csv and pass it through process_data.
# The bare expression on the last line renders the encoded frame as the cell output.
df_encoded = process_data(pd.read_csv('data/train.csv'))
df_encoded

Unnamed: 0,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Brand_nan,Material_Canvas,...,Style_Messenger,Style_Tote,Style_nan,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red,Color_nan
0,7.0,11.611723,112.15875,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,10.0,27.078537,68.88056,0,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,2.0,16.643760,39.17320,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,8.0,12.937220,80.60793,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,1.0,17.749338,86.02312,1,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,9.0,12.730812,129.99749,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
299996,6.0,26.633182,19.85819,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
299997,9.0,11.898250,111.41364,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
299998,1.0,6.175738,115.89080,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [5]:
class BackpackDataset(Dataset):
    """In-memory dataset yielding ``(features, price)`` pairs as float32 tensors.

    Every column except 'Price' is packed into one feature matrix and 'Price'
    becomes a target column with a trailing dimension of size 1. The conversion
    to tensors happens once, at construction time.

    Raises:
        ValueError: if ``df`` is not a pandas DataFrame or has no 'Price' column.
    """

    def __init__(self, df):
        # Reject anything that is not a DataFrame.
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")

        # The regression target must be present.
        if 'Price' not in df.columns:
            raise ValueError("DataFrame must contain 'Price' column")

        # Convert features and target to tensors up front.
        feature_frame = df.drop(columns=['Price'])
        price_values = df['Price'].values
        self.features = torch.tensor(feature_frame.values, dtype=torch.float32)
        self.targets = torch.unsqueeze(
            torch.tensor(price_values, dtype=torch.float32), 1
        )

    def __len__(self):
        # Number of rows in the feature matrix.
        return self.features.shape[0]

    def __getitem__(self, idx):
        # Tensors are pre-built, so lookup is a plain index into each of them.
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

In [6]:
# Hold out `test_size` rows for evaluation (an integer test_size is a row count);
# random_state pins the shuffle used for the split. Each part is then wrapped
# in a BackpackDataset.
test_size = 5000
train_df, test_df = train_test_split(df_encoded, test_size=test_size, random_state=42)
train_dataset, test_dataset = (BackpackDataset(part) for part in (train_df, test_df))

In [7]:
# Sanity check: number of samples in the training dataset after the split.
len(train_dataset)

295000

In [8]:
# Mini-batch loaders. Training batches are reshuffled on every pass. The
# held-out loader keeps the dataset order so evaluation runs are repeatable and
# predictions stay aligned with the rows of `test_df`.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=0, pin_memory=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0, pin_memory=False)

In [9]:
# Pull a single batch from the training loader and print its feature width
# (second dimension of the feature tensor). Nothing is printed if the loader is empty.
_first_batch = next(iter(train_dataloader), None)
if _first_batch is not None:
    x, y = _first_batch
    print(x.shape[1])

34


In [10]:
class MyNet(nn.Module):
    """Fully connected regressor: two ReLU hidden layers and one linear output.

    Args:
        input_size: number of input features per sample.
        hidden_size_1: width of the first hidden layer.
        hidden_size_2: width of the second hidden layer.
        **kwargs: forwarded unchanged to ``nn.Module.__init__``.

    ``forward`` maps a ``(batch, input_size)`` tensor to ``(batch, 1)``.
    """

    def __init__(self, input_size, hidden_size_1, hidden_size_2, **kwargs):
        super().__init__(**kwargs)
        self.l1 = nn.Linear(input_size, hidden_size_1)
        self.a1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.a2 = nn.ReLU()
        self.l3 = nn.Linear(hidden_size_2, 1)
        # Regression head stays linear. A ReLU here yields a zero gradient
        # whenever the single output pre-activation is negative, which can
        # leave the network stuck predicting 0. The `a3` attribute is kept
        # (as a pass-through) so code that refers to it keeps working.
        self.a3 = nn.Identity()

    def forward(self, X):
        hidden = self.a1(self.l1(X))
        hidden = self.a2(self.l2(hidden))
        return self.a3(self.l3(hidden))

In [11]:
def xavier_init_weights(m):
    """Xavier-uniform initialise the weights of a Linear or GRU module in place.

    Written to be passed to ``nn.Module.apply``. Only modules whose type is
    exactly ``nn.Linear`` or ``nn.GRU`` are touched (subclasses are skipped),
    and only parameters whose name contains "weight"; biases are left as they are.
    """
    kind = type(m)
    if kind == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    elif kind == nn.GRU:
        # The GRU keeps its parameter names in _flat_weights_names; pick the
        # weight entries and skip the bias entries.
        weight_names = [name for name in m._flat_weights_names if "weight" in name]
        for name in weight_names:
            nn.init.xavier_uniform_(m._parameters[name])

In [12]:
from IPython import display
from matplotlib import pyplot as plt
def train(net, data_iter, loss_fn, opti, max_epoch, device):
    """Train ``net`` for ``max_epoch`` passes over ``data_iter`` and live-plot the loss.

    Weights are re-initialised through ``xavier_init_weights`` on every call,
    so calling this again restarts training instead of resuming it.

    Args:
        net: ``nn.Module`` to optimise; it is moved to ``device`` in place.
        data_iter: re-iterable yielding ``(X, Y)`` tensor batches.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        opti: optimizer already constructed over ``net.parameters()``.
        max_epoch: number of full passes over ``data_iter``.
        device: ``torch.device`` the model and every batch are moved to.

    Returns:
        None. The per-epoch mean training loss is drawn in the cell output.
    """
    net.apply(xavier_init_weights)
    # The batches are moved to `device` below, so the model has to live there
    # too. Module.to works in place, so the optimizer passed in by the caller
    # still refers to the same parameters.
    net.to(device)
    net.train()
    loss_history = []  # one entry per epoch: mean of that epoch's batch losses

    for _ in range(max_epoch):
        batch_losses = []
        for X, Y in data_iter:
            X = X.to(device)
            Y = Y.to(device)
            opti.zero_grad()
            loss = loss_fn(net(X), Y)
            loss.backward()
            opti.step()
            # Collect per-batch values separately so the curve holds only epoch means.
            batch_losses.append(loss.item())

        # An empty iterator would otherwise hit np.mean([]) and emit a warning.
        epoch_loss = float(np.mean(batch_losses)) if batch_losses else float("nan")
        loss_history.append(epoch_loss)

        # Redraw from scratch each epoch so lines do not pile up on the figure.
        plt.clf()
        plt.plot(np.arange(1, len(loss_history) + 1), loss_history)
        plt.xlabel("epoch")
        plt.ylabel("mean training loss")
        display.display(plt.gcf())
        # wait=True defers the clear until the next output arrives, so the
        # latest figure stays visible in the meantime.
        display.clear_output(wait=True)

In [16]:
# Optimisation hyperparameters.
lr = 5e-4
decay = 0.001  # weight-decay coefficient passed to AdamW
hidden_size_1, hidden_size_2 = 128, 64
max_epoch = 50

# Feature width comes straight from the dataset tensor; there is no need to
# draw (and discard) a shuffled batch just to read its shape.
input_size = train_dataset.features.shape[1]

# Put the model on the selected device before building the optimizer. The
# training loop moves every batch to `device`, and a model left on the CPU
# would fail with a device-mismatch error on a GPU machine.
net = MyNet(input_size, hidden_size_1, hidden_size_2).to(device)
opti = optim.AdamW(net.parameters(), lr=lr, weight_decay=decay)
loss_fn = nn.MSELoss()

In [17]:
# Start training with the model, optimizer and loss configured in the previous cell.
# NOTE(review): the recorded output of this cell is a CUDA device-side assert. Such an
# error presumably leaves the CUDA context unusable until the kernel is restarted --
# confirm by re-running from a fresh kernel (the message suggests CUDA_LAUNCH_BLOCKING=1
# to get an accurate stack trace).
train(net, train_dataloader, loss_fn, opti, max_epoch, device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
