In [60]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 数据预处理
def encode_categorical(df):
    label_encoder = LabelEncoder()
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = label_encoder.fit_transform(df[column])
        elif df[column].dtype == 'float64' or df[column].dtype == 'int64':
            df[column] = df[column].fillna(df[column].mean())
    return df

df1 = pd.read_csv('train.csv')
df1 = encode_categorical(df1)


In [117]:
from sklearn.model_selection import train_test_split
import torch.utils.data as Data
import torch

batch_sizes = 32

x = df1.iloc[:,1:-1]
y= df1.iloc[:,-1:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2004)

x_train_tensor = torch.FloatTensor(x_train.values)
y_train_tensor = torch.FloatTensor(y_train.values)
x_test_tensor = torch.FloatTensor(x_test.values)
y_test_tensor = torch.FloatTensor(y_test.values)

train_dataset = Data.TensorDataset(x_train_tensor, y_train_tensor)
test_dataset = Data.TensorDataset(x_test_tensor, y_test_tensor)

train_dataloader = Data.DataLoader(train_dataset,batch_sizes,shuffle=True)
test_dataloader = Data.DataLoader(test_dataset,batch_sizes,shuffle=True)
len(train_dataset)

1168

In [124]:
import torch.nn as nn

class LinearNetWithSigmoid(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear_1 = nn.Linear(in_features, 40)  # nn.Module
        self.linear_2 = nn.Linear(40, 10)
        self.linear_3 = nn.Linear(10, out_features)
        self.sigmoid = nn.Sigmoid()  
        
    def forward(self, x):
        x = self.linear_1(x)
        x = self.sigmoid(x)  
        x = self.linear_2(x)
        x = self.sigmoid(x)
        x = self.linear_3(x)
        return x


In [125]:
# 设置优化器和损失函数
import torch.optim as optim
loss = nn.MSELoss()
model = LinearNet(79,1)
optimizer = optim.Adam(model.parameters(),lr=1e-3)
model(x_test_tensor)

tensor([[-0.4132],
        [-0.4133],
        [-0.1461],
        [-0.4133],
        [-0.4133],
        [-0.1653],
        [-0.4133],
        [-0.1653],
        [-0.1181],
        [-0.3601],
        [-0.3601],
        [-0.1653],
        [-0.1595],
        [-0.2818],
        [-0.3601],
        [-0.4133],
        [-0.1653],
        [-0.1653],
        [-0.1653],
        [-0.1644],
        [-0.1653],
        [-0.2136],
        [-0.1653],
        [-0.3601],
        [ 0.0649],
        [-0.4133],
        [-0.2047],
        [-0.1653],
        [-0.4133],
        [-0.2818],
        [-0.3737],
        [-0.1988],
        [-0.3048],
        [-0.4133],
        [-0.4133],
        [-0.4133],
        [-0.4133],
        [-0.4133],
        [-0.1181],
        [-0.4117],
        [-0.4133],
        [-0.4133],
        [-0.0356],
        [-0.3737],
        [-0.4133],
        [-0.4133],
        [-0.1653],
        [-0.1653],
        [-0.3683],
        [-0.1653],
        [-0.3608],
        [-0.4128],
        [-0.

In [126]:
# 训练模型
def train_model(model,dataloader):
    model.train()
    
    total_loss = 0
    for idx,(x,y) in enumerate(dataloader):
        y_pred = model(x)
        cur_loss = loss(y_pred,y)
        optimizer.zero_grad()
        cur_loss.backward()
        optimizer.step()
        
        total_loss += cur_loss.item()
    print(f"train loss:{total_loss/len(train_dataset)}")
    

In [127]:
Epoch = 100
for i in range(Epoch):
    train_model(model,train_dataloader)

train loss:1206009564.9315069
train loss:1209982579.7260275
train loss:1207031920.2191782
train loss:1213477670.5753424
train loss:1208056374.3561645
train loss:1208303517.8082192
train loss:1203599828.1643836
train loss:1212322810.739726
train loss:1210287149.589041
train loss:1204484455.4520547
train loss:1202990095.7808218
train loss:1207324470.3561645
train loss:1203756670.2465754
train loss:1213032837.260274
train loss:1209658013.8082192
train loss:1203720000.8767123
train loss:1211018904.5479453
train loss:1222467868.0547945
train loss:1201804421.260274
train loss:1208227443.7260275
train loss:1205379010.630137
train loss:1207464370.8493152
train loss:1204507823.3424656
train loss:1205865687.671233
train loss:1210220640.4383562
train loss:1217093777.5342467
train loss:1214289734.1369863
train loss:1205551708.9315069
train loss:1213783602.8493152
train loss:1207274227.7260275
train loss:1205523712.0
train loss:1206202562.630137
train loss:1204655566.9041095
train loss:1205153358.9

In [128]:
# 验证模型
def test_model(model, dataloader):
    model.eval()

    total_loss = 0.
    for idx, (x, y) in enumerate(dataloader):
        y_pred = model(x)
        cur_loss = loss(y_pred, y)
        total_loss += cur_loss.item()
    print(f"Test loss: {total_loss/len(test_dataset)}")

In [123]:
num_epoch = 50
for i in range(num_epoch):
    print(f"==== Epoch {i} ====")
    train_model(model, train_dataloader)
    test_model(model, test_dataloader)

==== Epoch 0 ====
train loss:1211683392.8767123
Test loss: 1410238218.5205479
==== Epoch 1 ====
train loss:1208887553.7534246
Test loss: 1424549439.1232877
==== Epoch 2 ====
train loss:1210246776.9863014
Test loss: 1827541083.1780822
==== Epoch 3 ====
train loss:1204169854.2465754
Test loss: 1409016242.8493152
==== Epoch 4 ====
train loss:1210650806.3561645
Test loss: 1433728000.0
==== Epoch 5 ====
train loss:1207500782.4657533
Test loss: 1844194710.7945206
==== Epoch 6 ====
train loss:1204756536.109589
Test loss: 1398098424.9863014
==== Epoch 7 ====
train loss:1208391690.5205479
Test loss: 1413409413.260274
==== Epoch 8 ====
train loss:1207075489.3150685
Test loss: 1384307150.9041095
==== Epoch 9 ====
train loss:1209338017.3150685
Test loss: 1488872938.958904
==== Epoch 10 ====
train loss:1207906167.2328768
Test loss: 1422450337.3150685
==== Epoch 11 ====
train loss:1210043216.6575344
Test loss: 1451432258.630137
==== Epoch 12 ====
train loss:1203827296.4383562
Test loss: 1371583081.2