In [336]:
import pandas as pd
import torch 

train_data=pd.read_csv("../data/lot/train.csv")
test_data=pd.read_csv("../data/lot/test.csv")


In [337]:
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


In [338]:
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
all_features.iloc[0:4]


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml


In [339]:
import numpy as np
indices=all_features.dtypes[all_features.dtypes != 'object'].index
all_features[indices].apply(
    lambda x: (x-x.mean())/x.std()
)
all_features[indices]=all_features[indices].fillna(0)
all_features=pd.get_dummies(all_features,dummy_na=True)
all_features.iloc[0:4]

arr = all_features.to_numpy(dtype=np.float32)
arr

array([[6.0000e+01, 6.5000e+01, 8.4500e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0000e+01, 8.0000e+01, 9.6000e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [6.0000e+01, 6.8000e+01, 1.1250e+04, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [2.0000e+01, 1.6000e+02, 2.0000e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [8.5000e+01, 6.2000e+01, 1.0441e+04, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       [6.0000e+01, 7.4000e+01, 9.6270e+03, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00]], dtype=float32)

In [340]:
n_train=train_data.shape[0]
train_feathures=torch.tensor(arr[:n_train],dtype=torch.float32)
test_features=torch.tensor(arr[n_train:],dtype=torch.float32)
labels=torch.tensor(train_data.SalePrice.values,dtype=torch.float32)
print(labels.sum(),labels.mean(),labels.std())

tensor(2.6414e+08) tensor(180921.1875) tensor(79442.5000)


In [None]:
from torch.utils import data
from torch import nn

def log_rmse(net,loss,feathures,labels):
    clip_pred=torch.clamp(net(feathures),1,float('inf'))
    rmse=torch.sqrt(loss(torch.log(clip_pred),torch.log(labels)))
    return rmse.item()

def get_k_fold_data(k,i,x,y):
    assert k > 1
    fold_size=x.shape[0] // k
    x_train,y_train=None,None
    for j in range(k):
        idx=slice(j*fold_size,(j+1)*fold_size)
        x_part,y_part=x[idx,:],y[idx]
        if i==j:
            x_valid,y_valid=x_part,y_part
        elif x_train is None:
            x_train,y_train=x_part,y_part
        else:
            x_train=torch.cat([x_train,x_part],0)
            y_train=torch.cat([y_train,y_part],0)
    return x_train,y_train,x_valid,y_valid

def data_iter(dds,batch_size):
    ds=data.TensorDataset(*dds)
    return data.DataLoader(ds,batch_size,True)

def init_params(m):
    if type(m)==nn.Linear:
        nn.init.normal_(m.weight,std=0.01)
        nn.init.zeros_(m.bias)

def train(trainx,trainy,k,num_epoch,batch_size,lr,):
    net=nn.Sequential(nn.Linear(trainx.shape[1],1))
    net.apply(init_params)
    for i in range(k):
        x_train,y_train,x_valid,y_valid=get_k_fold_data(k,i,trainx,trainy)
        train_iter=data_iter((x_train,y_train),batch_size)
        #test_iter=data_iter((x_valid,y_valid),batch_size)
        
        loss=nn.MSELoss()
        updater=torch.optim.Adam(net.parameters(),lr=lr)
        for epoch in range(num_epoch):
            for x,y in train_iter:
                updater.zero_grad()
                y_hat=net(x)
                l=loss(y_hat,y.reshape(y_hat.shape))
                l.backward()
                updater.step()
            with torch.no_grad():
                # train_y_hat=net(x_train)
                # train_loss=loss(train_y_hat,y_train.reshape(train_y_hat.shape))
                trainlog_loss=log_rmse(net,loss,x_train,y_train.reshape(len(y_train),-1))
                # test_y_hat=net(x_valid)
                # test_loss=loss(test_y_hat,y_valid.reshape(test_y_hat.shape))
                testlog_loss=log_rmse(net,loss,x_valid,y_valid.reshape(len(y_valid),-1))
                print(f'fold k:{i} epoch:{epoch} trainlog_loss:{trainlog_loss} testlog_loss:{testlog_loss}')



        

In [343]:
k=5
num_epoch=10
batch_size=50
lr=0.01
train(train_feathures,labels,k,num_epoch,batch_size,lr)

fold k:0 epoch:0 trainlog_loss:3.4598286151885986 testlog_loss:3.472022533416748
fold k:0 epoch:1 trainlog_loss:2.7564916610717773 testlog_loss:2.7679097652435303
fold k:0 epoch:2 trainlog_loss:2.355991840362549 testlog_loss:2.3667476177215576
fold k:0 epoch:3 trainlog_loss:2.074723720550537 testlog_loss:2.0848631858825684
fold k:0 epoch:4 trainlog_loss:1.856006145477295 testlog_loss:1.8655058145523071
fold k:0 epoch:5 trainlog_loss:1.686835765838623 testlog_loss:1.6957393884658813
fold k:0 epoch:6 trainlog_loss:1.5447273254394531 testlog_loss:1.5530025959014893
fold k:0 epoch:7 trainlog_loss:1.4229950904846191 testlog_loss:1.4306622743606567
fold k:0 epoch:8 trainlog_loss:1.3169111013412476 testlog_loss:1.3239209651947021
fold k:0 epoch:9 trainlog_loss:1.2257925271987915 testlog_loss:1.2321559190750122
fold k:1 epoch:0 trainlog_loss:3.3992254734039307 testlog_loss:3.3803157806396484
fold k:1 epoch:1 trainlog_loss:2.7241737842559814 testlog_loss:2.7065861225128174
fold k:1 epoch:2 trai

In [None]:
# from torch import nn
# from torch.utils import data

# lr=4
# epoch_num=1000
# batch_size=300
# num_inputs=train_feathures.shape[1]
# num_hidden=10
# num_output=1

# def init_data(m):
#     if type(m)==nn.Linear:
#         nn.init.normal_(m.weight,std=0.01)
#         nn.init.zeros_(m.bias)

# net=nn.Sequential(nn.Linear(num_inputs,1))
# net.apply(init_data)
# loss=nn.MSELoss()
# updater=torch.optim.Adam(net.parameters(),lr=lr)

# def data_iter(dds,batch_size):
#     ds=data.TensorDataset(*dds)
#     return data.DataLoader(ds,batch_size,True)

# train_iter=data_iter((train_feathures,labels),batch_size)
# for i in range(epoch_num):
#     for x,y in train_iter:
#         updater.zero_grad()
#         y_hat=net(x)
#         l=loss(y_hat,y.reshape(y_hat.shape))
#         l.backward()
#         updater.step()
#     with torch.no_grad():
#         train_x=loss(net(train_feathures),labels.reshape(len(labels),-1))
#         rmse=log_rmse(net,loss,train_feathures,labels.reshape(len(labels),-1))
#         print(f'i:{i} loss:{train_x.sum()} rmse:{rmse}')




i:0 loss:12300360704.0 rmse:0.48895829916000366
i:1 loss:7017857024.0 rmse:0.8262695074081421
i:2 loss:3676394240.0 rmse:0.6488840579986572
i:3 loss:5261285888.0 rmse:0.4122036397457123
i:4 loss:3510949888.0 rmse:0.625517725944519
i:5 loss:2364568320.0 rmse:0.23286159336566925
i:6 loss:2647398144.0 rmse:0.25794288516044617
i:7 loss:2605748736.0 rmse:0.22475965321063995
i:8 loss:2209977088.0 rmse:0.232025146484375
i:9 loss:2044600448.0 rmse:0.21386659145355225
i:10 loss:2064692608.0 rmse:0.21107909083366394
i:11 loss:2040970112.0 rmse:0.22068624198436737
i:12 loss:2007899392.0 rmse:0.2091904729604721
i:13 loss:1971053184.0 rmse:0.21179154515266418
i:14 loss:1949412096.0 rmse:0.20820389688014984
i:15 loss:1937644160.0 rmse:0.20838873088359833
i:16 loss:1926941184.0 rmse:0.209553062915802
i:17 loss:1917382272.0 rmse:0.2091505080461502
i:18 loss:1909062784.0 rmse:0.20877936482429504
i:19 loss:1901950976.0 rmse:0.20848076045513153
i:20 loss:1895885696.0 rmse:0.20817381143569946
i:21 loss:18