In [None]:
'''
pd.concat():默认按行进行拼接的。
datarframe.index:取列的索引,[0,1,2,3...]
dataframe.dtypes:查看每一列的数据类型,若是这一列不是数字，则数据类型是object
dataframe.values:将dataframe转换成array
dataframe.列名：取dataframe的一列数据
dataframe.apply(lambda x:f(x)):这里的x默认操作的是dataframe的每一列。
pd.get_dummies(all_features,dummy_na = True):对dataframe进行one-hot编码
一个元祖在传入函数中的时候，要加上*
'''

In [1]:
!pip install pandas

Looking in indexes: http://pypi.douban.com/simple


In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
import d2lzh_pytorch as d2l

print(torch.__version__)
torch.set_default_tensor_type(torch.FloatTensor)

1.3.1


In [3]:
train_data = pd.read_csv('./data/kaggle_house/train.csv')
test_data = pd.read_csv('./data/kaggle_house/test.csv')

In [4]:
print(type(train_data))

<class 'pandas.core.frame.DataFrame'>


In [5]:
train_data.shape, test_data.shape

((1460, 81), (1459, 80))

In [6]:
train_data.iloc[0:4,[0,1,2,3.-3.-2,-1]]

Unnamed: 0,Id,MSSubClass,MSZoning,SaleCondition,SalePrice
0,1,60,RL,Normal,208500
1,2,20,RL,Normal,181500
2,3,60,RL,Normal,223500
3,4,70,RL,Abnorml,140000


In [7]:
all_features = pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))

In [8]:
all_features.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1449, 1450, 1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458],
           dtype='int64', length=2919)

In [9]:
all_features['MSZoning'].dtypes=='object' #所以，要去掉不是数字的特征

True

In [10]:
num_index = all_features.dtypes[all_features.dtypes!='object'].index
print(num_index)#必须加上index才是取出来index

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


In [11]:
all_feature_num = all_features[num_index].apply(lambda x:(x-x.mean())/x.std())
#x.mean()默认的是求每一列数据的均值，x.mean(1)：求的是每一行的举止

In [12]:
all_feature_num = all_feature_num.fillna(0)

In [13]:
all_features = pd.get_dummies(all_features,dummy_na = True)
#对所有的特征进行one-hot编码.
all_features.shape

(2919, 331)

In [14]:
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values,dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values,dtype= torch.float)

In [15]:
train_labels = torch.tensor(train_data.SalePrice.values,dtype=torch.float).view(-1,1)

In [30]:
loss = nn.MSELoss()
def get_net(feature_num):
    net = nn.Linear(feature_num,1)
    for param in net.parameters():
        nn.init.normal_(param,mean=0,std=0.01)
    return net

#之前这里都是self.linear = nn.Linear(~,~)
#接着这里是self.linear(x),只不过这里，这次换成了net.

In [43]:
def log_rmse(net,feature,labels):
    with torch.no_grad():
        clipped_pred = torch.max(net(feature),torch.tensor(1.0))
        rmse = torch.sqrt(loss(clipped_pred.log(),labels.log()))
        return rmse

In [57]:
def train(net,train_features,train_labels,test_features,test_labels,
         num_epochs,learning_rate,weight_decay,batch_size):
    train_ls,test_ls = [],[]
    dataset = torch.utils.data.TensorDataset(train_features,train_labels)
    train_iter = torch.utils.data.DataLoader(dataset,batch_size,shuffle=True)
    optimizer = torch.optim.Adam(net.parameters(),learning_rate)
    net = net.float()
    for epoch in range(num_epochs):
        for x,y in train_iter:
            l = loss(net(x.float()),y.float())
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net,train_features,train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net,test_features,test_labels))
        return train_ls,test_ls

In [58]:
def get_k_fold_data(k, i, X, y):
    # 返回第i折交叉验证时所需要的训练和验证数据
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid

In [63]:
def k_fold(k,x_train,y_train,num_epoches,learning_rate,weight_decay,batch_size):
    train_l_sum,valid_l_sum=0,0

    for i in range(k):
        data = get_k_fold_data(k,i,x_train,y_train)
        net = get_net(x_train.shape[1])
        train_ls,valid_ls  = train(net,*data,num_epoches,learning_rate,
                                  weight_decay,batch_size)
        train_l_sum+=train_ls[-1]
        valid_l_sum+=valid_ls[-1]
        print('fold %d, trian rsme %f, valid lmse %f'%(i,train_ls[-1],valid_ls[-1]))
    return train_l_sum/k, valid_l_sum/k

In [64]:
k,num_epoches,lr,weight_decay,batch_size=5,100,5,0,64
train_l,valid_l = k_fold(k,train_features,train_labels,num_epoches,
                        lr,weight_decay,batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))

fold 0, trian rsme 12.028135, valid lmse 12.040851
fold 1, trian rsme 12.029689, valid lmse 12.034640
fold 2, trian rsme 12.027271, valid lmse 12.044302
fold 3, trian rsme 12.039710, valid lmse 11.994490
fold 4, trian rsme 12.028587, valid lmse 12.039045
5-fold validation: avg train rmse 12.030679, avg valid rmse 12.030665
