In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
for dirname, _, filenames in os.walk('./california-house-prices'):
    print(dirname)
    for filename in filenames:
        print(os.path.join(dirname, filename))

./california-house-prices
./california-house-prices\sample_submission.csv
./california-house-prices\test.csv
./california-house-prices\train.csv


In [3]:
train_data = pd.read_csv('./california-house-prices/train.csv')
test_data = pd.read_csv('./california-house-prices/test.csv')

In [4]:
train_data.shape, test_data.shape

((47439, 41), (31626, 40))

In [5]:
train_data.head(1)

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,0,540 Pine Ln,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,...,"Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022,CA


In [6]:
test_data.head(1)

Unnamed: 0,Id,Address,Summary,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,47439,3034 N Coolidge Ave,Live within steps to the scenic views on the L...,SingleFamily,2020.0,Central,Central Air,Tandem Uncovered,940.0,2,...,Tandem Uncovered,,,2020-11-06,799900.0,2020-07-01,819000.0,Dodgertown,90090,CA


In [7]:
all_features = pd.concat((train_data.loc[:, train_data.columns != 'Sold Price'], test_data.iloc[:, 1:]))

In [8]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79065 entries, 0 to 31625
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Id                           47439 non-null  float64
 1   Address                      79065 non-null  object 
 2   Summary                      78219 non-null  object 
 3   Type                         79065 non-null  object 
 4   Year built                   77123 non-null  float64
 5   Heating                      66858 non-null  object 
 6   Cooling                      48921 non-null  object 
 7   Parking                      76740 non-null  object 
 8   Lot                          56076 non-null  float64
 9   Bedrooms                     74467 non-null  object 
 10  Bathrooms                    73655 non-null  float64
 11  Full bathrooms               66137 non-null  float64
 12  Total interior livable area  75187 non-null  float64
 13  Total spaces         

In [9]:
all_features.head(1)

Unnamed: 0,Id,Address,Summary,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,0.0,540 Pine Ln,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,"Ground Floor Bedroom, Master Bedroom on Ground...",...,"Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022,CA


In [10]:
all_features.tail(1)

Unnamed: 0,Id,Address,Summary,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
31625,,3875 Stewart Ave,DUPLEX STYLE TOWNHOUSE IN GATED COMMUNITY ONE ...,Townhouse,1993.0,Forced Air,Central Air,"Garage Door Opener, Parking Space",,4,...,"Garage Door Opener, Parking Space",543150.0,7281.0,2018-05-08,559000.0,2018-07-17,532500.0,Baldwin Park,91706,CA


In [11]:
# 假设 all_features 已经定义并加载
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

# 标准化数值特征
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / x.std())

# 填充缺失值
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [12]:
all_features = all_features[numeric_features[1:]]
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79065 entries, 0 to 31625
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year built                   79065 non-null  float64
 1   Lot                          79065 non-null  float64
 2   Bathrooms                    79065 non-null  float64
 3   Full bathrooms               79065 non-null  float64
 4   Total interior livable area  79065 non-null  float64
 5   Total spaces                 79065 non-null  float64
 6   Garage spaces                79065 non-null  float64
 7   Elementary School Score      79065 non-null  float64
 8   Elementary School Distance   79065 non-null  float64
 9   Middle School Score          79065 non-null  float64
 10  Middle School Distance       79065 non-null  float64
 11  High School Score            79065 non-null  float64
 12  High School Distance         79065 non-null  float64
 13  Tax assessed value   

In [13]:
import torch

n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data['Sold Price'].values.reshape(-1, 1), dtype=torch.float32)

# 开始 

In [14]:
print(f'train_features shape: {train_features.shape}')
print(f'test_features shape: {test_features.shape}')
print(f'train_labels shape: {train_labels.shape}')

train_features shape: torch.Size([47439, 18])
test_features shape: torch.Size([31626, 18])
train_labels shape: torch.Size([47439, 1])


In [15]:
from torch import nn

loss = nn.MSELoss()

in_features = train_features.shape[1]

net = nn.Sequential(nn.Linear(in_features, 1))

optimizer = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=0)

In [16]:
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(train_features, train_labels)
train_iter = DataLoader(dataset, 64)

In [19]:
from tqdm import tqdm

for epoch in range(100):
    pbar = tqdm(train_iter, total=100, desc=f'Epoch {epoch}/{100}', leave=False)
    for X, y in pbar:
        optimizer.zero_grad()
        l = loss(net(X), y)
        l.backward()
        optimizer.step()
        pbar.set_postfix({'Loss': l.item()})
        pbar.update(1)

                                                                              

In [20]:
result = net(test_features[0:1])

In [21]:
result

tensor([[2721.4497]], grad_fn=<AddmmBackward0>)

In [None]:

\]