In [320]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset, random_split
import time

import pandas as pd
import numpy as np



In [321]:
train_data = pd.read_csv('./data/CaliforniaHousePrices/train.csv')
test_data = pd.read_csv('./data/CaliforniaHousePrices/test.csv')

In [322]:
train_data.head()

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,0,540 Pine Ln,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,...,"Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022,CA
1,1,1727 W 67th St,505000.0,"HURRY, HURRY.......Great house 3 bed and 2 bat...",SingleFamily,1926.0,Combination,"Wall/Window Unit(s), Evaporative Cooling, See ...","Detached Carport, Garage",4047.0,...,"Detached Carport, Garage",505000.0,6253.0,2019-10-16,525000.0,2019-08-30,328000.0,Los Angeles,90047,CA
2,2,28093 Pine Ave,140000.0,'THE PERFECT CABIN TO FLIP! Strawberry deligh...,SingleFamily,1958.0,Forced air,,0 spaces,9147.0,...,,49627.0,468.0,2019-08-25,180000.0,,,Strawberry,95375,CA
3,3,10750 Braddock Dr,1775000.0,Rare 2-story Gated 5 bedroom Modern Mediterran...,SingleFamily,1947.0,Central,Central Air,"Detached Carport, Driveway, Garage - Two Door",,...,"Detached Carport, Driveway, Garage - Two Door",1775000.0,20787.0,2019-10-24,1895000.0,2016-08-30,1500000.0,Culver City,90230,CA
4,4,7415 O Donovan Rd,1175000.0,Beautiful 200 acre ranch land with several pas...,VacantLand,,,,0 spaces,,...,,,,2019-06-07,1595000.0,2016-06-27,900000.0,Creston,93432,CA


In [323]:
test_data.dtypes

Id                               int64
Address                         object
Summary                         object
Type                            object
Year built                     float64
Heating                         object
Cooling                         object
Parking                         object
Lot                            float64
Bedrooms                        object
Bathrooms                      float64
Full bathrooms                 float64
Total interior livable area    float64
Total spaces                   float64
Garage spaces                  float64
Region                          object
Elementary School               object
Elementary School Score        float64
Elementary School Distance     float64
Middle School                   object
Middle School Score            float64
Middle School Distance         float64
High School                     object
High School Score              float64
High School Distance           float64
Flooring                 

In [324]:
print(train_data.shape)
print(test_data.shape)

(47439, 41)
(31626, 40)


In [325]:
labels = train_data['Sold Price']
train_data = train_data.drop(['Sold Price'], axis=1)

In [326]:
# 丢弃id
all_features = pd.concat( (train_data.iloc[:, 1:], test_data.iloc[:, 1:]) )

In [327]:
date2 = pd.to_datetime(all_features['Listed On'].max())

all_features['Listed On'] = all_features['Listed On'].apply(
    lambda x: (date2 - pd.to_datetime(x)).days
).astype('float64')
all_features['Listed On']

0         494.0
1         502.0
2         554.0
3         494.0
4         633.0
          ...  
31621     124.0
31622     188.0
31623    3026.0
31624     219.0
31625    1028.0
Name: Listed On, Length: 79065, dtype: float64

In [328]:
numberic_features = all_features.dtypes[ all_features.dtypes != 'object' ].index
all_features[ numberic_features ] = all_features[ numberic_features ].apply(
    lambda x: (x-x.mean()) / x.std()
)
all_features[ numberic_features ] = all_features[ numberic_features ].fillna(0)
# 先计算均值方差后才能填充均值 否则影响真实数据

In [329]:
object_features = all_features.dtypes[ all_features.dtypes == 'object' ].index
print( object_features )
for col in object_features:
    value_counts = all_features[col].value_counts()

    while len(value_counts) > 20:
        threshold = value_counts.mean()
        all_features[col] = all_features[col].mask(all_features[col].map(value_counts) < threshold, 'others')
        value_counts = all_features[col].value_counts()
# 独热编码 把na作为一个类
all_features = pd.get_dummies(all_features, dummy_na=True)

# bool -> float
bool_index = all_features.dtypes[all_features.dtypes=='bool'].index
all_features[bool_index] = all_features[bool_index].astype('float32')

Index(['Address', 'Summary', 'Type', 'Heating', 'Cooling', 'Parking',
       'Bedrooms', 'Region', 'Elementary School', 'Middle School',
       'High School', 'Flooring', 'Heating features', 'Cooling features',
       'Appliances included', 'Laundry features', 'Parking features',
       'Last Sold On', 'City', 'State'],
      dtype='object')


In [330]:
# all_features = all_features[numberic_features]

In [331]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79065 entries, 0 to 31625
Columns: 178 entries, Year built to State_nan
dtypes: float32(159), float64(19)
memory usage: 60.0 MB


In [332]:
all_features.shape

(79065, 178)

In [333]:
source_labels = labels
labels = np.log(labels)

In [334]:
n_train = train_data.shape[0]
train_features = torch.tensor( all_features[:n_train].values, dtype=torch.float32 )
test_features = torch.tensor( all_features[n_train:].values, dtype=torch.float32 )
train_labels = torch.tensor( labels.values, dtype=torch.float32 ).reshape(n_train, 1)

In [335]:
print(train_features.shape)
print(test_features.shape)
print(train_labels.shape)

torch.Size([47439, 178])
torch.Size([31626, 178])
torch.Size([47439, 1])


In [336]:
full_dataset = TensorDataset(train_features, train_labels)

# 划分数据集
num_samples = len(full_dataset)
train_size = int(0.8 * num_samples)  # 80% 用于训练
val_size = num_samples - train_size   # 剩下的 20% 用于验证

# 使用 random_split 划分数据集
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])


In [337]:
batch_size = 512
train_dataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataLoader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataLoader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [355]:
# 定义MLP模型
class MLP(nn.Module):
    def __init__(self, in_features):
        
        super(MLP, self).__init__()

        self.in_features = in_features
        self.fc1 = nn.Linear(in_features,64)
        self.fc2 = nn.Linear(64, 1)
        

        self.ReLU = nn.ReLU()
        self.dropout = nn.Dropout(p = 0.3)
    
    def forward(self, x):
        x = x.view(-1, self.in_features)
        
        x = self.fc1(x)
        x = self.ReLU(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = self.ReLU(x)
        # x = self.dropout(x)

        # x = self.fc3(x)
        # x = self.ReLU(x)

        return x
        

In [339]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Traing Device {device}')

Traing Device cuda


In [360]:
model = MLP( train_features.shape[1] ).to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [361]:
start = time.time()

num_epochs = 50
print_num = 1

train_ls, test_ls = [], []

for epoch in range(num_epochs):
    model.train()
    sum_loss = 0

    for features, labels in train_dataLoader:
        
        features = features.to(device)
        labels = labels.to(device)

        outputs = model(features)

        loss = criterion(outputs, labels)
        sum_loss += loss.item() * len(features)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % print_num == 0: print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {sum_loss/len(train_features):.4f}')
print(f'Traing Time = {time.time() - start}')

Epoch [1/50], Loss: 36.7511
Epoch [2/50], Loss: 5.8555
Epoch [3/50], Loss: 5.6207
Epoch [4/50], Loss: 3.2220
Epoch [5/50], Loss: 2.3761
Epoch [6/50], Loss: 1.8608
Epoch [7/50], Loss: 1.4418
Epoch [8/50], Loss: 1.0972
Epoch [9/50], Loss: 0.8386
Epoch [10/50], Loss: 0.7263
Epoch [11/50], Loss: 0.5193
Epoch [12/50], Loss: 0.3989
Epoch [13/50], Loss: 0.3401
Epoch [14/50], Loss: 0.2782
Epoch [15/50], Loss: 0.2603
Epoch [16/50], Loss: 0.2400
Epoch [17/50], Loss: 0.2395
Epoch [18/50], Loss: 0.2048
Epoch [19/50], Loss: 0.1890
Epoch [20/50], Loss: 0.1818
Epoch [21/50], Loss: 0.1745
Epoch [22/50], Loss: 0.2964
Epoch [23/50], Loss: 0.1892
Epoch [24/50], Loss: 0.1798
Epoch [25/50], Loss: 0.1774
Epoch [26/50], Loss: 0.1753
Epoch [27/50], Loss: 0.1741
Epoch [28/50], Loss: 0.1674
Epoch [29/50], Loss: 0.1711
Epoch [30/50], Loss: 0.1628
Epoch [31/50], Loss: 0.1579
Epoch [32/50], Loss: 0.1629
Epoch [33/50], Loss: 0.1821
Epoch [34/50], Loss: 0.1635
Epoch [35/50], Loss: 0.1652
Epoch [36/50], Loss: 0.1620


In [362]:
test_dataset[0]

(tensor([ 0.4436, -0.0184,  1.3845,  0.9547, -0.0039,  0.1556,  0.1688,  1.1056,
         -0.2935,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.1560,
          0.1211, -1.4717]),)

In [371]:
test_features.shape

tensor([[ 0.4436, -0.0184,  1.3845,  ...,  0.0000,  1.0000,  0.0000],
        [-0.3002, -0.0180, -0.3368,  ...,  0.0000,  1.0000,  0.0000],
        [ 0.4436, -0.0184,  1.3845,  ...,  0.0000,  1.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  1.0000,  0.0000],
        [ 0.1647,  0.0000,  0.5238,  ...,  0.0000,  1.0000,  0.0000],
        [ 0.2345,  0.0000, -0.3368,  ...,  0.0000,  1.0000,  0.0000]])

In [376]:
res = []

with torch.no_grad():
    for features in test_features:
        features = features.cuda()

        output = torch.exp(model(features)).cpu()
        res.append(output)
        # print(features.shape)
        # break

In [378]:
num = 47439

ans = []
for item in res:
    for j in item: 
        ans.append({
            "Id": num,
            "Sold Price": j.item()
        })
        num += 1

In [380]:
ans

[{'Id': 47439, 'Sold Price': 886754.9375},
 {'Id': 47440, 'Sold Price': 713125.1875},
 {'Id': 47441, 'Sold Price': 703453.1875},
 {'Id': 47442, 'Sold Price': 952799.1875},
 {'Id': 47443, 'Sold Price': 1166659.5},
 {'Id': 47444, 'Sold Price': 677613.625},
 {'Id': 47445, 'Sold Price': 1305983.625},
 {'Id': 47446, 'Sold Price': 328986.0625},
 {'Id': 47447, 'Sold Price': 1351584.25},
 {'Id': 47448, 'Sold Price': 623680.4375},
 {'Id': 47449, 'Sold Price': 348097.09375},
 {'Id': 47450, 'Sold Price': 793726.75},
 {'Id': 47451, 'Sold Price': 215747.125},
 {'Id': 47452, 'Sold Price': 415427.4375},
 {'Id': 47453, 'Sold Price': 484617.84375},
 {'Id': 47454, 'Sold Price': 551288.0625},
 {'Id': 47455, 'Sold Price': 308486.34375},
 {'Id': 47456, 'Sold Price': 1303757.375},
 {'Id': 47457, 'Sold Price': 220689.078125},
 {'Id': 47458, 'Sold Price': 166595.65625},
 {'Id': 47459, 'Sold Price': 243375.53125},
 {'Id': 47460, 'Sold Price': 1282250.0},
 {'Id': 47461, 'Sold Price': 1302985.5},
 {'Id': 47462, 

In [381]:
df = pd.DataFrame(ans)

In [382]:
df.to_csv('output.csv', encoding='utf-8', index=False)