In [1]:
from IPython.display import display
import pandas as pd

mls_data = pd.read_csv("standardized-mls-data.csv")
mls_data.head()

Unnamed: 0.1,Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Bedrooms,Association Fee,Address - Zip Code,Pool,Stories,Full Baths,Half Baths
0,0,0.22579,0.241465,0.14308,0.574006,-0.266296,-1.130717,-0.468623,-0.616885,-0.183358,1.847107
1,1,1.079862,1.64051,0.003674,0.574006,-0.266296,-0.053129,-0.468623,1.563242,1.35784,1.847107
2,2,-0.239254,-0.513486,-0.590704,-0.630825,0.645194,1.563253,-0.468623,1.563242,-0.183358,1.847107
3,3,-0.357478,-0.38579,-0.319918,-0.630825,-0.266296,-0.053129,-0.468623,-0.616885,-0.183358,-0.52829
4,4,-0.232492,-0.617327,0.180255,-0.630825,-0.266296,-1.130717,-0.468623,-0.616885,-0.183358,-0.52829


In [2]:
row_count = len(mls_data.index)
print(f"The dataset has {row_count} rows")

input_columns = list(mls_data.columns)
print(input_columns)

The dataset has 1866 rows
['Unnamed: 0', 'Close Price', 'Approx SqFt', 'Lot Size - Acres', 'Bedrooms', 'Association Fee', 'Address - Zip Code', 'Pool', 'Stories', 'Full Baths', 'Half Baths']


In [3]:
mls_data = mls_data.drop(columns = ["Unnamed: 0"])
mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Bedrooms,Association Fee,Address - Zip Code,Pool,Stories,Full Baths,Half Baths
0,0.22579,0.241465,0.14308,0.574006,-0.266296,-1.130717,-0.468623,-0.616885,-0.183358,1.847107
1,1.079862,1.64051,0.003674,0.574006,-0.266296,-0.053129,-0.468623,1.563242,1.35784,1.847107
2,-0.239254,-0.513486,-0.590704,-0.630825,0.645194,1.563253,-0.468623,1.563242,-0.183358,1.847107
3,-0.357478,-0.38579,-0.319918,-0.630825,-0.266296,-0.053129,-0.468623,-0.616885,-0.183358,-0.52829
4,-0.232492,-0.617327,0.180255,-0.630825,-0.266296,-1.130717,-0.468623,-0.616885,-0.183358,-0.52829


# Get the data into tensors

In [4]:
output_columns = ["Close Price"]
input_columns = ['Approx SqFt', 'Lot Size - Acres', 'Bedrooms', 'Association Fee', 'Address - Zip Code', 'Pool', 'Stories', 'Full Baths', 'Half Baths']

data_frame = mls_data.copy(deep = True)

inputs_array = data_frame[input_columns].to_numpy()
targets_array = data_frame[output_columns].to_numpy()

In [5]:
import torch

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

print(f"Using device {device}")

inputs = torch.Tensor(inputs_array).to(device)
targets = torch.Tensor(targets_array).to(device)

print(inputs.shape)
print(targets.shape)

Using device cuda
torch.Size([1866, 9])
torch.Size([1866, 1])


# Split the data between training and validation sets

In [6]:
from torch.utils.data import DataLoader, TensorDataset, random_split

dataset = TensorDataset(inputs, targets)

validation_percent = 0.3
validation_size = int(row_count * validation_percent)
training_size = row_count - validation_size

training_dataset, validation_dataset = random_split(dataset, [training_size, validation_size])
training_size, validation_size

(1307, 559)

In [7]:
batch_size = 16

training_loader = DataLoader(training_dataset, batch_size, shuffle = True)
validation_loader = DataLoader(validation_dataset, batch_size)

for X, y in training_loader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([16, 9])
Shape of y: torch.Size([16, 1]) torch.float32


# Create the model

In [16]:
from torch import nn

input_size = len(input_columns)
output_size = len(output_columns)

class LinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        output = self.linear(x)
        return output

    def training_step(self, batch):
        inputs, targets = batch
        output = self(inputs)
        loss = nn.functional.l1_loss(
            input = output,
            target = targets,
            size_average = None,
            reduce = None,
            reduction = 'mean'
        )
        return loss

    def validation_step(self, batch):
        inputs, targets = batch
        output = self(inputs)
        loss = nn.functional.l1_loss(
            input = output,
            target = targets,
            size_average = None,
            reduce = None,
            reduction = 'mean'
        )
        return {'val_loss': loss.detach()}

    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        if(epoch) % 10 == 0 or epoch == num_epochs - 1:
            print(f"Epoch {epoch}, val_loss: {result['val_loss']:.4f}")

model = LinearRegression().to(device)
print(model)

LinearRegression(
  (linear): Linear(in_features=9, out_features=1, bias=True)
)


In [17]:
def evaluate(model, validation_loader):
    outputs = [model.validation_step(batch) for batch in validation_loader]
    return model.validation_epoch_end(outputs)

def fit( epochs, lr, model, training_loader, validation_loader, optimizer_function = torch.optim.SGD):
    history = []
    optimizer = optimizer_function(model.parameters(), lr)
    for epoch in range(epochs):
        for batch in training_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        result = evaluate(model, validation_loader)
        model.epoch_end(epoch, result, epochs)
        history.append(result)
    
    return history

Loss before training

In [18]:
result = evaluate(model, validation_loader)
print(result)

{'val_loss': 0.9282631874084473}


# Do some training

In [None]:
epochs = 1000
lr = 1e-6
history = fit(epochs, lr, model, training_loader, validation_loader)

# Save the model

In [73]:
torch.save(model.state_dict(), "predict_house_prices_model.pth")
print("Saved PyTorchModel State to predict_house_prices_model.pth")

Saved PyTorchModel State to predict_house_prices_model.pth


In [46]:
validation_dataset[0]

(tensor([ 0.1068, -0.3056,  0.5740, -0.2663, -0.5919, -0.4686,  1.5632, -0.1834,
          1.8471], device='cuda:0'),
 tensor([-0.0658], device='cuda:0'))

In [76]:
input, target = validation_dataset[0]
model(input)

tensor([-0.1052], device='cuda:0', grad_fn=<AddBackward0>)

In [49]:
target

tensor([-0.0658], device='cuda:0')