Get Data

In [13]:
import os
import hashlib
import requests

cache_dir = './data'
def download_data(file, hash) :
    url = "https://p.cloudgav.com/"+file
    fname = os.path.join(cache_dir, url.split('/')[-1])
    print(f'file in {fname}')
    dl = False
    os.makedirs(cache_dir, exist_ok=True)

    if os.path.exists(fname):
        with open(fname, 'r+b') as f:
            content = f.read()
            if len(content ) == 0:
                print('empty file')
                dl = True
            else:
                sha256 = hashlib.sha256()
                sha256.update(content)
                print(f"sha256 is {sha256.hexdigest()}")
                if hash != sha256.hexdigest():
                    f.truncate(0)
                    dl = True
                else:
                    print("file is good, no need to re-download")
    else:
        dl = True

    if dl == True:
        print(f"Download to {fname}")
        with open(fname, 'w+b') as f:
            r = requests.get(url, stream=True, verify=True)
            if r.status_code != 200:
                raise SystemExit(f"status error {r.status_code}!")

            # assuming the download is always correct
            f.write(r.content)
            print("Done")

In [14]:
download_data("train.csv.zip", "1bac0c0bcbbbd4965a89edbe2dc632ab875d2f696b8a3c83f93fe1707666601d")
download_data("test.csv.zip", "caf72c51b825ed5481bd0df949f889b87266879dfd567fc6dd0bd6db095d4606")

file in ./data/train.csv.zip
sha256 is 1bac0c0bcbbbd4965a89edbe2dc632ab875d2f696b8a3c83f93fe1707666601d
file is good, no need to re-download
file in ./data/test.csv.zip
sha256 is caf72c51b825ed5481bd0df949f889b87266879dfd567fc6dd0bd6db095d4606
file is good, no need to re-download


Read data in, then filter out the features which are not useful

In [15]:
import zipfile
import pandas as pd

train_zip = zipfile.ZipFile(os.path.join(cache_dir, "train.csv.zip"))
test_zip = zipfile.ZipFile(os.path.join(cache_dir, "test.csv.zip"))

train_zip.extractall(cache_dir)
test_zip.extractall(cache_dir)

train_df = pd.read_csv(os.path.join(cache_dir, "train.csv"))
test_df = pd.read_csv(os.path.join(cache_dir, "test.csv"))

In [16]:
train_cols = sorted(train_df.columns)
test_cols = sorted(test_df.columns)

print(train_df.shape, train_cols)
print(test_df.shape,test_cols)


(47439, 41) ['Address', 'Annual tax amount', 'Appliances included', 'Bathrooms', 'Bedrooms', 'City', 'Cooling', 'Cooling features', 'Elementary School', 'Elementary School Distance', 'Elementary School Score', 'Flooring', 'Full bathrooms', 'Garage spaces', 'Heating', 'Heating features', 'High School', 'High School Distance', 'High School Score', 'Id', 'Last Sold On', 'Last Sold Price', 'Laundry features', 'Listed On', 'Listed Price', 'Lot', 'Middle School', 'Middle School Distance', 'Middle School Score', 'Parking', 'Parking features', 'Region', 'Sold Price', 'State', 'Summary', 'Tax assessed value', 'Total interior livable area', 'Total spaces', 'Type', 'Year built', 'Zip']
(31626, 40) ['Address', 'Annual tax amount', 'Appliances included', 'Bathrooms', 'Bedrooms', 'City', 'Cooling', 'Cooling features', 'Elementary School', 'Elementary School Distance', 'Elementary School Score', 'Flooring', 'Full bathrooms', 'Garage spaces', 'Heating', 'Heating features', 'High School', 'High School 

In [17]:
import matplotlib.pyplot as plt
from matplotlib_inline import backend_inline
from IPython import display
import numpy as np

class AnimatedLoss:
    """在动画中绘制数据"""
    def __init__(self, subplots_config, nrows=1, ncols=1, figsize=(3.5, 2.5)):
        self.use_svg_display()
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes]
        elif isinstance(self.axes, np.ndarray):
            self.axes = self.axes.flatten().tolist()
        self.subplots_config = subplots_config
        self.lines = [[] for _ in range(len(self.axes))]
        self._initialize_axes()

    @staticmethod
    def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
        axes.set_xlabel(xlabel)
        axes.set_ylabel(ylabel)
        axes.set_xscale(xscale)
        axes.set_yscale(yscale)
        axes.set_xlim(xlim)
        axes.set_ylim(ylim)
        if legend:
            axes.legend(legend)
        axes.grid()

    @staticmethod
    def use_svg_display():
        backend_inline.set_matplotlib_formats('svg')

    def _initialize_axes(self):
        for ax, config in zip(self.axes, self.subplots_config):
            self.set_axes(
                ax, 
                config.get('xlabel', None), 
                config.get('ylabel', None), 
                config.get('xlim', None), 
                config.get('ylim', None), 
                config.get('xscale', 'linear'), 
                config.get('yscale', 'linear'), 
                config.get('legend', None)
            )

    def add(self, x, y, subplot_index=0):
        config = self.subplots_config[subplot_index]
        if not isinstance(y, list):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.lines[subplot_index]:
            self.lines[subplot_index] = []
            for fmt, label in zip(config.get('fmts', ['-']), config.get('legend', [None]*n)):
                line, = self.axes[subplot_index].plot([], [], fmt, label=label)
                self.lines[subplot_index].append(line)

        for line, yi in zip(self.lines[subplot_index], y):
            new_x = np.append(line.get_xdata(), x)
            new_y = np.append(line.get_ydata(), yi)
            line.set_data(new_x, new_y)
        
        self.axes[subplot_index].relim()  # Recompute limits
        self.axes[subplot_index].autoscale_view()  # Autoscale
        self.axes[subplot_index].legend() 

        display.display(self.fig)
        display.clear_output(wait=True)

Drop not useful columns
TODO: please also remove multicollinearity, such as "Parking", and "Parking features", or combine them into one, and "State", "City", "Region" can be represented by Zip?

In [18]:
# check how is the difference between "Parking" and "Parking features"
# pd.set_option('display.max_row', None)
(train_df['Parking'] == train_df["Parking features"]).sum()
print(train_df['Parking'][2], "<===>", train_df['Parking features'][2])

mask = (train_df['Parking'] != train_df['Parking features'])
mask.sum()/train_df.shape[0]
# for parking, parking_feature in zip(train_df[mask]["Parking"], train_df[mask]["Parking features"]):
#     print(parking, "<===>",parking_feature)

0 spaces <===> nan


0.10069773814793735

In [19]:
import pprint
import re

def clean_string(s):
    s = s.lower()  # Strip and convert to lower case
    s = re.sub(r'\-+', ' ', s)  # Replace multiple - with a single space
    s = re.sub(r'\s+', ' ', s)  # Replace multiple spaces with a single space
    return s

def expand_col(df, col_name):
    col = df[col_name]
    col.fillna("0", inplace=True)
    # col is a pandas Series objects, so apply is working on its each element
    col = col.apply(lambda s: [e.strip() for e in clean_string(s).split(',')])
    # print(col)
    unique = set()
    for val in col:
        unique.update(val)
        # print(f"val is {val}, unique is {unique}")

    new_columns = pd.DataFrame()    
    for val in unique:
        new_columns= pd.concat([new_columns,col.apply(lambda x: int(val in x) ).to_frame(name=f'{col_name}_{val}')], axis=1)
    
    # print(new_columns.shape)
    
    df = df.join(new_columns)
    
    df.drop(col_name, axis=1, inplace=True)
    return df

def expand_col_date(df, col_name):
    col = df[col_name]
    col.fillna("0", inplace=True)
    # after read_cvs, xx/yy/zz turned into xx-yy-zz, zz is not important
    new_columns = pd.DataFrame()    

    new_columns[f'{col_name}_year'] = col.apply(lambda x: int(x.split('-')[0]) if '-' in x  else 0)
    new_columns[f'{col_name}_month'] = col.apply(lambda x: int(x.split('-')[1]) if '-' in x and len(x.split('-')) > 1 else 0)
    
    df = df.join(new_columns)

    df.drop(col_name, axis=1, inplace=True)

    return df

def print_col_start_with(name):
    expand = train_df.filter(regex=f'^{name}').columns
    pprint.pprint(f'{name}_expand to {len(expand)}')
    pprint.pprint(expand)

Clean up the data

In [20]:
print(train_df.dtypes)
# print(train_df.dtypes[train_df.dtypes != 'object'])
# 在Pandas中，当你用一个布尔Series去索引DataFrame时，它通常是用来选择行的，而不是列
# 因为train_df.dtypes 是行的形式：
# Id                                  int64
# Address                            object
# Sold Price                        float64
# Summary                            object
# Type                               object
#
# 所以，要选择列features，需要用这种形式来选取列的名字列表：                              ...   
numeric_features = train_df.dtypes[train_df.dtypes != 'object'].index

Id                               int64
Address                         object
Sold Price                     float64
Summary                         object
Type                            object
Year built                     float64
Heating                         object
Cooling                         object
Parking                         object
Lot                            float64
Bedrooms                        object
Bathrooms                      float64
Full bathrooms                 float64
Total interior livable area    float64
Total spaces                   float64
Garage spaces                  float64
Region                          object
Elementary School               object
Elementary School Score        float64
Elementary School Distance     float64
Middle School                   object
Middle School Score            float64
Middle School Distance         float64
High School                     object
High School Score              float64
High School Distance     

In [21]:
print(f"train data's shape xx", train_df.shape)
train_df.drop('Summary', axis=1, inplace=True)
train_df.drop('Id', axis=1, inplace=True)
train_df.drop('Address', axis=1, inplace=True) # zip is enough
train_df.drop('Heating', axis=1, inplace=True) # Heating features is enough
train_df.drop('Cooling', axis=1, inplace=True) # Cooling features is enough
train_df.drop('Parking', axis=1,inplace=True)  # Parking features is enough
train_df.drop('Region', axis=1,inplace=True)  # Parking features is enough
train_df.drop('State', axis=1,inplace=True)  # Parking features is enough
train_df.drop('City', axis=1,inplace=True)  # Parking features is enough

numeric_features = train_df.dtypes[train_df.dtypes != 'object'].index
numeric_features_not_price = [feature for feature in numeric_features if feature != "Sold Price" ]

train_df[numeric_features] = train_df[numeric_features].astype(float)
# don't normalized for labels
train_df[numeric_features_not_price] = train_df[numeric_features_not_price].apply(
    lambda x: (x - x.mean()) / (x.std()))

# TODO: fillna with x.mean
train_df[numeric_features] = train_df[numeric_features].fillna(0)

train_df = expand_col(train_df,"Appliances included")
train_df = expand_col(train_df,"Heating features")
train_df = expand_col(train_df,"Cooling features")
train_df = expand_col(train_df,"Parking features")
train_df = expand_col(train_df,"Flooring")
train_df = expand_col(train_df,"Laundry features")
train_df = expand_col(train_df,"Bedrooms")
train_df = expand_col(train_df,"Type")

train_df = expand_col_date(train_df, "Listed On")
train_df = expand_col_date(train_df, "Last Sold On")

train_df = pd.get_dummies(train_df, dummy_na=True)
# TODO: why TRUE or FALSE instead of 0/1 unless calling astype
train_df = train_df.astype(float)

# print_col_start_with('Type')
# print(train_df["Listed On_year"])
# pprint.pprint(train_df.columns.to_list())


pprint.pprint(f"train data's shape after get dummy {train_df.shape}")
train_df[0:10].to_csv('cleanup_data.csv', index=False)


train data's shape xx (47439, 41)
"train data's shape after get dummy (47439, 3562)"


In [22]:
import torch

dev_id = 0
if torch.cuda.is_available():
    print(f"found {torch.cuda.get_device_name(dev_id)}")
    device = torch.device(f"cuda:{dev_id}")
else:
    device = torch.device("cpu")
    
print(f"using device: {device}")

print(torch.cuda.memory_allocated())


train_data_tensor = torch.tensor(train_df.drop("Sold Price",axis=1).values, dtype=torch.float32).to(device)
train_label_tensor = torch.tensor(train_df["Sold Price"].values, dtype=torch.float32).to(device)
print(f"train_data_tensor dtype {train_data_tensor.dtype}")

print(torch.cuda.memory_allocated())


found NVIDIA GeForce RTX 2080 Ti
using device: cuda:0
675911168
train_data_tensor dtype torch.float32
675911168


split data into k-fold, one for verification and rest for training

In [23]:
import random
# TODO: make a a real k fold instead of random selection
def k_data_split(data_len, k):
    if k >= data_len or k < 3:
        raise SystemExit(f"k is too big or too small {k}")
    # print(f"data_len {data_len}")
    k_len = data_len//k
    index = list(range(0,data_len))
    random.shuffle(index)
    return index[0:k_len], index[k_len:]


# K = 10
# k_verify_df,k_train_df = k_data_split(train_df, K)
# print(f"k_verify_df len {len(k_verify_df)}, k_train_df len {len(k_train_df)}")
# print(f"k_train_df 0-9 ", k_train_df[0:10].to_string())
# print(f"k_verify_df 0-9 ", k_verify_df[0:10].to_string())


In [24]:
from torch import Tensor, nn
import torch
import warnings

class LogMseLoss(nn.Module):
    def __init__(self):
        super().__init__()
    # forward will be automatically called when this class is from nn.Module
    def forward(self, predicted, target):
        if not (predicted.size() == target.size()):
            warnings.warn(
                f"Using a target size ({target.size()}) that is different to the input size ({input.size()}). "
                "This will likely lead to incorrect results due to broadcasting. "
                "Please ensure they have the same size.",
                stacklevel=2,
            )
        
        predicted_mod = torch.clamp(predicted, 1)
        target_mod = torch.clamp(target, 1)
        r = torch.sqrt(torch.mean((torch.log(predicted_mod) - torch.log(target_mod)) ** 2))
        # print(f"log loss {r}, shape {r.shape}, predicted shape {predicted.shape}: {predicted[0:3]}, target shape {target.shape}: {target[0:3]}")
        return r

log_mse_loss = LogMseLoss()

loss = nn.MSELoss()

def nn_net():
    feature_size = train_df.shape[1]-1 # -1 to remove the label column
    return nn.Sequential(nn.Linear(feature_size,1))
    # return nn.Sequential(nn.Linear(feature_size,feature_size//2),
    #                      nn.ReLU(),
    #                      nn.Linear(feature_size//2, 1)) 
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

def init_kaiming(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

net = nn_net()
net.apply(init_kaiming)

Sequential(
  (0): Linear(in_features=3561, out_features=1, bias=True)
)

In [25]:
print("1", torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

1 675911168
1356857344


In [26]:
from torch.utils.data import DataLoader,Dataset, TensorDataset
import numpy as np

# Convert DataFrame to a PyTorch Dataset
class CustomDataset(TensorDataset):
    def __init__(self, data_tensor, label_tensor):
        # print(dataframe.apply(pd.to_numeric, errors='coerce').isnull())
        self.features = data_tensor
        # print(f"features dtype {self.features.dtype}")
        self.labels = label_tensor.reshape(self.features.shape[0], -1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

K = 10
epochs = 50
batch_size = 32
lr = 1e-3
weight_decay = 0
train_loss_mean, verify_loss_mean, train_loss_array, verify_loss_array = np.zeros((epochs)), np.zeros((epochs)), \
                                                np.zeros((K,epochs)), np.zeros((K,epochs)) 

net = net.to(device)

optimizer = torch.optim.Adam(net.parameters(),
                                 lr = lr,
                                 weight_decay = weight_decay)

In [27]:
print(torch.cuda.memory_allocated())
import torch


675926016


In [28]:
for i in range(K):
    # subplots_config = [
    #     {'xlabel': 'epoch', 'ylabel': 'loss', 'xlim': (0, epochs), 'ylim': (1.1, 1.6), 'xscale': 'linear', 'yscale': 'linear', 'legend': ['train', 'verify'], 'fmts': ['r-', 'b--']}
    # ]

    # animator = AnimatedLoss(subplots_config, nrows=1, ncols=1, figsize=(10, 5) )
    print(f' ===>   K:{i}', torch.cuda.memory_allocated())
    print(f'   reserved', torch.cuda.memory_reserved())    
    
    # print(f' ===> K:{i}')
    k_verify_index,k_train_index = k_data_split(len(train_data_tensor), K)
    k_verify_index,k_train_index = torch.tensor(k_verify_index).to(device),torch.tensor(k_train_index).to(device)
    # print(f"train data len {len(k_train_index)}, verify data len {len(k_verify_index)}")
    print(f'         1:', torch.cuda.memory_allocated())
    print(f'   reserved', torch.cuda.memory_reserved())    

    # k_train_data_tensor = train_data_tensor.index_select(0, k_train_index)
    # k_train_label_tensor = train_label_tensor.index_select(0, k_train_index)
    # k_verify_data_tensor = train_data_tensor.index_select(0, k_verify_index)
    # k_verify_label_tensor = train_label_tensor.index_select(0, k_verify_index)

    k_train_data_tensor = train_data_tensor[len(train_data_tensor)//K:]
    k_train_label_tensor = train_label_tensor[len(train_data_tensor)//K:]
    k_verify_data_tensor = train_data_tensor[0:len(train_data_tensor)//K:]
    k_verify_label_tensor = train_label_tensor[0:len(train_data_tensor)//K:]

    print(f'         2:', torch.cuda.memory_allocated())
    print(f'   reserved', torch.cuda.memory_reserved())


    # print(f"len of k_train_data_tensor {len(k_train_data_tensor)}")

    train_data = DataLoader(CustomDataset(k_train_data_tensor,k_train_label_tensor), shuffle=True, batch_size = batch_size)
    

    print(f'         3:', torch.cuda.memory_allocated())
    print(f'   reserved', torch.cuda.memory_reserved())    


    j = -1
    for ep in range(epochs):
        # print(f'\t\tepoch:{ep}')
        j += 1
        for X, y in train_data:
            # print(f"X shape, {X.shape} {X.dtype}, y shape {y.shape}")
            optimizer.zero_grad()
            l = loss(net(X),y)
            l.backward()
            optimizer.step() 
  
        print(f'       ep{ep}:', torch.cuda.memory_allocated())
        print(f'  reserved:', torch.cuda.memory_reserved())  
        with torch.no_grad():

            tlabel = net(k_train_data_tensor).reshape(k_train_label_tensor.shape[0])
            vlabel = net(k_verify_data_tensor).reshape(k_verify_label_tensor.shape[0])
            l1 = log_mse_loss(tlabel, k_train_label_tensor)
            l2 = log_mse_loss(vlabel, k_verify_label_tensor)
            train_loss_array[i][j] += l1
            verify_loss_array[i][j] += l2

            # animator.add([ep],[l1.to("cpu"),l2.to("cpu")])

        # print(f"l1 shape {l1.shape}: {l1}")
        # print(f'train label {tlabel[0:3]} \nvs\n{k_train_label_tensor[0:3]},\nverify label {vlabel[0:3]} \nvs \n{k_verify_label_tensor[0:3]}')
        
        train_loss_array[i][j] += log_mse_loss(tlabel, k_train_label_tensor)
        verify_loss_array[i][j] += log_mse_loss(vlabel, k_verify_label_tensor)
        
        

# index is epochs#
train_loss_mean = train_loss_array.sum(axis=0) / train_loss_array.shape[0]
verify_loss_mean = verify_loss_array.sum(axis=0) / verify_loss_array.shape[0]
# train_loss_array.shape, verify_loss_array.shape,  train_loss_mean, verify_loss_mean

 ===>   K:0 675926016
   reserved 1356857344
         1: 676306432
   reserved 1356857344
         2: 676306432
   reserved 1356857344
         3: 676306432
   reserved 1356857344
       ep0: 676466176
  reserved: 1356857344
       ep1: 676657664
  reserved: 1356857344
       ep2: 676657664
  reserved: 1356857344
       ep3: 676657664
  reserved: 1356857344
       ep4: 676657664
  reserved: 1356857344
       ep5: 676657664
  reserved: 1356857344
       ep6: 676657664
  reserved: 1356857344
       ep7: 676657664
  reserved: 1356857344
       ep8: 676657664
  reserved: 1356857344
       ep9: 676657664
  reserved: 1356857344
       ep10: 676657664
  reserved: 1356857344
       ep11: 676657664
  reserved: 1356857344
       ep12: 676657664
  reserved: 1356857344
       ep13: 676657664
  reserved: 1356857344
       ep14: 676657664
  reserved: 1356857344
       ep15: 676657664
  reserved: 1356857344
       ep16: 676657664
  reserved: 1356857344
       ep17: 676657664
  reserved: 1356857344
  

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Plotting the training and validation loss
plt.plot(np.arange(epochs), train_loss_mean, marker='o', label='Training Loss')
plt.plot(np.arange(epochs), verify_loss_mean, marker='x', label='Validation Loss')

# Adding title and labels
plt.title('Training vs. Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()  # This adds a legend to distinguish between the two lines
plt.grid(True)
plt.show()

In [None]:
print(torch.cuda.memory_allocated())

In [None]:
print(device)
print(net)

In [None]:
# clean up GPU resource
del train_data_tensor,train_label_tensor, net, optimizer, train_loss_array, verify_loss_array, train_loss_mean, verify_loss_mean
torch.cuda.empty_cache()
 

In [None]:
print(f'After clean:', torch.cuda.memory_allocated())
print(f'    reserved', torch.cuda.memory_reserved())