# Advanced House Price Prediction Using PyTorch

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the ten columns this notebook works with and discard any row that has a missing value.
df=pd.read_csv(
    "houseprice.csv",
    usecols=[
        "SalePrice","MSSubClass","MSZoning","LotFrontage","LotArea",
        "Street","YearBuilt","LotShape","1stFlrSF","2ndFlrSF",
    ],
).dropna()

In [3]:
df.shape

(1201, 10)

In [4]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   object 
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   object 
 5   LotShape     1201 non-null   object 
 6   YearBuilt    1201 non-null   int64  
 7   1stFlrSF     1201 non-null   int64  
 8   2ndFlrSF     1201 non-null   int64  
 9   SalePrice    1201 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


In [6]:
# Print the number of distinct values found in each column.
for col in df.columns:
    n_unique=len(df[col].unique())
    print("Column name {} and unique values are {}".format(col,n_unique))

Column name MSSubClass and unique values are 15
Column name MSZoning and unique values are 5
Column name LotFrontage and unique values are 110
Column name LotArea and unique values are 869
Column name Street and unique values are 2
Column name LotShape and unique values are 4
Column name YearBuilt and unique values are 112
Column name 1stFlrSF and unique values are 678
Column name 2ndFlrSF and unique values are 368
Column name SalePrice and unique values are 597


In [7]:
# Columns with few unique values (see the counts above) are the ones treated as categorical later on.
# Capture the current calendar year; it is used below to turn "YearBuilt" into a house age.
# NOTE(review): this reads the system clock, so the derived age depends on the year the notebook is run in.
import datetime
yearNow=datetime.datetime.now().year

In [8]:
# House age in years: current year minus the year of construction.
df["Total Years"]=yearNow-df["YearBuilt"]

In [9]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500,22
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500,49
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500,24
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000,110
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000,25


In [10]:
# "YearBuilt" is now represented by "Total Years", so remove it.
# Rebinding df (instead of inplace=True) together with errors="ignore" makes the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df=df.drop(columns=["YearBuilt"],errors="ignore")

In [11]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,60,RL,65.0,8450,Pave,Reg,856,854,208500,22
1,20,RL,80.0,9600,Pave,Reg,1262,0,181500,49
2,60,RL,68.0,11250,Pave,IR1,920,866,223500,24
3,70,RL,60.0,9550,Pave,IR1,961,756,140000,110
4,60,RL,84.0,14260,Pave,IR1,1145,1053,250000,25


In [12]:
## names of the categorical columns and of the target column
cat_features=[
    "MSSubClass",
    "MSZoning",
    "LotShape",
    "Street",
]
out_feature="SalePrice"

In [13]:
df["MSSubClass"].unique()

array([ 60,  20,  70,  50, 190,  45,  90, 120,  30,  80, 160,  75, 180,
        40,  85], dtype=int64)

In [14]:
from sklearn.preprocessing import LabelEncoder
# Trial run on a single column: fit an encoder and display the integer codes.
# The codes are only shown here; they are not written back to df.
lbl_encoders={"MSSubClass":LabelEncoder()}
lbl_encoders["MSSubClass"].fit_transform(df["MSSubClass"])

array([5, 0, 5, ..., 6, 0, 0], dtype=int64)

In [15]:
lbl_encoders

{'MSSubClass': LabelEncoder()}

In [16]:
from sklearn.preprocessing import LabelEncoder
# Fit one encoder per categorical column and overwrite that column with its integer codes.
# The fitted encoders stay available in lbl_encoders, keyed by column name.
lbl_encoders={name:LabelEncoder() for name in cat_features}
for feature,encoder in lbl_encoders.items():
    df[feature]=encoder.fit_transform(df[feature])


In [17]:
# Stack the encoded categorical columns side by side into one 2-D array (one row per house).
# From here on cat_features holds this array, no longer the list of column names.
cat_features=np.stack(
    [df[name] for name in ("MSSubClass","MSZoning","Street","LotShape")],
    axis=1,
)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]], dtype=int64)

In [18]:
# Convert the NumPy array into a 64-bit integer tensor (cat_features is rebound once more).
# Its columns are later passed to the nn.Embedding layers as lookup indices.
import torch
cat_features=torch.tensor(cat_features,dtype=torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [19]:
## continuous variables: every column that is neither categorical nor the target
cont_features=[
    name for name in df.columns
    if name not in ("MSSubClass","MSZoning","Street","LotShape","SalePrice")
]






In [20]:
cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total Years']

In [21]:
### gather the continuous columns into a single float32 tensor (one row per house):
cont_values=torch.tensor(df[cont_features].to_numpy(),dtype=torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    22.],
        [   80.,  9600.,  1262.,     0.,    49.],
        [   68., 11250.,   920.,   866.,    24.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    84.],
        [   68.,  9717.,  1078.,     0.,    75.],
        [   75.,  9937.,  1256.,     0.,    60.]])

In [22]:
cont_values.dtype

torch.float32

In [23]:
### dependent feature (the target):
# reshape(-1,1) turns the flat vector of prices into a single-column 2-D tensor
y=torch.tensor(
    df["SalePrice"].values,
    dtype=torch.float,
).reshape(-1,1)
y



tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   int32  
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   int32  
 5   LotShape     1201 non-null   int32  
 6   1stFlrSF     1201 non-null   int64  
 7   2ndFlrSF     1201 non-null   int64  
 8   SalePrice    1201 non-null   int64  
 9   Total Years  1201 non-null   int64  
dtypes: float64(1), int32(3), int64(6)
memory usage: 89.1 KB


In [25]:
df.shape

(1201, 10)

In [26]:
cat_features.shape,cont_values.shape,y.shape

(torch.Size([1201, 4]), torch.Size([1201, 5]), torch.Size([1201, 1]))

In [27]:
len(df["MSSubClass"].unique())

15

## Embeddings are applied only to the categorical features:
# Embedding sizes for the categorical columns

In [28]:
# Number of distinct codes in each categorical column, listed in the same column order
# that was used when the cat_features array was stacked.
cat_dims=[len(df[col].unique()) for col in ["MSSubClass","MSZoning","Street","LotShape"]]

In [29]:
cat_dims

[15, 5, 2, 4]

In [30]:
### each embedding's output width is derived from its cardinality n: min(50, n/2 rounded up)
embedding_dim=[(n_cat,min(50,(n_cat+1)//2)) for n_cat in cat_dims]
embedding_dim # the preprocessing stage ends here

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [31]:
# Using the embedding dimensions computed above, we now create the embedding layers:



In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F # functional API: relu and other activation functions live here
# one nn.Embedding per categorical column, built from the (cardinality, width) pairs
embed_representation=nn.ModuleList(
    [nn.Embedding(n_cat,width) for n_cat,width in embedding_dim]
)
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [33]:
# nn.ModuleList is required because there is not just one embedding layer but several, one per categorical column.


In [34]:
cat_features # full tensor; the values shown below are the label-encoded categories

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [36]:
# keep the first four rows of the encoded categorical tensor as a small sample
cat_featuresZ=cat_features[:4]
cat_featuresZ # the first four rows are shown below

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [37]:
# turn the integer codes into vectors: layer k of the ModuleList embeds column k of cat_features

pd.set_option("display.max_rows",500)
embedding_val=[
    layer(cat_features[:,col]) for col,layer in enumerate(embed_representation)
]

In [38]:
embedding_val

[tensor([[-0.0906,  0.0128, -0.4567,  ..., -2.4046,  0.2705, -0.9198],
         [-0.6949,  1.5898,  1.2107,  ...,  0.4502,  0.4850,  0.0125],
         [-0.0906,  0.0128, -0.4567,  ..., -2.4046,  0.2705, -0.9198],
         ...,
         [ 0.3297,  1.2331,  1.1656,  ..., -1.7718, -0.9618,  0.6346],
         [-0.6949,  1.5898,  1.2107,  ...,  0.4502,  0.4850,  0.0125],
         [-0.6949,  1.5898,  1.2107,  ...,  0.4502,  0.4850,  0.0125]],
        grad_fn=<EmbeddingBackward0>),
 tensor([[ 1.3242,  0.8247, -1.7133],
         [ 1.3242,  0.8247, -1.7133],
         [ 1.3242,  0.8247, -1.7133],
         ...,
         [ 1.3242,  0.8247, -1.7133],
         [ 1.3242,  0.8247, -1.7133],
         [ 1.3242,  0.8247, -1.7133]], grad_fn=<EmbeddingBackward0>),
 tensor([[1.1223],
         [1.1223],
         [1.1223],
         ...,
         [1.1223],
         [1.1223],
         [1.1223]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.5993,  0.3007],
         [-0.5993,  0.3007],
         [ 0.0218,  0.5123],

In [39]:
# concatenate the per-column embeddings along dim 1 so each row becomes one long feature vector
z=torch.cat(embedding_val,1)
z
# np.stack was used for the NumPy arrays earlier; torch.cat is used here to join the tensors

tensor([[-0.0906,  0.0128, -0.4567,  ...,  1.1223, -0.5993,  0.3007],
        [-0.6949,  1.5898,  1.2107,  ...,  1.1223, -0.5993,  0.3007],
        [-0.0906,  0.0128, -0.4567,  ...,  1.1223,  0.0218,  0.5123],
        ...,
        [ 0.3297,  1.2331,  1.1656,  ...,  1.1223, -0.5993,  0.3007],
        [-0.6949,  1.5898,  1.2107,  ...,  1.1223, -0.5993,  0.3007],
        [-0.6949,  1.5898,  1.2107,  ...,  1.1223, -0.5993,  0.3007]],
       grad_fn=<CatBackward0>)

In [40]:
### dropout layer: every element is zeroed with probability 0.4 (40%)
dropOut=nn.Dropout(p=0.4)

In [41]:
dropOut

Dropout(p=0.4, inplace=False)

In [43]:
# apply dropout to the concatenated embeddings; as the output below shows, dropped entries
# become 0 and the surviving ones are scaled up (by 1/(1-p), i.e. 1/0.6 here)
final_embed=dropOut(z)
final_embed

tensor([[-0.1510,  0.0000, -0.7612,  ...,  1.8705, -0.0000,  0.5012],
        [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.9989,  0.5012],
        [-0.1510,  0.0214, -0.0000,  ...,  1.8705,  0.0000,  0.0000],
        ...,
        [ 0.5494,  2.0551,  1.9427,  ...,  0.0000, -0.0000,  0.0000],
        [-0.0000,  0.0000,  2.0178,  ...,  1.8705, -0.0000,  0.0000],
        [-1.1582,  2.6497,  2.0178,  ...,  0.0000, -0.0000,  0.5012]],
       grad_fn=<MulBackward0>)

In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForwardNN(nn.Module):
    """Feed-forward network over mixed categorical and continuous inputs.

    Each categorical column is passed through its own ``nn.Embedding``; the
    embeddings are concatenated and passed through dropout. The continuous
    columns are batch-normalised. Both parts are concatenated and fed to a
    stack of ``Linear -> ReLU -> BatchNorm1d -> Dropout`` groups followed by a
    final ``Linear`` layer of width ``out_sz``.
    """

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        """Build the network.

        Args:
            embedding_dim: sequence of ``(num_categories, embedding_width)``
                pairs, one per categorical column. May be empty.
            n_cont: number of continuous input columns.
            out_sz: width of the output layer.
            layers: widths of the hidden layers, in order. May be empty, in
                which case the model is a single linear layer on the inputs.
            p: dropout probability, used after the embeddings and after every
                hidden layer.
        """
        super().__init__()

        self.embeds = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the vector entering the first linear layer.
        n_emb = sum(out for _, out in embedding_dim)
        n_in = n_emb + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # n_in now holds the width of the last hidden layer, or the raw input
        # width when `layers` is empty (indexing layers[-1] would fail there).
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run a forward pass.

        Args:
            x_cat: integer tensor whose column ``i`` holds the category codes
                for the ``i``-th embedding. Not read when the model has no
                embeddings.
            x_cont: float tensor with ``n_cont`` columns.

        Returns:
            Tensor with ``out_sz`` columns and one row per input row.
        """
        parts = []
        if len(self.embeds) > 0:
            embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]
            parts.append(self.emb_drop(torch.cat(embedded, dim=1)))
        # torch.cat cannot take an empty list, so the embedding part is only
        # added when there is at least one categorical column.
        parts.append(self.bn_cont(x_cont))

        x = torch.cat(parts, dim=1)
        return self.layers(x)


In [61]:
# Remember: a feed-forward network class defines two methods:
# __init__(), which builds the layers, and forward(), which defines how the inputs flow through them.

In [62]:
len(cont_features)

5

In [63]:
# ReLU is the activation used in the hidden layers; the final layer is left linear because this is a regression problem:

In [65]:
# fix the RNG seed before building the model so the initial weights are the same on every run
torch.manual_seed(100)
model=FeedForwardNN(
    embedding_dim=embedding_dim,
    n_cont=len(cont_features),
    out_sz=1,
    layers=[100,50],
    p=0.4,
)

In [66]:
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

### define loss and optimizer