In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
df = pd.read_csv("/content/houseprice.csv",usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                         "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [3]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [4]:
df['Total years'] = datetime.datetime.now().year-df['YearBuilt']

In [5]:
df.drop('YearBuilt',axis=1,inplace = True)

In [6]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total years
0,60,RL,65.0,8450,Pave,Reg,856,854,208500,17
1,20,RL,80.0,9600,Pave,Reg,1262,0,181500,44
2,60,RL,68.0,11250,Pave,IR1,920,866,223500,19
3,70,RL,60.0,9550,Pave,IR1,961,756,140000,105
4,60,RL,84.0,14260,Pave,IR1,1145,1053,250000,20


In [7]:
cat_features=["MSSubClass", "MSZoning", "Street", "LotShape"]
out_feature="SalePrice"

In [8]:
from sklearn.preprocessing import LabelEncoder
encoder =  {}
encoder['MSSubClass'] = LabelEncoder()
encoder['MSSubClass'].fit_transform(df['MSSubClass'])

array([5, 0, 5, ..., 6, 0, 0])

In [9]:
encoder = {}
for feature in cat_features:
  encoder[feature] = LabelEncoder()
  df[feature] = encoder[feature].fit_transform(df[feature])

In [None]:
df

In [11]:
cat_features = np.stack([df['MSSubClass'],df['MSZoning'],df['Street'],df['LotShape']],1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]])

In [13]:
import torch
cat_features = torch.tensor(cat_features,dtype=torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [14]:
cont_features = []
for i in df.columns:
  if i in ['MSSubClass','MSZoning','Street','LotShape','SalePrice']:
    pass
  else:
    cont_features.append(i)

In [16]:
cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total years']

In [17]:
cont_values = np.stack([df[i].values for i in cont_features],axis=1)
cont_values = torch.tensor(cont_values,dtype=torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    17.],
        [   80.,  9600.,  1262.,     0.,    44.],
        [   68., 11250.,   920.,   866.,    19.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    79.],
        [   68.,  9717.,  1078.,     0.,    70.],
        [   75.,  9937.,  1256.,     0.,    55.]])

In [18]:
y = torch.tensor(df['SalePrice'].values,dtype=torch.float).reshape(-1,1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [19]:
cat_dims = [len(df[col].unique()) for col in ['MSSubClass','MSZoning','Street','LotShape']]

In [20]:
cat_dims

[15, 5, 2, 4]

In [21]:
embedding_dim = [(x,min(50,(x+1)//2)) for x in cat_dims]

In [22]:
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
embed_representation=nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [22]:
cat_featurers  = cat_feat[:4]
cat_featurers

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [25]:
pd.set_option('display.max_rows',500)
embedding_val = []
for i,e in enumerate(embed_representation):
  embedding_val.append(e(cat_features[:,i]))

In [26]:
embedding_val

[tensor([[ 1.3084,  0.5572,  0.3316,  ...,  0.3779,  0.4534,  0.9133],
         [ 0.8084, -0.6820,  1.1680,  ..., -0.0459,  0.7412,  0.1600],
         [ 1.3084,  0.5572,  0.3316,  ...,  0.3779,  0.4534,  0.9133],
         ...,
         [ 0.2987, -0.6146,  0.4999,  ...,  0.4067,  0.4051, -0.4147],
         [ 0.8084, -0.6820,  1.1680,  ..., -0.0459,  0.7412,  0.1600],
         [ 0.8084, -0.6820,  1.1680,  ..., -0.0459,  0.7412,  0.1600]],
        grad_fn=<EmbeddingBackward>), tensor([[ 1.6386, -0.5319,  2.3992],
         [ 1.6386, -0.5319,  2.3992],
         [ 1.6386, -0.5319,  2.3992],
         ...,
         [ 1.6386, -0.5319,  2.3992],
         [ 1.6386, -0.5319,  2.3992],
         [ 1.6386, -0.5319,  2.3992]], grad_fn=<EmbeddingBackward>), tensor([[-0.2293],
         [-0.2293],
         [-0.2293],
         ...,
         [-0.2293],
         [-0.2293],
         [-0.2293]], grad_fn=<EmbeddingBackward>), tensor([[-0.6975,  0.8154],
         [-0.6975,  0.8154],
         [-1.2093,  0.1298],

In [27]:
z = torch.cat(embedding_val,1)
z

tensor([[ 1.3084,  0.5572,  0.3316,  ..., -0.2293, -0.6975,  0.8154],
        [ 0.8084, -0.6820,  1.1680,  ..., -0.2293, -0.6975,  0.8154],
        [ 1.3084,  0.5572,  0.3316,  ..., -0.2293, -1.2093,  0.1298],
        ...,
        [ 0.2987, -0.6146,  0.4999,  ..., -0.2293, -0.6975,  0.8154],
        [ 0.8084, -0.6820,  1.1680,  ..., -0.2293, -0.6975,  0.8154],
        [ 0.8084, -0.6820,  1.1680,  ..., -0.2293, -0.6975,  0.8154]],
       grad_fn=<CatBackward>)

In [28]:
dropout = nn.Dropout(0.4)

In [29]:
final_embed = dropout(z)
final_embed

tensor([[ 2.1806,  0.9287,  0.5526,  ..., -0.0000, -0.0000,  0.0000],
        [ 1.3473, -0.0000,  0.0000,  ..., -0.0000, -1.1625,  0.0000],
        [ 0.0000,  0.0000,  0.5526,  ..., -0.0000, -2.0155,  0.2164],
        ...,
        [ 0.0000, -0.0000,  0.0000,  ..., -0.3822, -1.1625,  0.0000],
        [ 0.0000, -1.1366,  0.0000,  ..., -0.0000, -1.1625,  1.3591],
        [ 1.3473, -1.1366,  1.9466,  ..., -0.0000, -1.1625,  1.3591]],
       grad_fn=<MulBackward0>)

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardNN(nn.Module):

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((out for inp,out in embedding_dim))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [33]:
len(cont_features)

5

In [35]:
torch.manual_seed(100)
model = FeedForwardNN(embedding_dim,len(cont_features),1,[100,50],p=0.2)

In [36]:
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.2, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [37]:
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

In [38]:
df.shape

(1201, 10)

In [40]:
batch_size = 1200
test_size=int(batch_size*0.15)
train_categorical=cat_features[:batch_size-test_size]
test_categorical=cat_features[batch_size-test_size:batch_size]
train_cont=cont_values[:batch_size-test_size]
test_cont=cont_values[batch_size-test_size:batch_size]
y_train=y[:batch_size-test_size]
y_test=y[batch_size-test_size:batch_size]

In [None]:
epochs=5000
final_losses=[]
for i in range(epochs):
    i=i+1
    y_pred=model(train_categorical,train_cont)
    loss=torch.sqrt(loss_func(y_pred,y_train))
    final_losses.append(loss)
    if i%10==1:
        print("Epoch number: {} and the loss : {}".format(i,loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch number: 1 and the loss : 200496.796875
Epoch number: 11 and the loss : 200493.34375
Epoch number: 21 and the loss : 200488.71875
Epoch number: 31 and the loss : 200482.21875
Epoch number: 41 and the loss : 200473.125
Epoch number: 51 and the loss : 200461.265625
Epoch number: 61 and the loss : 200445.890625
Epoch number: 71 and the loss : 200427.8125
Epoch number: 81 and the loss : 200406.796875
Epoch number: 91 and the loss : 200380.828125
Epoch number: 101 and the loss : 200352.421875
Epoch number: 111 and the loss : 200322.28125
Epoch number: 121 and the loss : 200287.265625
Epoch number: 131 and the loss : 200247.078125
Epoch number: 141 and the loss : 200200.421875
Epoch number: 151 and the loss : 200155.703125
Epoch number: 161 and the loss : 200107.40625
Epoch number: 171 and the loss : 200060.109375
Epoch number: 181 and the loss : 199994.078125
Epoch number: 191 and the loss : 199939.40625
Epoch number: 201 and the loss : 199875.90625
Epoch number: 211 and the loss : 199