In [80]:
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from pandas import DataFrame

In [81]:
df = pd.read_csv('https://raw.githubusercontent.com/shiv1709/House_price_prediction/master/USA_Housing.csv')
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


# **Exploring Dataset**

In [82]:
print(df.columns.values)

['Avg. Area Income' 'Avg. Area House Age' 'Avg. Area Number of Rooms'
 'Avg. Area Number of Bedrooms' 'Area Population' 'Price' 'Address']


In [83]:
df.shape

(5000, 7)

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [85]:
for i in df.columns:
    print(f"column : {i} and no. of unique values: {len(df[i].unique())}")

#lower number of unique values => categorical, if large no => continuous values

column : Avg. Area Income and no. of unique values: 5000
column : Avg. Area House Age and no. of unique values: 5000
column : Avg. Area Number of Rooms and no. of unique values: 5000
column : Avg. Area Number of Bedrooms and no. of unique values: 255
column : Area Population and no. of unique values: 5000
column : Price and no. of unique values: 5000
column : Address and no. of unique values: 5000


In [86]:
for itr, i in enumerate(df['Address'][0:5]):
    print('\n', itr+1, ' ',i)


 1   208 Michael Ferry Apt. 674
Laurabury, NE 37010-5101

 2   188 Johnson Views Suite 079
Lake Kathleen, CA 48958

 3   9127 Elizabeth Stravenue
Danieltown, WI 06482-3489

 4   USS Barnett
FPO AP 44820

 5   USNS Raymond
FPO AE 09386


below:
1)pd.Series(i) to convert the string i into a pandas Series so that you can use the .str methods.

2)used [0] and .iloc[0] to get the first element of each extracted Series since .extract returns a DataFrame, but you are looking for individual strings.

In [87]:
for itr, i in enumerate(df['Address'][0:10]):
    print('\n')
    apart = pd.Series(i).str.extract(r'(\w+\s?\w*\.\s?\w+)')[0]  # Extracts 'Michael Ferry Apt.'
    place = pd.Series(i).str.extract(r'([A-Z][a-z]+)')[0]  # Extracts 'Laurabury'
    state = pd.Series(i).str.extract(r'([A-Z]{2})')[0]  # Extracts 'NE'
    print(f"{itr}) Apartment: {apart.iloc[0]}, Place: {place.iloc[0]}, State: {state.iloc[0]}")




0) Apartment: Ferry Apt. 674, Place: Michael, State: NE


1) Apartment: nan, Place: Johnson, State: CA


2) Apartment: nan, Place: Elizabeth, State: WI


3) Apartment: nan, Place: Barnett, State: US


4) Apartment: nan, Place: Raymond, State: US


5) Apartment: Islands Apt. 443, Place: Jennifer, State: KS


6) Apartment: nan, Place: Daniel, State: CO


7) Apartment: nan, Place: Joyce, State: TN


8) Apartment: nan, Place: Gilbert, State: US


9) Apartment: nan, Place: Unit, State: DP


##Adding Columns place and state to make more unique features hopefully

In [88]:
df['Place'] = df['Address'].str.extract(r'([A-Z][a-z]+)', expand=False)
df['State'] = df['Address'].str.extract(r'([A-Z]{2})', expand=False)

df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,Place,State
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",Michael,NE
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",Johnson,CA
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",Elizabeth,WI
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,Barnett,US
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,Raymond,US


In [89]:
for i in df.columns:
    print(f"column : {i} and no. of unique values: {len(df[i].unique())}")

column : Avg. Area Income and no. of unique values: 5000
column : Avg. Area House Age and no. of unique values: 5000
column : Avg. Area Number of Rooms and no. of unique values: 5000
column : Avg. Area Number of Bedrooms and no. of unique values: 255
column : Area Population and no. of unique values: 5000
column : Price and no. of unique values: 5000
column : Address and no. of unique values: 5000
column : Place and no. of unique values: 1193
column : State and no. of unique values: 62


**Building Model**

In [112]:
numeric_cols = df.copy()
numeric_cols.drop(columns = ['Place', 'State', 'Address'], axis=1, inplace=True)
numeric_cols.head(5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5


In [113]:
numeric_x_cols = list(numeric_cols.columns)
numeric_x_cols.remove('Price')
numeric_y_cols = ['Price']

numeric_x_cols

['Avg. Area Income',
 'Avg. Area House Age',
 'Avg. Area Number of Rooms',
 'Avg. Area Number of Bedrooms',
 'Area Population']

Normalizing Data

In [123]:
numeric_cols = (numeric_cols - numeric_cols.mean()) // (numeric_cols.max() - numeric_cols.min())
numeric_cols.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,0.0,-1.0,0.0,0.0,-1.0,-1.0
1,0.0,0.0,-1.0,-1.0,0.0,0.0
2,-1.0,-1.0,0.0,0.0,0.0,-1.0
3,-1.0,0.0,-1.0,-1.0,-1.0,0.0
4,-1.0,-1.0,0.0,0.0,-1.0,-1.0


In [124]:
numeric_x_df = DataFrame(numeric_cols, columns = numeric_x_cols)
numeric_y_df = DataFrame(numeric_cols, columns = numeric_y_cols)

numeric_x = torch.tensor(numeric_x_df.values, dtype = torch.float)
numeric_y = torch.tensor(numeric_y_df.values, dtype = torch.float)

numeric_x.shape, numeric_y.shape

(torch.Size([5000, 5]), torch.Size([5000, 1]))

Defining Neural Net Architecture

In [125]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim, H1, H2, H3, output_dim):
        super(NeuralNet, self).__init__()

        self.seq_layer = nn.Sequential(
            nn.Linear(input_dim, H1),
            nn.Linear(H1, H2),
            nn.Linear(H2, H3),
            nn.Linear(H3, output_dim)
        )

    def forward(self, x):
        y_pred = self.seq_layer(x)
        return y_pred

In [126]:
H1, H2, H3 = 500, 1000, 200
input_dim, output_dim = numeric_x.shape[1], numeric_y.shape[1]

In [127]:
model = NeuralNet(input_dim, H1, H2, H3, output_dim)

loss_fn = nn.MSELoss(reduction = 'sum')
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)

In [128]:
loss_net = []

for i in range (500):
    y_pred = model(numeric_x)

    loss = loss_fn(y_pred, numeric_y)
    loss_net.append(loss.item())

    if i % 1 == 0:
        print(i, loss.item())

    if torch.isnan(loss):
        break

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


0 2948.5087890625
1 1578.66748046875
2 960.5748901367188
3 955.7957763671875
4 1211.4832763671875
5 1356.3978271484375
6 1293.333740234375
7 1108.9727783203125
8 916.8967895507812
9 793.4063720703125
10 763.216064453125
11 806.0460815429688
12 876.213623046875
13 928.5575561523438
14 938.288818359375
15 905.6444702148438
16 848.372314453125
17 790.258056640625
18 751.1039428710938
19 740.2791748046875
20 754.7001953125
21 781.6200561523438
22 805.28564453125
23 814.3612670898438
24 806.2324829101562
25 786.411865234375
26 764.3834838867188
27 748.6748657226562
28 743.4573974609375
29 747.6981811523438
30 756.6396484375
31 764.5271606445312
32 767.217041015625
33 763.60498046875
34 755.5302734375
35 746.497802734375
36 739.939697265625
37 737.772705078125
38 739.7869873046875
39 744.0257568359375
40 747.8956909179688
41 749.4266357421875
42 748.0718994140625
43 744.72265625
44 741.0487670898438
45 738.5887451171875
46 738.0611572265625
47 739.1773681640625
48 740.9528198242188
49 742.28

KeyboardInterrupt: ignored