Load The Correct Libraries

In [205]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.regularizers import L2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

Load Data

In [206]:
df = pd.read_csv(r'house-prices-advanced-regression-techniques\train.csv')
df_test = pd.read_csv(r'house-prices-advanced-regression-techniques\test.csv')
# Show all rows
pd.set_option("display.max_rows", None)

# Show all columns
pd.set_option("display.max_columns", None)

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

Clean Data

In [207]:
# function that gives the following information for each column
# 1. number of unique values
# 2. the data type
# 3. the null ratio
def general_info(x):
    # Get unique values and data types in each column
    datatypes = x.dtypes
    unique_values = x.nunique()
    # get null ratio for each column
    nulls = x.isnull().sum()
    nullratios = nulls / x.shape[0]
    general_info = pd.DataFrame(
        {
            "datatype": datatypes,
            "unique_values": unique_values,
            "nullratios": nullratios,
        }
    )
    # print(general_info)
    return general_info


info = general_info(df)
print(df.shape)

(1460, 81)


In [208]:
# function that cleans the data set by doing the following:
# 1. remove all columns with more than 25% missing data
# 2 fill all missing float data types with the mean value of their column
# 3. fill all missing categorical data with most previously filled category within that column
def clean_NaNs(x, info_x, x_test):
    # keep all rows with less than 25% NaN values
    x = x.loc[:, info_x["nullratios"] < 0.25]
    x_test = x_test[x.columns[x.columns!='SalePrice']]
    info_x = general_info(x)
    # fix all the NaNs where floats becomes means
    float_nan_columns = info_x[
        (info_x["datatype"] == "float64") & (info_x["nullratios"] > 0)
    ].index.tolist()
    # print(float_nan_columns)
    means = x[float_nan_columns].mean()
    print("means: ", means)
    x.loc[:, float_nan_columns] = x[float_nan_columns].fillna(means)
    x_test.loc[:,float_nan_columns] = x_test[float_nan_columns].fillna(means)
    info_x = general_info(x)
    print(info_x.loc[info_x["nullratios"] > 0, :])
    # fix all the NaNs where objects becomes ffill()
    x = x.ffill()
    x_test = x_test.ffill()
    info_x = general_info(x)
    print(info_x.loc[info_x["nullratios"] > 0, :])
    # fix info
    return x, info_x, x_test


df, info, df_test = clean_NaNs(df, info,df_test)
# print(info.loc[info['nullratios']>0,:]) # should have no outputs


means:  LotFrontage      70.049958
MasVnrArea      103.685262
GarageYrBlt    1978.506164
dtype: float64
             datatype  unique_values  nullratios
BsmtQual       object              4    0.025342
BsmtCond       object              4    0.025342
BsmtExposure   object              4    0.026027
BsmtFinType1   object              6    0.025342
BsmtFinType2   object              6    0.026027
Electrical     object              5    0.000685
GarageType     object              6    0.055479
GarageFinish   object              3    0.055479
GarageQual     object              5    0.055479
GarageCond     object              5    0.055479
Empty DataFrame
Columns: [datatype, unique_values, nullratios]
Index: []


In [209]:
print(df.shape)


# function that separates all the categories into labeled numbers
def Label_encode(x, x_info,x_test):
    object_columns = x_info.loc[x_info["datatype"] == "object"].index.tolist()
    print(object_columns)
    x[object_columns] = x[object_columns].astype("category")
    x_test[object_columns] = x_test[object_columns].astype("category")
    x[object_columns] = x[object_columns].apply(lambda col: col.cat.codes)
    x_test[object_columns] = x_test[object_columns].apply(lambda col: col.cat.codes)
    x_info = general_info(x)
    return x, x_info, object_columns, x_test


df, info, objectColumns,df_test = Label_encode(df, info,df_test)


def OH_encode(x, x_info):
    object_columns = x_info.loc[x_info["datatype"] == "object"].index.tolist()
    x = pd.get_dummies(x, columns=object_columns, drop_first=False)
    x_info = general_info(x)
    return x, x_info


# df, info = OH_encode(df, info)
print(df.head())
print(df_test.head())
# print(info)

(1460, 75)
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
   Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  LandContour  Utilities  LotConfig  LandSlope  Neighborhood  Condition1  Condition2  BldgType  HouseStyle  OverallQual  OverallCond  YearBuilt  YearRemodAdd  RoofStyle  RoofMatl  Exterior1st  Exterior2nd  MasVnrArea  ExterQual  ExterCond  Foundation  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinSF1  BsmtFinType2  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  Heating  HeatingQC  CentralAir  Electrical  1stFlrSF

In [210]:
# prepare inputs and outputs for training and testing
# Split the data
x = df.drop("SalePrice", axis=1)
y = df["SalePrice"]
print(x.shape)
print(y.shape)
y = np.log1p(y)

(1460, 74)
(1460,)


NN Model

In [211]:
# scale all values
scalar = StandardScaler()
x[objectColumns] = scalar.fit_transform(x[objectColumns])
df_test[objectColumns] = scalar.fit_transform(df_test[objectColumns])
'''
model = Sequential(
    [
        Dense(45, input_dim=x.shape[1], activation="relu", kernel_regularizer=L2(0.01)),
        BatchNormalization(),
        Dense(64, activation="relu", kernel_regularizer=L2(0.01)),
        BatchNormalization(),
        Dense(80, activation="relu", kernel_regularizer=L2(0.01)),
        BatchNormalization(),
        Dense(1),
    ]
)
model.compile(optimizer = 'adam', loss='mean_squared_error', metrics = ['mse'])
'''
model = load_model('NNforHousingPrices.h5')
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2
)

# model.fit(X_train,y_train,epochs = 50, batch_size= 32)
y_pred = model.predict(X_test)



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [212]:
print(pd.DataFrame({"real": y_test, "prediction": y_pred.flatten()}))
mse = mean_squared_error(y_test,y_pred)
print(mse)
#model.save('NNforHousingPrices')

           real  prediction
70    12.404928   12.435134
1237  12.180760   11.962178
1398  11.835016   11.968440
661   12.904210   12.219182
695   12.078245   11.768107
651   11.589896   11.860199
1054  12.449023   12.199861
411   11.884496   11.763292
1436  11.699413   11.903621
1159  12.128117   12.113273
523   12.126764   12.550450
129   11.918397   11.946128
542   12.270225   12.045572
1138  12.185875   12.207878
863   11.794345   11.972939
1293  12.000898   11.960166
1203  12.269052   11.949563
1365  12.283038   12.228140
706   12.618186   13.003716
595   12.672950   12.363788
1157  12.345839   12.438781
844   11.944065   11.767426
106   11.512935   11.510139
1160  11.891369   11.913806
469   12.138869   12.097218
1258  12.154785   11.868265
920   12.211065   12.166001
1340  11.719948   11.803103
744   12.100718   12.373441
232   11.456366   11.130166
631   12.252484   12.006288
1032  12.644331   12.319818
1408  11.740069   11.964903
277   11.856522   11.678531
1099  11.964007   11

In [213]:
y_pred = model.predict(df_test)
SalePrice = np.expm1(y_pred)
print(SalePrice)
submission = pd.DataFrame({'Id':df_test['Id'],'SalePrice':SalePrice.flatten()})
print(submission)

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[[128635.36]
 [383642.72]
 [146409.53]
 ...
 [139781.27]
 [124304.01]
 [186679.28]]
        Id      SalePrice
0     1461  128635.359375
1     1462  383642.718750
2     1463  146409.531250
3     1464  163221.031250
4     1465  224169.203125
5     1466  152534.562500
6     1467  165959.562500
7     1468  154027.500000
8     1469  158699.843750
9     1470  160080.984375
10    1471  227339.187500
11    1472  111425.609375
12    1473  117450.914062
13    1474  149773.531250
14    1475  103012.265625
15    1476  199624.968750
16    1477  165000.375000
17    1478  199687.609375
18    1479  163749.109375
19    1480  465658.250000
20    1481  201577.453125
21    1482  153982.984375
22    1483  124414.187500
23    1484  169012.687500
24    1485  109241.242188
25    1486  171239.484375
26    1487  194717.250000
27    1488  191409.046875
28    1489  167485.000000
29    1490  179525.265625
30    1491  139890.484375
31    1492 

In [214]:
submission.to_csv("submission.csv", index=False)

In [215]:
print(pd.read_csv('submission.csv').head())

     Id  SalePrice
0  1461  128635.36
1  1462  383642.72
2  1463  146409.53
3  1464  163221.03
4  1465  224169.20
