## Importing Packages

In [2]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import regularizers
#from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Loading data from a .csv

In [3]:
# Location of the source CSV (fetched over HTTP by pandas).
url = "https://junwin.github.io/data/housepriceclean2.csv"

# Read the data, shuffle every row, and discard the ClosedDate column.
# A fixed random_state makes the shuffle deterministic, so a fresh
# "Restart Kernel -> Run All" reproduces the same row order and outputs.
housePrices = (
    pd.read_csv(url)
    .sample(frac=1, random_state=42)
    .drop(columns=['ClosedDate'])
)
housePrices.head()

Unnamed: 0,SoldPr,Type,Zip,Area,Rooms,FullBaths,HalfBaths,BsmtBth,Beds,BsmtBeds,GarageSpaces,houseEra
6267,558407.1875,SFH,60201,1722,8,2,1,Yes,4.0,0,0.0,19B
7161,183859.375,Condo,60025,1100,5,2,0,No,2.0,0,0.0,19A
8934,152340.625,Condo,60025,875,5,1,0,No,2.0,0,2.0,19A
3970,629173.347656,SFH,60062,3042,13,3,2,Yes,4.0,1,2.5,19A
4373,298029.480469,SFH,60076,1373,7,2,0,Yes,3.0,0,2.0,19A


In [3]:
# Display the dtype pandas assigned to each column of the loaded frame.
housePrices.dtypes

SoldPr          float64
Type             object
Zip               int64
Area              int64
Rooms             int64
FullBaths         int64
HalfBaths         int64
BsmtBth          object
Beds            float64
BsmtBeds          int64
GarageSpaces    float64
houseEra         object
dtype: object

In [4]:
# Zip is read in as an integer column; cast it to string, then show the
# column dtypes again to confirm the change.
housePrices = housePrices.astype({'Zip': str})
housePrices.dtypes

SoldPr          float64
Type             object
Zip              object
Area              int64
Rooms             int64
FullBaths         int64
HalfBaths         int64
BsmtBth          object
Beds            float64
BsmtBeds          int64
GarageSpaces    float64
houseEra         object
dtype: object

# Identify missing values in each feature:

In [6]:
# Boolean mask with the same shape as housePrices: True where a value is missing.
missing_data = housePrices.isna()
# Preview the first five rows of the mask.
missing_data.head(5)

Unnamed: 0,SoldPr,Type,Zip,Area,Rooms,FullBaths,HalfBaths,BsmtBth,Beds,BsmtBeds,GarageSpaces,houseEra
3500,False,False,False,False,False,False,False,False,False,False,False,False
6859,False,False,False,False,False,False,False,False,False,False,False,False
6623,False,False,False,False,False,False,False,False,False,False,False,False
8672,False,False,False,False,False,False,False,False,False,False,False,False
4572,False,False,False,False,False,False,False,False,False,False,False,False


# Summarize missing values in each feature:

In [7]:
# For every column of the mask, print how often each flag (True = missing,
# False = present) occurs, followed by a single-space separator line.
for column in missing_data.columns:
    flag_counts = missing_data[column].value_counts()
    print(flag_counts, " ", sep="\n")

SoldPr
False    9358
Name: count, dtype: int64
 
Type
False    9358
Name: count, dtype: int64
 
Zip
False    9358
Name: count, dtype: int64
 
Area
False    9358
Name: count, dtype: int64
 
Rooms
False    9358
Name: count, dtype: int64
 
FullBaths
False    9358
Name: count, dtype: int64
 
HalfBaths
False    9358
Name: count, dtype: int64
 
BsmtBth
False    9358
Name: count, dtype: int64
 
Beds
False    9358
Name: count, dtype: int64
 
BsmtBeds
False    9358
Name: count, dtype: int64
 
GarageSpaces
False    9358
Name: count, dtype: int64
 
houseEra
False    9358
Name: count, dtype: int64
 


# Checking the dataset:

In [10]:
# Print a concise summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage.
housePrices.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9358 entries, 3500 to 727
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SoldPr        9358 non-null   float64
 1   Type          9358 non-null   object 
 2   Zip           9358 non-null   object 
 3   Area          9358 non-null   int64  
 4   Rooms         9358 non-null   int64  
 5   FullBaths     9358 non-null   int64  
 6   HalfBaths     9358 non-null   int64  
 7   BsmtBth       9358 non-null   object 
 8   Beds          9358 non-null   float64
 9   BsmtBeds      9358 non-null   int64  
 10  GarageSpaces  9358 non-null   float64
 11  houseEra      9358 non-null   object 
dtypes: float64(3), int64(5), object(4)
memory usage: 950.4+ KB


# Saving Cleaned Dataset:

In [None]:
# Write the frame to a CSV file in the current working directory.
# index=True is the to_csv default, spelled out here: the row index is written
# as the first (unnamed) column of the file.
housePrices.to_csv('cleaned_dataset.csv', index=True)