# Predict Home Price Feature Prep

In [3]:
from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw MLS listings export into a DataFrame; later cells reshape it in place.
# NOTE(review): the preview shows "?" in "Year Built" -- presumably a missing-value
# marker in the CSV; confirm, and consider na_values="?" if that column is ever used.
mls_data = pd.read_csv("mls-data.csv")
# Preview the first rows to inspect the raw column formats.
mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Year Built,Bedrooms,Bathrooms,Association Fee,Address - Zip Code,Foundation,Pool,Stories
0,620000,1972,0.23,1961,4,3 (2 1),0,95608,Raised,No,1
1,825000,2969,0.197,?,4,4 (3 1),0,95757,"Concrete,Slab",No,2
2,508377,1434,0.0563,?,3,3 (2 1),75,95823,Slab,No,2
3,480000,1525,0.1204,2003,3,2 (2 0),0,95757,Slab,No,1
4,510000,1360,0.2388,1959,3,2 (2 0),0,95608,Raised,No,1


In [4]:
# Look at a single raw "Bathrooms" entry (row label 0) to see its text format.
mls_data.loc[0, "Bathrooms"]

'3 (2 1)'

In [5]:
# Prototype the parsing on one value before applying it to the whole column.
# The raw text looks like "3 (2 1)"; the two numbers inside the parentheses are
# the full-bath and half-bath counts.
bathroom_raw = mls_data["Bathrooms"][0]
open_paren_index = bathroom_raw.index("(")
close_paren_index = bathroom_raw.index(")")
# Split everything between the parentheses on whitespace rather than picking the
# single character next to each parenthesis: multi-digit counts are then read
# whole, and int() makes the results numeric instead of one-character strings.
inside_parens = bathroom_raw[open_paren_index + 1:close_paren_index]
full_baths, half_baths = (int(count) for count in inside_parens.split())

print(f"Full Baths: {full_baths} Half Baths: {half_baths}")

Full Baths: 2 Half Baths: 1


In [6]:
# Split the raw "Bathrooms" text (e.g. "3 (2 1)") into numeric full/half bath columns.
# A regex extracts both counts from inside the parentheses in one vectorized pass.
# Compared with taking the single character beside each parenthesis, this reads
# multi-digit counts whole and produces integers rather than one-character strings.
bath_counts = (
    mls_data['Bathrooms']
    .str.extract(r"\(\s*(?P<full>\d+)\s+(?P<half>\d+)\s*\)")
    .astype("int64")  # fails loudly if any row does not match the expected pattern
)
full_baths = bath_counts["full"].tolist()
half_baths = bath_counts["half"].tolist()

mls_data['Full Baths'] = full_baths
mls_data['Half Baths'] = half_baths

mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Year Built,Bedrooms,Bathrooms,Association Fee,Address - Zip Code,Foundation,Pool,Stories,Full Baths,Half Baths
0,620000,1972,0.23,1961,4,3 (2 1),0,95608,Raised,No,1,2,1
1,825000,2969,0.197,?,4,4 (3 1),0,95757,"Concrete,Slab",No,2,3,1
2,508377,1434,0.0563,?,3,3 (2 1),75,95823,Slab,No,2,2,1
3,480000,1525,0.1204,2003,3,2 (2 0),0,95757,Slab,No,1,2,0
4,510000,1360,0.2388,1959,3,2 (2 0),0,95608,Raised,No,1,2,0


In [7]:
# Remove the raw "Bathrooms" text column from the working frame.
# errors="ignore" keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been dropped.
mls_data = mls_data.drop(columns=["Bathrooms"], errors="ignore")
mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Year Built,Bedrooms,Association Fee,Address - Zip Code,Foundation,Pool,Stories,Full Baths,Half Baths
0,620000,1972,0.23,1961,4,0,95608,Raised,No,1,2,1
1,825000,2969,0.197,?,4,0,95757,"Concrete,Slab",No,2,3,1
2,508377,1434,0.0563,?,3,75,95823,Slab,No,2,2,1
3,480000,1525,0.1204,2003,3,0,95757,Slab,No,1,2,0
4,510000,1360,0.2388,1959,3,0,95608,Raised,No,1,2,0


In [7]:
# Encode the "Pool" flag as 1 for "Yes" and 0 for anything else.
# Values that are already 1 stay 1, so re-running this cell after the column has
# been encoded no longer resets every row to 0 (1 != "Yes" in the naive check).
has_pool = [1 if value in ("Yes", 1) else 0 for value in mls_data['Pool']]

mls_data['Pool'] = has_pool

mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Year Built,Bedrooms,Bathrooms,Association Fee,Address - Zip Code,Foundation,Pool,Stories,Full Baths,Half Baths
0,620000,1972,0.23,1961,4,3 (2 1),0,95608,Raised,0,1,2,1
1,825000,2969,0.197,?,4,4 (3 1),0,95757,"Concrete,Slab",0,2,3,1
2,508377,1434,0.0563,?,3,3 (2 1),75,95823,Slab,0,2,2,1
3,480000,1525,0.1204,2003,3,2 (2 0),0,95757,Slab,0,1,2,0
4,510000,1360,0.2388,1959,3,2 (2 0),0,95608,Raised,0,1,2,0


In [8]:
# List the distinct zip codes present in the data.
pd.unique(mls_data["Address - Zip Code"])

array([95608, 95757, 95823, 95621, 95815, 95758, 95842], dtype=int64)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace each zip code with an integer class label; the encoder learns the
# distinct codes and maps every row to its label in a single call.
# NOTE(review): the labels are arbitrary integers, yet later steps treat the
# column as numeric -- confirm that an ordinal encoding is intended here.
le = LabelEncoder()
zip_codes = le.fit_transform(mls_data["Address - Zip Code"])
mls_data["Address - Zip Code"] = zip_codes

mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Year Built,Bedrooms,Bathrooms,Association Fee,Address - Zip Code,Foundation,Pool,Stories,Full Baths,Half Baths
0,620000,1972,0.23,1961,4,3 (2 1),0,0,Raised,0,1,2,1
1,825000,2969,0.197,?,4,4 (3 1),0,2,"Concrete,Slab",0,2,3,1
2,508377,1434,0.0563,?,3,3 (2 1),75,5,Slab,0,2,2,1
3,480000,1525,0.1204,2003,3,2 (2 0),0,2,Slab,0,1,2,0
4,510000,1360,0.2388,1959,3,2 (2 0),0,0,Raised,0,1,2,0


In [10]:
# Columns carried forward to scaling; "Year Built" and "Foundation" are not included.
model_columns = [
    "Close Price",
    "Approx SqFt",
    "Lot Size - Acres",
    "Bedrooms",
    "Association Fee",
    "Address - Zip Code",
    "Pool",
    "Stories",
    "Full Baths",
    "Half Baths",
]
min_mls_data = mls_data[model_columns]
min_mls_data.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Bedrooms,Association Fee,Address - Zip Code,Pool,Stories,Full Baths,Half Baths
0,620000,1972,0.23,4,0,0,0,1,2,1
1,825000,2969,0.197,4,0,2,0,2,3,1
2,508377,1434,0.0563,3,75,5,0,2,2,1
3,480000,1525,0.1204,3,0,2,0,1,2,0
4,510000,1360,0.2388,3,0,0,0,1,2,0


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize every selected column, "Close Price" included, with one scaler so
# that the same scaler can later map standardized rows back to original units.
standard_scaler = StandardScaler()
feature_matrix = min_mls_data.to_numpy()
scaled_features = standard_scaler.fit(feature_matrix).transform(feature_matrix)

# Re-attach the original row index and column names to the scaled array.
scaled_features_df = pd.DataFrame(
    data=scaled_features,
    columns=min_mls_data.columns,
    index=min_mls_data.index,
)

scaled_features_df.head()

Unnamed: 0,Close Price,Approx SqFt,Lot Size - Acres,Bedrooms,Association Fee,Address - Zip Code,Pool,Stories,Full Baths,Half Baths
0,0.22579,0.241465,0.14308,0.574006,-0.266296,-1.130717,-0.468623,-0.616885,-0.183358,1.847107
1,1.079862,1.64051,0.003674,0.574006,-0.266296,-0.053129,-0.468623,1.563242,1.35784,1.847107
2,-0.239254,-0.513486,-0.590704,-0.630825,0.645194,1.563253,-0.468623,1.563242,-0.183358,1.847107
3,-0.357478,-0.38579,-0.319918,-0.630825,-0.266296,-0.053129,-0.468623,-0.616885,-0.183358,-0.52829
4,-0.232492,-0.617327,0.180255,-0.630825,-0.266296,-1.130717,-0.468623,-0.616885,-0.183358,-0.52829


In [29]:
# Persist the standardized frame to disk. to_csv is called with its defaults, so
# the row index is written as the first CSV column.
scaled_features_df.to_csv("standardized-mls-data.csv")

In [16]:
# Show the fitted per-column scale, mean and variance (one entry per column of
# min_mls_data, in the same order).
for fitted_stat in (standard_scaler.scale_, standard_scaler.mean_, standard_scaler.var_):
    print(fitted_stat)

[2.40026613e+05 7.12629208e+02 2.36717986e-01 8.29991957e-01
 8.22828377e+01 1.85599697e+00 3.84241010e-01 4.58688932e-01
 6.48846129e-01 4.20982139e-01]
[5.65804297e+05 1.79992497e+03 1.96130332e-01 3.52357985e+00
 2.19115756e+01 2.09860665e+00 1.80064309e-01 1.28295820e+00
 2.11897106e+00 2.22400857e-01]
[5.76127750e+10 5.07840389e+05 5.60354051e-02 6.88886649e-01
 6.77046539e+03 3.44472475e+00 1.47641153e-01 2.10395536e-01
 4.21001299e-01 1.77225961e-01]


In [30]:
# Standardized values for one listing. The nine non-price columns are identical
# for both rows; only the first entry (standardized close price) differs.
shared_features = [0.1068, -0.3056, 0.5740, -0.2663, -0.5919, -0.4686, 1.5632, -0.1834, 1.8471]
predicted = standard_scaler.inverse_transform([[-0.1052] + shared_features])
actual = standard_scaler.inverse_transform([[-0.0658] + shared_features])

# Column 0 is "Close Price", so compare the two rows there in original units.
predicted_price = predicted[0][0]
actual_price = actual[0][0]
price_difference = predicted_price - actual_price
percent_error = (price_difference / actual_price) * 100
print(f"Predicted: {predicted} \nActual: {actual}")
print(f"Price Difference: ${price_difference:>0.0f} Percent Error: {percent_error:>0.1f}%")

Predicted: [[ 5.40553497e+05  1.87603377e+03  1.23789316e-01  3.99999523e+00
  -3.44126308e-04  1.00004204e+00  8.97160614e-06  1.99998074e+00
   1.99997268e+00  9.99996967e-01]] 
Actual: [[ 5.50010546e+05  1.87603377e+03  1.23789316e-01  3.99999523e+00
  -3.44126308e-04  1.00004204e+00  8.97160614e-06  1.99998074e+00
   1.99997268e+00  9.99996967e-01]]
Price Difference: $-9457 Percent Error: -1.7%
