In [146]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Data Ingestion

In [147]:
data_path = "HousesInfo.txt"
columns = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
data = pd.read_csv(data_path, sep=" ", names=columns)
data.head()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226


In [148]:
len(data)

535

# Exploratory Data Analysis

Total Number of ZIP codes:

In [149]:
len(data.zipcode.unique())

49

Explore the repetition of ZIP codes:

In [150]:
data.zipcode.value_counts()

92276    100
93510     60
93446     54
92880     49
94501     41
91901     32
92677     26
94531     22
96019     12
85255     12
92021     11
85266     11
93111     11
81524     11
95220     10
92802      9
85262      9
62234      7
62214      4
98021      4
85377      3
91752      3
60002      3
81418      2
62025      2
92253      2
60016      2
92692      2
90265      1
62034      1
62088      1
91915      1
94565      1
95008      1
90803      1
90038      1
93314      1
93720      1
93924      1
92040      1
90211      1
94568      1
92543      1
62249      1
85331      1
93105      1
60046      1
36372      1
81521      1
Name: zipcode, dtype: int64

Exploring the distribution of frequencies:

In [151]:
data.zipcode.value_counts().value_counts()

1      21
2       5
11      4
3       3
4       2
9       2
12      2
22      1
60      1
26      1
10      1
32      1
7       1
41      1
49      1
54      1
100     1
Name: zipcode, dtype: int64

In 21 opportunities a ZIP code is only included once, perhaps including values that aren't so common won't add much information to the analysis, therefore the decision is taken to remove frequencies lower than 10.

In [152]:
mask_zip_frequency = data.zipcode.value_counts() >= 10
list_frequent_zip_codes = data.zipcode.value_counts()[mask_zip_frequency].index.tolist()
list_frequent_zip_codes

[92276,
 93510,
 93446,
 92880,
 94501,
 91901,
 92677,
 94531,
 96019,
 85255,
 92021,
 85266,
 93111,
 81524,
 95220]

Total number of ZIP codes:

In [153]:
len(list_frequent_zip_codes)

15

Exclude rows that don't belong to the filtered list of ZIP codes:

In [154]:
mask_filtered_zip_codes =  data.zipcode.isin(list_frequent_zip_codes)
data = data[mask_filtered_zip_codes]
data.head()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85255,869500
2,3,4.0,3923,85266,889000
4,3,4.0,4116,85266,971226
5,4,5.0,4581,85266,1249000
7,4,5.0,5524,85266,1698000


Let's explore other predictors:

In [155]:
data.area.describe()

count     462.000000
mean     2289.729437
std      1127.011770
min       701.000000
25%      1416.250000
50%      1978.500000
75%      3010.000000
max      9536.000000
Name: area, dtype: float64

In [156]:
print(data.shape)

(462, 5)


In [157]:
mask_area = data.area < 5000
data = data[mask_area]
print(data.shape)

(454, 5)


In [158]:
fig = make_subplots(
    rows=2, cols=1,
    #shared_xaxes=True
    x_title='Area in Squared Meters',
)

fig.add_trace(
    go.Histogram(x=data.area),
    row=1, col=1,
)

fig.add_trace(
    go.Box(x=data.area, name=""),
    row=2, col=1,
)

fig.update_layout(
    title_text="Area Distribution",
    showlegend=False
)

fig.show()

Exploring number of bedrooms:




In [159]:
data.bedrooms.value_counts().sort_index()

1       4
2     119
3     154
4     112
5      50
6      10
7       3
8       1
10      1
Name: bedrooms, dtype: int64

In [160]:
print(data.shape)

(454, 5)


In [161]:
mask_bedrooms = data.bedrooms < 6
data = data[mask_bedrooms]
print(data.shape)

(439, 5)


In [162]:
fig = px.bar(
    data.bedrooms.value_counts().sort_index(),
    title="Distribution Bedrooms",
    labels={
        "value":"Count",
        "index":"Number of Bedrooms"
    },
    text_auto='.2s'

)

fig.update_layout(
    showlegend=False
)

fig.show()

Exploring number of bathrooms:

In [163]:
data.bathrooms.value_counts().sort_index()

1.00     22
1.50     11
2.00    198
2.50     52
3.00     78
3.25      1
3.50     30
4.00     27
4.50      9
5.00     11
Name: bathrooms, dtype: int64

In [164]:
fig = px.bar(
    data.bathrooms.value_counts().sort_index(),
    title="Distribution Bathrooms",
    labels={
        "value":"Count",
        "index":"Number of Bathrooms"
    },
    text_auto='.2s'

)

fig.update_layout(
    showlegend=False
)

Exploring target variable:

In [165]:
data.price.describe()

count    4.390000e+02
mean     5.525664e+05
std      4.618657e+05
min      2.200000e+04
25%      2.350000e+05
50%      5.200000e+05
75%      6.992500e+05
max      5.858000e+06
Name: price, dtype: float64

In [166]:
print(data.shape)

(439, 5)


In [167]:
mask_price = data.price < 1400000
data = data[mask_price]
print(data.shape)

(420, 5)


In [168]:
fig = make_subplots(
    rows=2, cols=1,
    #shared_xaxes=True
    x_title='Value in USD',
)

fig.add_trace(
    go.Histogram(x=data.price),
    row=1, col=1,
)

fig.add_trace(
    go.Box(x=data.price, name=""),
    row=2, col=1,
)

fig.update_layout(
    title_text="Price Distribution",
    showlegend=False
)

fig.show()

Data Partition

In [169]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [170]:
labels = data.price
features = data[['bedrooms', 'bathrooms', 'area', 'zipcode']]


In [171]:
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=features.zipcode,
    random_state=7
)

In [172]:
print(train_features.shape, train_labels.shape)
print(test_features.shape, test_labels.shape)

(336, 4) (336,)
(84, 4) (84,)


In [173]:
list_ordered_zip_codes = train_features.zipcode.value_counts().sort_index().index.tolist()
list_ordered_zip_codes = [str(element) for element in list_ordered_zip_codes]

fig = go.Figure(data=[
    go.Bar(
        name='Train Set',
        x=list_ordered_zip_codes,
        y=train_features.zipcode.value_counts(normalize=True).sort_index().values.tolist()

    ),
    go.Bar(
        name='Test Set',
        x=list_ordered_zip_codes,
        y=test_features.zipcode.value_counts(normalize=True).sort_index().values.tolist()
    )
])
fig.update_layout(
    barmode='group',
    title="Zip Code Percentage Distribution in Tran-Test Splits",
    yaxis=dict(
        title='Percentage %',
    ),
    xaxis=dict(
        title='ZIP Code',
    ),
)
fig.show()

In [174]:
tabular_columns = ['bedrooms', 'bathrooms', 'area']

max_tabular_values = train_features.loc[:, tabular_columns].max()
max_tabular_values

bedrooms        5.0
bathrooms       5.0
area         4829.0
dtype: float64

In [175]:
train_tabular_features = train_features.loc[:, tabular_columns]
train_tabular_features = train_tabular_features.div(max_tabular_values)

test_tabular_features = test_features.loc[:, tabular_columns]
test_tabular_features = test_tabular_features.div(max_tabular_values)


train_tabular_features.head()

Unnamed: 0,bedrooms,bathrooms,area
193,0.4,0.2,0.172707
187,0.8,0.4,0.338372
471,0.4,0.4,0.258439
470,0.8,0.7,0.432802
159,0.8,0.7,0.600538


In [176]:
train_zip_codes = pd.get_dummies(train_features.zipcode)
test_zip_codes = pd.get_dummies(test_features.zipcode)

print(train_tabular_features.shape, train_zip_codes.shape)
print(test_tabular_features.shape, test_zip_codes.shape)

(336, 3) (336, 15)
(84, 3) (84, 15)


In [177]:
processed_train_features = pd.concat(
    [
        train_tabular_features,
        train_zip_codes
    ],
    axis=1,
).reset_index(drop=True)

print(processed_train_features.shape)
processed_train_features.head()

(336, 18)


Unnamed: 0,bedrooms,bathrooms,area,81524,85255,85266,91901,92021,92276,92677,92880,93111,93446,93510,94501,94531,95220,96019
0,0.4,0.2,0.172707,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0.8,0.4,0.338372,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0.4,0.4,0.258439,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.8,0.7,0.432802,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0.8,0.7,0.600538,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [178]:
processed_test_features = pd.concat(
    [
        test_tabular_features,
        test_zip_codes
    ],
    axis=1
).reset_index(drop=True)

print(processed_test_features.shape)
processed_test_features.head()

(84, 18)


Unnamed: 0,bedrooms,bathrooms,area,81524,85255,85266,91901,92021,92276,92677,92880,93111,93446,93510,94501,94531,95220,96019
0,1.0,0.6,0.628287,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0.4,0.4,0.458273,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0.2,0.2,0.145165,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0.6,0.5,0.376683,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,1.0,0.7,0.59619,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [179]:
train_preprocessed_labels = train_labels / train_labels.max()
test_preprocessed_labels = test_labels / train_labels.max()

In [180]:
from sklearn.linear_model import LinearRegression

In [181]:
model = LinearRegression()
model.fit(processed_train_features.values, train_labels.values)


In [182]:
model.score(processed_train_features.values, train_labels.values)

0.8500733396849876

In [183]:
model.coef_

array([  83805.66044844,  139111.95352615,  786517.25215476,
       -129453.62744828,   70557.80709788,  -21814.20982831,
         50193.71168876,  -11110.19569113, -292974.80524095,
        267615.16957124, -250259.31459681,  421389.43595873,
        -26408.66327449,  -72848.92343282,  329118.94388339,
       -172345.70566049,   82392.21195863, -244051.83498532])

In [184]:
model.intercept_

103975.3382056327

In [185]:
processed_train_features.values.shape

(336, 18)

In [186]:
train_labels.values.shape

(336,)

In [187]:
train_preds = model.predict(processed_train_features.values)
test_preds = model.predict(processed_test_features.values)

In [188]:
num_test_cases = [i for i in range(1, 85)]


fig = go.Figure(data=[
    go.Bar(
        name='Predictions',
        x=num_test_cases,
        y=test_preds.tolist()

    ),
    go.Bar(
        name='Test Set',
        x=num_test_cases,
        y=test_labels.tolist()
    )
])
fig.update_layout(
    barmode='group',
    title="Comparisson Predictions vs Ground Truth",
    yaxis=dict(
        title='House Value',
    ),
    xaxis=dict(
        title='Samples',
    ),
)
fig.show()

In [189]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error


train_score = mean_squared_error(train_preds, train_labels.values, squared=True)
test_score = mean_squared_error(test_preds, test_labels.values, squared=True)

print(f"Train RMSE: {train_score:.4f} - Test RMSE: {test_score:.4f}")

Train RMSE: 13476324324.4274 - Test RMSE: 16063356732.1045


In [237]:
from sklearn.metrics import mean_absolute_percentage_error


train_mape_score = mean_absolute_percentage_error(train_preds, train_labels.values)
test_mape_score = mean_absolute_percentage_error(test_preds, test_labels.values)
print(f"Train MAPE: {train_mape_score:.4f} - Test RMSE: {test_mape_score:.4f}")

Train MAPE: 0.5311 - Test RMSE: 0.1895


In [190]:
train_score = model.score(processed_train_features.values, train_labels.values)
test_score = model.score(processed_test_features.values, test_labels.values)
# Train score: 0.5168 - Test score: 0.7011
print(f"Train score: {train_score:.4f} - Test score: {test_score:.4f}")

Train score: 0.8501 - Test score: 0.8031


In [218]:
def performance_report(input_predictions, input_labels):
  rmse_score = mean_squared_error(input_predictions, input_labels, squared=True)
  model_score = model.score(input_predictions, input_labels)
  return model_score, rmse_score


list_subset_model_score = []
list_subset_rmse_score = []
list_zip_codes = test_features.zipcode.unique().tolist()



for row_zipcode in list_zip_codes:
  mask_zipcode = test_features.zipcode == row_zipcode
  mask_zipcode.reset_index(drop=True, inplace=True)
  subset_features = processed_test_features[mask_zipcode]
  subset_labels = test_labels.reset_index(drop=True)[mask_zipcode]
  subset_predictions = model.predict(subset_features.values)

  subset_model_score = model.score(subset_features.values, subset_labels.values)
  list_subset_model_score.append(subset_model_score)
  subset_rmse_score = mean_squared_error(subset_predictions, subset_labels, squared=True)
  list_subset_rmse_score.append(subset_rmse_score)

  print(f"Performance {row_zipcode}: RMSE: {subset_rmse_score:.4f} - Score: {subset_model_score:.4f}")

Performance 92880: RMSE: 8759493408.4564 - Score: -13.2739
Performance 93446: RMSE: 8222055567.8921 - Score: 0.5377
Performance 94501: RMSE: 60730359457.9430 - Score: 0.3690
Performance 92677: RMSE: 19531291309.2610 - Score: 0.4662
Performance 85255: RMSE: 48287352847.5009 - Score: 0.2088
Performance 92276: RMSE: 1848388776.2587 - Score: -0.3094
Performance 91901: RMSE: 2078209235.2062 - Score: 0.9477
Performance 93510: RMSE: 5386745615.8051 - Score: 0.8270
Performance 94531: RMSE: 7617712443.7815 - Score: -0.7830
Performance 96019: RMSE: 1010536704.0535 - Score: -2.3186
Performance 92021: RMSE: 6378691727.1914 - Score: -8.8482
Performance 81524: RMSE: 22050638414.0854 - Score: 0.2370
Performance 93111: RMSE: 4718037791.1744 - Score: nan
Performance 85266: RMSE: 164167344418.4291 - Score: -3.8019
Performance 95220: RMSE: 1427534.7195 - Score: nan



R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.



In [219]:
report_dict = {
    "zipcode": list_zip_codes,
    "score": list_subset_model_score,
    "rmse": list_subset_rmse_score
}

report_df = pd.DataFrame(report_dict)
report_df.head()

Unnamed: 0,zipcode,score,rmse
0,92880,-13.273938,8759493000.0
1,93446,0.537685,8222056000.0
2,94501,0.369012,60730360000.0
3,92677,0.466181,19531290000.0
4,85255,0.208841,48287350000.0


In [235]:
list_zip_codes = report_df.zipcode.tolist()
list_zip_codes =  [str(value) for value in list_zip_codes]


fig = px.bar(
    x = list_zip_codes,
    y = report_df.rmse,
    title="Distribution Scores with Different Zipcodes",
    labels={
        "x":"Zipcode",
        "y":"Score Value"
    },
    text_auto='.2s'

)

fig.update_layout(
    showlegend=False
)