# HomeVision

### W210 - Capstone

Dylan Jin
Lynn Liu
Andrew Beckerman


In [51]:
# Import Statements

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

### Data Import

In [52]:
# Load the dataset into `data`; every later cleaning step starts from this frame.

# Full Dataset (active choice)
data = pd.read_csv('./data_cleaned_filtered.csv')

# For House Images: alternative input file. Uncomment the line below (and
# comment out the read above) to use it instead.
# data = pd.read_csv('./data_cleaned_filtered_by_img.csv')

### Data Cleaning

In [53]:
# Restrict the table to the join key (Home_ID), the target (Price) and the
# tabular predictors used later in the notebook.
columns_to_keep = [
    'Home_ID', 'Price', 'Sqr Ft', 'Lot Size',
    'Beds', 'Bath', 'Year Built', 'State',
]
data = data[columns_to_keep]

# Record the starting sample count; the cleaning cell reports against it.
num_rows_init = len(data)
print("Initial Samples:", num_rows_init)

Initial Samples: 18929


In [54]:
# Get Null Percentage

def calculate_null_percentage(data):
    """Return the percentage of missing values for each column that has any.

    Args:
        data: DataFrame to inspect.

    Returns:
        dict mapping column name -> percentage (0-100) of rows that are null
        in that column. Columns without nulls are left out.
    """
    row_total = len(data)
    missing_counts = {name: data[name].isnull().sum() for name in data.columns}
    return {
        name: (count / row_total) * 100
        for name, count in missing_counts.items()
        if count > 0
    }

# Share of missing values per column, as a percentage of all rows.
null_percentages = data.isnull().sum().div(len(data)).mul(100)

# Optional: keep only the columns that actually have missing values.
# null_percentages = null_percentages[null_percentages > 0]

# Tabulate the result, largest share first, with a fresh 0..n-1 index.
na_data = (
    pd.DataFrame({
        'Column Name': null_percentages.index,
        'Null Percentage': null_percentages.values,
    })
    .sort_values(by='Null Percentage', ascending=False)
    .reset_index(drop=True)
)

# Optional: export the table to a CSV file.
# na_data.to_csv('null_percentage_results.csv', index=False)

# Display the result
print(na_data)

  Column Name  Null Percentage
0  Year Built         1.880712
1        Beds         0.438481
2      Sqr Ft         0.380369
3        Bath         0.295842
4     Home_ID         0.000000
5       Price         0.000000
6    Lot Size         0.000000
7       State         0.000000


In [55]:
# Discard every row that still contains at least one missing value.
data = data.dropna()

# Report how many samples remain and how many were removed, relative to the
# count recorded right after column selection.
num_rows_final = len(data)
print("Samples after cleaning:", num_rows_final)
print("Samples Dropped:", num_rows_init - num_rows_final)

Samples after cleaning: 18452
Samples Dropped: 477


In [56]:
# Inspect the cleaned table (plain-text dump; pandas elides the middle rows).
print(data)

         Home_ID    Price   Sqr Ft  Lot Size  Beds  Bath  Year Built State
0      202000003   127020   1235.0    6970.0   3.0   1.0      1954.0    WI
1      202000004   340000   2430.0    6899.0   4.0   4.0      2019.0    OK
2      202000005    52000    802.0    6251.0   2.0   1.0      1938.0    TX
3      202000007   299900   3201.0   12196.8   4.0   3.0      1959.0    IL
4      202000009   950000   5466.0   13068.0   5.0   5.0      2007.0    NV
...          ...      ...      ...       ...   ...   ...         ...   ...
18924  203929994   400000   2640.0   21344.4   4.0   3.0      1970.0    NC
18925  203929995   530000   2167.0    7505.0   3.0   2.0      2001.0    AZ
18926  203929996  1172000   4676.0   19602.0   4.0   6.0      1995.0    TX
18927  203929999   171000   1269.0    7860.0   3.0   2.0      2001.0    TX
18928  203930005  4500000  10149.0  229996.8   4.0   9.0      2004.0    NC

[18452 rows x 8 columns]


### Feature Engineering

In [57]:
# One-Hot encoding of categorical features: every column listed in
# `categorical_columns` is replaced by indicator columns named
# <column>_<value> (State_AK, State_AL, ...).
categorical_columns = ['State']
df = pd.get_dummies(data, columns=categorical_columns)

# Save an independent copy as baseline. It is taken before image features are
# merged into `df`, so it holds the tabular features only.
baseline_df = df.copy()

In [58]:
# Inspect the one-hot encoded table.
print(df)

         Home_ID    Price   Sqr Ft  Lot Size  Beds  Bath  Year Built  \
0      202000003   127020   1235.0    6970.0   3.0   1.0      1954.0   
1      202000004   340000   2430.0    6899.0   4.0   4.0      2019.0   
2      202000005    52000    802.0    6251.0   2.0   1.0      1938.0   
3      202000007   299900   3201.0   12196.8   4.0   3.0      1959.0   
4      202000009   950000   5466.0   13068.0   5.0   5.0      2007.0   
...          ...      ...      ...       ...   ...   ...         ...   
18924  203929994   400000   2640.0   21344.4   4.0   3.0      1970.0   
18925  203929995   530000   2167.0    7505.0   3.0   2.0      2001.0   
18926  203929996  1172000   4676.0   19602.0   4.0   6.0      1995.0   
18927  203929999   171000   1269.0    7860.0   3.0   2.0      2001.0   
18928  203930005  4500000  10149.0  229996.8   4.0   9.0      2004.0   

       State_AK  State_AL  State_AZ  ...  State_NY  State_OH  State_OK  \
0             0         0         0  ...         0         0 

In [39]:
# Get Null Percentage

def calculate_null_percentage(df):
    """Map each column containing nulls to the percentage of its rows that are null.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict of column name -> null percentage (0-100); columns with no
        missing values are not included.
    """
    n_rows = len(df)
    result = {}
    for name in df.columns:
        n_missing = df[name].isnull().sum()
        if not n_missing > 0:
            continue
        result[name] = (n_missing / n_rows) * 100
    return result

# Share of missing values per column of the encoded frame, as a percentage.
null_percentages = df.isnull().sum().div(len(df)).mul(100)

# Optional: keep only the columns that actually have missing values.
# null_percentages = null_percentages[null_percentages > 0]

# Tabulate the result, largest share first, with a fresh 0..n-1 index.
na_df = (
    pd.DataFrame({
        'Column Name': null_percentages.index,
        'Null Percentage': null_percentages.values,
    })
    .sort_values(by='Null Percentage', ascending=False)
    .reset_index(drop=True)
)

# Optional: export the table to a CSV file.
# na_df.to_csv('null_percentage_results.csv', index=False)

# Display the result
print(na_df)

   Column Name  Null Percentage
0      Home_ID              0.0
1     State_LA              0.0
2     State_MD              0.0
3     State_MI              0.0
4     State_MN              0.0
5     State_NC              0.0
6     State_NE              0.0
7     State_NJ              0.0
8     State_NM              0.0
9     State_NV              0.0
10    State_NY              0.0
11    State_OH              0.0
12    State_OK              0.0
13    State_OR              0.0
14    State_PA              0.0
15    State_TN              0.0
16    State_TX              0.0
17    State_VA              0.0
18    State_WA              0.0
19    State_MA              0.0
20    State_KY              0.0
21       Price              0.0
22    State_KS              0.0
23      Sqr Ft              0.0
24    Lot Size              0.0
25        Beds              0.0
26        Bath              0.0
27  Year Built              0.0
28    State_AK              0.0
29    State_AL              0.0
30    St

### Download Satellite Images

In [10]:
# Additional Import Statements
import os
import csv
import requests
import time

# Parameters
# The Google Maps API key is read from the environment so that the secret is
# never stored in the notebook.
# NOTE(review): a literal key used to be hard-coded on this line; treat it as
# leaked and rotate it.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY')
ZOOM = 20  # The highest zoom level possible for satellite images
OUTPUT_FOLDER = 'satellite_images'
DELAY_SECONDS = 2
REQUEST_TIMEOUT_SECONDS = 30  # Upper bound for one HTTP request

# Create the output folder if it doesn't exist
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def download_satellite_image(lat, lon, home_id):
    """Download one satellite tile and store it as OUTPUT_FOLDER/<home_id>.jpg.

    Files that already exist are skipped without any network access. After an
    attempted download (successful or not) the function sleeps DELAY_SECONDS
    to throttle the request rate.

    Args:
        lat: Latitude of the image centre.
        lon: Longitude of the image centre.
        home_id: Identifier used as the output file name.

    Raises:
        RuntimeError: if a download is required but GOOGLE_MAPS_API_KEY is not set.
    """
    image_path = os.path.join(OUTPUT_FOLDER, f'{home_id}.jpg')
    if os.path.exists(image_path):
        print(f'Satellite image {home_id} already downloaded.')
        return

    # Checked lazily so a run in which every image is already cached needs no key.
    if not API_KEY:
        raise RuntimeError(
            'Set the GOOGLE_MAPS_API_KEY environment variable before downloading satellite images.'
        )

    base_url = 'https://maps.googleapis.com/maps/api/staticmap?'
    params = {
        'center': f'{lat},{lon}',
        'zoom': ZOOM,
        'size': '640x640',
        'maptype': 'satellite',
        'key': API_KEY,
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Best effort: report the failure and move on to the next home.
        print(f'Failed to download satellite image {home_id}. Error: {exc}')
    else:
        if response.status_code == 200:
            with open(image_path, 'wb') as f:
                f.write(response.content)
            print(f'Satellite image {home_id} downloaded successfully.')
        else:
            print(f'Failed to download satellite image {home_id}. Status code: {response.status_code}')
    time.sleep(DELAY_SECONDS)

def main():
    """Download a satellite image for every row of data_cleaned_filtered.csv.

    Rows whose Latitude/Longitude cannot be parsed as floats are reported and
    skipped instead of aborting the whole run.
    """
    # newline='' is what the csv module expects for files it parses itself.
    with open('data_cleaned_filtered.csv', 'r', encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            home_id = row['Home_ID']
            try:
                lat, lon = float(row['Latitude']), float(row['Longitude'])
            except (TypeError, ValueError):
                print(f'Skipping {home_id}: invalid or missing coordinates.')
                continue
            download_satellite_image(lat, lon, home_id)

if __name__ == '__main__':
    main()

Satellite image 202000003 already downloaded.
Satellite image 202000004 already downloaded.
Satellite image 202000005 already downloaded.
Satellite image 202000007 already downloaded.
Satellite image 202000009 already downloaded.
Satellite image 202000012 already downloaded.
Satellite image 202000014 already downloaded.
Satellite image 202000016 already downloaded.
Satellite image 202000017 already downloaded.
Satellite image 202000018 already downloaded.
Satellite image 202000020 already downloaded.
Satellite image 202000022 already downloaded.
Satellite image 202000024 already downloaded.
Satellite image 202000025 already downloaded.
Satellite image 202000028 already downloaded.
Satellite image 202000029 already downloaded.
Satellite image 202000030 already downloaded.
Satellite image 202000032 already downloaded.
Satellite image 202000033 already downloaded.
Satellite image 202000034 already downloaded.
Satellite image 202000036 already downloaded.
Satellite image 202000038 already 

Satellite image 202002664 already downloaded.
Satellite image 202002665 already downloaded.
Satellite image 202002666 already downloaded.
Satellite image 202002667 already downloaded.
Satellite image 202002668 already downloaded.
Satellite image 202002669 already downloaded.
Satellite image 202002670 already downloaded.
Satellite image 202002673 already downloaded.
Satellite image 202002674 already downloaded.
Satellite image 202002679 already downloaded.
Satellite image 202002680 already downloaded.
Satellite image 202002681 already downloaded.
Satellite image 202002685 already downloaded.
Satellite image 202002686 already downloaded.
Satellite image 202002687 already downloaded.
Satellite image 202002689 already downloaded.
Satellite image 202002693 already downloaded.
Satellite image 202002696 already downloaded.
Satellite image 202002697 already downloaded.
Satellite image 202002698 already downloaded.
Satellite image 202002699 already downloaded.
Satellite image 202002702 already 

Satellite image 202004432 already downloaded.
Satellite image 202004433 already downloaded.
Satellite image 202004434 already downloaded.
Satellite image 202004435 already downloaded.
Satellite image 202004436 already downloaded.
Satellite image 202004437 already downloaded.
Satellite image 202004438 already downloaded.
Satellite image 202004446 already downloaded.
Satellite image 202004449 already downloaded.
Satellite image 202004450 already downloaded.
Satellite image 202004454 already downloaded.
Satellite image 202004456 already downloaded.
Satellite image 202004457 already downloaded.
Satellite image 202004458 already downloaded.
Satellite image 202004459 already downloaded.
Satellite image 202004460 already downloaded.
Satellite image 202004462 already downloaded.
Satellite image 202004465 already downloaded.
Satellite image 202004467 already downloaded.
Satellite image 202004468 already downloaded.
Satellite image 202004469 already downloaded.
Satellite image 202004470 already 

Satellite image 203900698 already downloaded.
Satellite image 203900700 already downloaded.
Satellite image 203900701 already downloaded.
Satellite image 203900702 already downloaded.
Satellite image 203900703 already downloaded.
Satellite image 203900709 already downloaded.
Satellite image 203900710 already downloaded.
Satellite image 203900712 already downloaded.
Satellite image 203900715 already downloaded.
Satellite image 203900721 already downloaded.
Satellite image 203900722 already downloaded.
Satellite image 203900723 already downloaded.
Satellite image 203900726 already downloaded.
Satellite image 203900727 already downloaded.
Satellite image 203900728 already downloaded.
Satellite image 203900729 already downloaded.
Satellite image 203900730 already downloaded.
Satellite image 203900734 already downloaded.
Satellite image 203900736 already downloaded.
Satellite image 203900737 already downloaded.
Satellite image 203900739 already downloaded.
Satellite image 203900740 already 

Satellite image 203903135 already downloaded.
Satellite image 203903136 already downloaded.
Satellite image 203903137 already downloaded.
Satellite image 203903142 already downloaded.
Satellite image 203903143 already downloaded.
Satellite image 203903144 already downloaded.
Satellite image 203903145 already downloaded.
Satellite image 203903147 already downloaded.
Satellite image 203903149 already downloaded.
Satellite image 203903150 already downloaded.
Satellite image 203903152 already downloaded.
Satellite image 203903153 already downloaded.
Satellite image 203903154 already downloaded.
Satellite image 203903155 already downloaded.
Satellite image 203903157 already downloaded.
Satellite image 203903158 already downloaded.
Satellite image 203903159 already downloaded.
Satellite image 203903160 already downloaded.
Satellite image 203903162 already downloaded.
Satellite image 203903163 already downloaded.
Satellite image 203903165 already downloaded.
Satellite image 203903167 already 

Satellite image 203905651 already downloaded.
Satellite image 203905652 already downloaded.
Satellite image 203905653 already downloaded.
Satellite image 203905654 already downloaded.
Satellite image 203905655 already downloaded.
Satellite image 203905656 already downloaded.
Satellite image 203905659 already downloaded.
Satellite image 203905662 already downloaded.
Satellite image 203905663 already downloaded.
Satellite image 203905664 already downloaded.
Satellite image 203905668 already downloaded.
Satellite image 203905670 already downloaded.
Satellite image 203905671 already downloaded.
Satellite image 203905673 already downloaded.
Satellite image 203905675 already downloaded.
Satellite image 203905677 already downloaded.
Satellite image 203905679 already downloaded.
Satellite image 203905680 already downloaded.
Satellite image 203905681 already downloaded.
Satellite image 203905682 already downloaded.
Satellite image 203905687 already downloaded.
Satellite image 203905688 already 

Satellite image 203907523 already downloaded.
Satellite image 203907529 already downloaded.
Satellite image 203907530 already downloaded.
Satellite image 203907532 already downloaded.
Satellite image 203907533 already downloaded.
Satellite image 203907534 already downloaded.
Satellite image 203907535 already downloaded.
Satellite image 203907537 already downloaded.
Satellite image 203907538 already downloaded.
Satellite image 203907539 already downloaded.
Satellite image 203907540 already downloaded.
Satellite image 203907541 already downloaded.
Satellite image 203907542 already downloaded.
Satellite image 203907543 already downloaded.
Satellite image 203907544 already downloaded.
Satellite image 203907546 already downloaded.
Satellite image 203907548 already downloaded.
Satellite image 203907552 already downloaded.
Satellite image 203907553 already downloaded.
Satellite image 203907556 already downloaded.
Satellite image 203907558 already downloaded.
Satellite image 203907559 already 

Satellite image 203908780 already downloaded.
Satellite image 203908782 already downloaded.
Satellite image 203908783 already downloaded.
Satellite image 203908784 already downloaded.
Satellite image 203908785 already downloaded.
Satellite image 203908787 already downloaded.
Satellite image 203908788 already downloaded.
Satellite image 203908789 already downloaded.
Satellite image 203908791 already downloaded.
Satellite image 203908797 already downloaded.
Satellite image 203908798 already downloaded.
Satellite image 203908800 already downloaded.
Satellite image 203908803 already downloaded.
Satellite image 203908805 already downloaded.
Satellite image 203908811 already downloaded.
Satellite image 203908812 already downloaded.
Satellite image 203908815 already downloaded.
Satellite image 203908816 already downloaded.
Satellite image 203908817 already downloaded.
Satellite image 203908818 already downloaded.
Satellite image 203908820 already downloaded.
Satellite image 203908821 already 

Satellite image 203911237 already downloaded.
Satellite image 203911239 already downloaded.
Satellite image 203911241 already downloaded.
Satellite image 203911242 already downloaded.
Satellite image 203911243 already downloaded.
Satellite image 203911244 already downloaded.
Satellite image 203911245 already downloaded.
Satellite image 203911249 already downloaded.
Satellite image 203911250 already downloaded.
Satellite image 203911253 already downloaded.
Satellite image 203911254 already downloaded.
Satellite image 203911255 already downloaded.
Satellite image 203911256 already downloaded.
Satellite image 203911257 already downloaded.
Satellite image 203911261 already downloaded.
Satellite image 203911263 already downloaded.
Satellite image 203911264 already downloaded.
Satellite image 203911267 already downloaded.
Satellite image 203911268 already downloaded.
Satellite image 203911269 already downloaded.
Satellite image 203911270 already downloaded.
Satellite image 203911271 already 

Satellite image 203913509 already downloaded.
Satellite image 203913512 already downloaded.
Satellite image 203913513 already downloaded.
Satellite image 203913514 already downloaded.
Satellite image 203913517 already downloaded.
Satellite image 203913519 already downloaded.
Satellite image 203913523 already downloaded.
Satellite image 203913524 already downloaded.
Satellite image 203913534 already downloaded.
Satellite image 203913536 already downloaded.
Satellite image 203913538 already downloaded.
Satellite image 203913539 already downloaded.
Satellite image 203913540 already downloaded.
Satellite image 203913541 already downloaded.
Satellite image 203913542 already downloaded.
Satellite image 203913546 already downloaded.
Satellite image 203913548 already downloaded.
Satellite image 203913552 already downloaded.
Satellite image 203913557 already downloaded.
Satellite image 203913559 already downloaded.
Satellite image 203913562 already downloaded.
Satellite image 203913563 already 

Satellite image 203915380 already downloaded.
Satellite image 203915381 already downloaded.
Satellite image 203915382 already downloaded.
Satellite image 203915388 already downloaded.
Satellite image 203915392 already downloaded.
Satellite image 203915393 already downloaded.
Satellite image 203915394 already downloaded.
Satellite image 203915395 already downloaded.
Satellite image 203915396 already downloaded.
Satellite image 203915397 already downloaded.
Satellite image 203915399 already downloaded.
Satellite image 203915400 already downloaded.
Satellite image 203915401 already downloaded.
Satellite image 203915403 already downloaded.
Satellite image 203915405 already downloaded.
Satellite image 203915406 already downloaded.
Satellite image 203915407 already downloaded.
Satellite image 203915408 already downloaded.
Satellite image 203915409 already downloaded.
Satellite image 203915410 already downloaded.
Satellite image 203915411 already downloaded.
Satellite image 203915416 already 

Satellite image 203916584 already downloaded.
Satellite image 203916586 already downloaded.
Satellite image 203916592 already downloaded.
Satellite image 203916593 already downloaded.
Satellite image 203916594 already downloaded.
Satellite image 203916595 already downloaded.
Satellite image 203916597 already downloaded.
Satellite image 203916599 already downloaded.
Satellite image 203916602 already downloaded.
Satellite image 203916603 already downloaded.
Satellite image 203916605 already downloaded.
Satellite image 203916606 already downloaded.
Satellite image 203916607 already downloaded.
Satellite image 203916608 already downloaded.
Satellite image 203916611 already downloaded.
Satellite image 203916612 already downloaded.
Satellite image 203916613 already downloaded.
Satellite image 203916614 already downloaded.
Satellite image 203916615 already downloaded.
Satellite image 203916616 already downloaded.
Satellite image 203916618 already downloaded.
Satellite image 203916619 already 

Satellite image 203919111 already downloaded.
Satellite image 203919112 already downloaded.
Satellite image 203919116 already downloaded.
Satellite image 203919117 already downloaded.
Satellite image 203919118 already downloaded.
Satellite image 203919119 already downloaded.
Satellite image 203919120 already downloaded.
Satellite image 203919122 already downloaded.
Satellite image 203919125 already downloaded.
Satellite image 203919129 already downloaded.
Satellite image 203919130 already downloaded.
Satellite image 203919131 already downloaded.
Satellite image 203919133 already downloaded.
Satellite image 203919134 already downloaded.
Satellite image 203919136 already downloaded.
Satellite image 203919137 already downloaded.
Satellite image 203919138 already downloaded.
Satellite image 203919139 already downloaded.
Satellite image 203919143 already downloaded.
Satellite image 203919145 already downloaded.
Satellite image 203919146 already downloaded.
Satellite image 203919148 already 

Satellite image 203921529 already downloaded.
Satellite image 203921530 already downloaded.
Satellite image 203921531 already downloaded.
Satellite image 203921533 already downloaded.
Satellite image 203921534 already downloaded.
Satellite image 203921537 already downloaded.
Satellite image 203921539 already downloaded.
Satellite image 203921543 already downloaded.
Satellite image 203921545 already downloaded.
Satellite image 203921546 already downloaded.
Satellite image 203921550 already downloaded.
Satellite image 203921553 already downloaded.
Satellite image 203921558 already downloaded.
Satellite image 203921561 already downloaded.
Satellite image 203921562 already downloaded.
Satellite image 203921565 already downloaded.
Satellite image 203921566 already downloaded.
Satellite image 203921568 already downloaded.
Satellite image 203921572 already downloaded.
Satellite image 203921574 already downloaded.
Satellite image 203921575 already downloaded.
Satellite image 203921576 already 

Satellite image 203923421 already downloaded.
Satellite image 203923422 already downloaded.
Satellite image 203923423 already downloaded.
Satellite image 203923425 already downloaded.
Satellite image 203923426 already downloaded.
Satellite image 203923427 already downloaded.
Satellite image 203923429 already downloaded.
Satellite image 203923431 already downloaded.
Satellite image 203923432 already downloaded.
Satellite image 203923434 already downloaded.
Satellite image 203923437 already downloaded.
Satellite image 203923440 already downloaded.
Satellite image 203923442 already downloaded.
Satellite image 203923443 already downloaded.
Satellite image 203923444 already downloaded.
Satellite image 203923447 already downloaded.
Satellite image 203923451 already downloaded.
Satellite image 203923452 already downloaded.
Satellite image 203923453 already downloaded.
Satellite image 203923456 already downloaded.
Satellite image 203923457 already downloaded.
Satellite image 203923458 already 

Satellite image 203924660 already downloaded.
Satellite image 203924662 already downloaded.
Satellite image 203924665 already downloaded.
Satellite image 203924666 already downloaded.
Satellite image 203924667 already downloaded.
Satellite image 203924670 already downloaded.
Satellite image 203924673 already downloaded.
Satellite image 203924676 already downloaded.
Satellite image 203924677 already downloaded.
Satellite image 203924679 already downloaded.
Satellite image 203924682 already downloaded.
Satellite image 203924683 already downloaded.
Satellite image 203924684 already downloaded.
Satellite image 203924685 already downloaded.
Satellite image 203924686 already downloaded.
Satellite image 203924687 already downloaded.
Satellite image 203924688 already downloaded.
Satellite image 203924689 already downloaded.
Satellite image 203924700 already downloaded.
Satellite image 203924701 already downloaded.
Satellite image 203924702 already downloaded.
Satellite image 203924703 already 

Satellite image 203927139 already downloaded.
Satellite image 203927143 already downloaded.
Satellite image 203927147 already downloaded.
Satellite image 203927148 already downloaded.
Satellite image 203927152 already downloaded.
Satellite image 203927153 already downloaded.
Satellite image 203927156 already downloaded.
Satellite image 203927157 already downloaded.
Satellite image 203927158 already downloaded.
Satellite image 203927161 already downloaded.
Satellite image 203927163 already downloaded.
Satellite image 203927165 already downloaded.
Satellite image 203927166 already downloaded.
Satellite image 203927169 already downloaded.
Satellite image 203927171 already downloaded.
Satellite image 203927172 already downloaded.
Satellite image 203927174 already downloaded.
Satellite image 203927175 already downloaded.
Satellite image 203927177 already downloaded.
Satellite image 203927179 already downloaded.
Satellite image 203927184 already downloaded.
Satellite image 203927185 already 

Satellite image 203929583 already downloaded.
Satellite image 203929584 already downloaded.
Satellite image 203929585 already downloaded.
Satellite image 203929586 already downloaded.
Satellite image 203929587 already downloaded.
Satellite image 203929588 already downloaded.
Satellite image 203929589 already downloaded.
Satellite image 203929590 already downloaded.
Satellite image 203929593 already downloaded.
Satellite image 203929595 already downloaded.
Satellite image 203929596 already downloaded.
Satellite image 203929599 already downloaded.
Satellite image 203929600 already downloaded.
Satellite image 203929602 already downloaded.
Satellite image 203929604 already downloaded.
Satellite image 203929607 already downloaded.
Satellite image 203929609 already downloaded.
Satellite image 203929610 already downloaded.
Satellite image 203929611 already downloaded.
Satellite image 203929615 already downloaded.
Satellite image 203929616 already downloaded.
Satellite image 203929617 already 

In [40]:
# Image location: folder holding the <Home_ID>.jpg files that the feature
# extraction step reads. Keep exactly one of the assignments below active.
# NOTE(review): the download cell writes to 'satellite_images', and no visible
# active cell copies files into the folder selected here — confirm it is populated.
# House images
# image_folder = "./preprocessed_images"
# Satellite images
image_folder = "./preprocessed_satellite_images"

In [12]:
# Save the preprocessed images for future use (Comment to bypass)
# NOTE(review): this cell is fully commented out, so it currently does nothing.
# If re-enabled as written, source and destination are the same folder
# (preprocessed_image_folder = image_folder), so each file would be resized to
# 224x224 and saved back over itself. `image` is only imported in a later cell.
# preprocessed_image_folder = image_folder
# os.makedirs(preprocessed_image_folder, exist_ok=True)

# for home_id in data['Home_ID']:
#     image_filename = f"{home_id}.jpg"
#     image_path = os.path.join(image_folder, image_filename)
#     preprocessed_image_path = os.path.join(preprocessed_image_folder, image_filename)
#     if os.path.exists(image_path):
#         img = image.load_img(image_path, target_size=(224, 224))
#         img.save(preprocessed_image_path)

### Extract Image Features

In [13]:
# Additional Import Statements
import os
import tensorflow as tf 
from PIL import Image
from tensorflow import keras
from tensorflow.keras.preprocessing import image

# Load the EfficientNetV2-L backbone with ImageNet weights. include_top=False
# drops the classification head and pooling='avg' average-pools the final
# feature map, so the model outputs one feature vector per image.
model = tf.keras.applications.efficientnet_v2.EfficientNetV2L(weights='imagenet', include_top=False, pooling='avg')

# Define a function to extract image features using EfficientNet
def extract_image_features(image_path):
    """Return the pooled EfficientNetV2-L features of one image as a 1-D array.

    Args:
        image_path: Path of the image file; it is loaded resized to 224x224.

    Returns:
        1-D numpy array with the flattened model output for that image.
    """
    img = image.load_img(image_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch axis the model expects
    # Use the preprocessing function that belongs to the loaded (V2) model family.
    x = tf.keras.applications.efficientnet_v2.preprocess_input(x)
    # verbose=0 suppresses the progress bar Keras would otherwise print per call.
    features = model.predict(x, verbose=0)
    return features.flatten()  # Flatten the features to a 1-dimensional array

# Create a list to store image features and corresponding Home_ID
image_features_list = []
home_id_list = []

# Iterate over the dataframe and extract image features for each image
for home_id in df['Home_ID']:
    image_filename = f"{home_id}.jpg"
    image_path = os.path.join(image_folder, image_filename)
    if os.path.exists(image_path):
        features = extract_image_features(image_path)
        image_features_list.append(features)
        home_id_list.append(home_id)
    else:
        print(f"Image not found for Home_ID: {home_id}")

# Fail with a clear message instead of an IndexError on image_features_list[0]
# when not a single image could be located.
if not image_features_list:
    raise FileNotFoundError(f"No images found in {image_folder!r} for any Home_ID in df")

# Create a new dataframe with extracted image features
image_features_df = pd.DataFrame(image_features_list, columns=[f"img_feature_{i}" for i in range(image_features_list[0].shape[0])])
image_features_df['Home_ID'] = home_id_list













































































































































































































































In [59]:
# Merge the original dataframe with the image features dataframe using Home_ID.
# pd.merge defaults to an inner join, so homes without extracted image features
# are dropped from `df` here; `baseline_df` is not touched by this cell.
# NOTE(review): not idempotent — running the cell twice merges the features
# into the already-merged `df` again.
df = pd.merge(df, image_features_df, on='Home_ID')

In [60]:
# Show dataframes: the tabular-only baseline next to the frame that now also
# carries the merged image features.
print('Baseline:')
print(baseline_df)
print('With Image Features:')
print(df)

Baseline:
         Home_ID    Price   Sqr Ft  Lot Size  Beds  Bath  Year Built  \
0      202000003   127020   1235.0    6970.0   3.0   1.0      1954.0   
1      202000004   340000   2430.0    6899.0   4.0   4.0      2019.0   
2      202000005    52000    802.0    6251.0   2.0   1.0      1938.0   
3      202000007   299900   3201.0   12196.8   4.0   3.0      1959.0   
4      202000009   950000   5466.0   13068.0   5.0   5.0      2007.0   
...          ...      ...      ...       ...   ...   ...         ...   
18924  203929994   400000   2640.0   21344.4   4.0   3.0      1970.0   
18925  203929995   530000   2167.0    7505.0   3.0   2.0      2001.0   
18926  203929996  1172000   4676.0   19602.0   4.0   6.0      1995.0   
18927  203929999   171000   1269.0    7860.0   3.0   2.0      2001.0   
18928  203930005  4500000  10149.0  229996.8   4.0   9.0      2004.0   

       State_AK  State_AL  State_AZ  ...  State_NY  State_OH  State_OK  \
0             0         0         0  ...         0 

### Modeling

In [63]:
# Drop Home_ID column from model input. Home_ID is the key used to join the
# image features, so it must still be present when the merge cell runs.
# NOTE(review): not idempotent — a second run raises KeyError because the
# column is already gone.
columns_to_drop = ['Home_ID']
df = df.drop(columns=columns_to_drop) 
baseline_df = baseline_df.drop(columns=columns_to_drop) 

In [64]:
# Targets (Price) for the baseline and the image-augmented feature sets.
y_base = baseline_df['Price']
y = df['Price']


def _label_encode(column):
    """Cast one column to strings and map each distinct string to an integer code."""
    return LabelEncoder().fit_transform(column.astype(str))


# Baseline: numeric columns pass through unchanged, every non-numeric column
# is label-encoded, and the two parts are put back side by side.
X_base = baseline_df.drop(columns='Price')
X_base_num = X_base.select_dtypes(include='number').copy()
X_base_cat = X_base.select_dtypes(exclude='number').copy().apply(_label_encode)
X_base_1 = pd.concat([X_base_num, X_base_cat], axis=1)

# With Images: same treatment for the frame that includes image features.
X = df.drop(columns='Price')
X_num = X.select_dtypes(include='number').copy()
X_cat = X.select_dtypes(exclude='number').copy().apply(_label_encode)
X_1 = pd.concat([X_num, X_cat], axis=1)

In [67]:
# Split training and test datasets (80/20): test_size=0.2 with a fixed
# random_state so the shuffle is repeatable.
# NOTE(review): the two splits are drawn independently. If the merge dropped
# any homes, X_1 has fewer rows than X_base_1 and the two test sets would not
# contain the same homes, so the two error figures would not be measured on
# identical samples — confirm.

# Baseline
X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_base_1, y_base, test_size=0.2, random_state=42)

# With Images
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.2, random_state=42)

In [70]:
# Remove the column whose label is the integer 0 from the image-model inputs;
# the baseline inputs are left untouched.
# NOTE(review): none of the visible cells creates a column labelled 0 (the
# image features are named img_feature_<i>) — confirm where it comes from.
# NOTE(review): not idempotent — re-running raises KeyError once the column is gone.
X_train = X_train.drop(0, axis=1)
X_test = X_test.drop(0, axis=1)

### Gradient Boosting Model

In [None]:
# Additional Import Statements
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters shared by both models. random_state makes repeated runs
# reproducible, matching the seeded train/test split.
GB_PARAMS = {'max_depth': 5, 'n_estimators': 1000, 'learning_rate': 0.1, 'random_state': 42}

# Baseline
# Each model gets its own estimator instance: fit() returns the estimator
# itself, so fitting one shared instance twice would leave `model_base`
# pointing at the model trained on the image features.
model_base = GradientBoostingRegressor(**GB_PARAMS).fit(X_base_train, y_base_train)
y_base_pred = model_base.predict(X_base_test)

# With Images
GB_model = GradientBoostingRegressor(**GB_PARAMS)
model_1 = GB_model.fit(X_train, y_train)
y_pred = model_1.predict(X_test)

# Median Error
# The values are absolute relative errors expressed as fractions
# (0.10 means 10%); they are not multiplied by 100.

# Baseline
base_percentage_error = np.abs((y_base_test - y_base_pred) / y_base_test)
base_median_percentage_error = np.median(base_percentage_error)
print("Baseline GBM Median Percentage Error:", base_median_percentage_error)

# With Images
percentage_error = np.abs((y_test - y_pred) / y_test)
median_percentage_error = np.median(percentage_error)
print("GBM Median Percentage Error with Images:", median_percentage_error)

### Top 20 Features

In [None]:
# Show the 20 most important features of the "With Images" model
import matplotlib.pyplot as plt

# Importance score per input column, as reported by the fitted model,
# alongside the matching column names
feature_importances = model_1.feature_importances_
feature_names = X_train.columns

# Positions ordered from the highest importance to the lowest
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# Keep only the 20 leading entries
top_20_importances = sorted_importances[:20]
top_20_feature_names = sorted_feature_names[:20]

# Bar chart: one bar per feature, feature names as rotated x tick labels
bar_positions = range(len(top_20_importances))
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(bar_positions, top_20_importances, align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(top_20_feature_names, rotation=90)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance Score')
ax.set_title('Top 20 Feature Importances')
plt.show()

### Scatterplot

In [None]:
def _plot_actual_vs_predicted(actual, predicted, color, title):
    """Scatter predictions against actual values with a dashed y = x line.

    The reference line spans the minimum to the maximum of `actual`, so points
    on the line are perfect predictions.
    """
    plt.scatter(actual, predicted, color=color, alpha=0.5)
    lowest, highest = min(actual), max(actual)
    plt.plot([lowest, highest], [lowest, highest], 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(title)
    plt.show()


# Scatter plot for Baseline Model
_plot_actual_vs_predicted(
    y_base_test, y_base_pred, 'blue',
    'Baseline Model: Actual vs. Predicted Values',
)

# Scatter plot for Model with Images
_plot_actual_vs_predicted(
    y_test, y_pred, 'red',
    'Model with Images: Actual vs. Predicted Values',
)

### Histogram

In [None]:
# One 20-bin histogram of the per-sample percentage errors for each model:
# the baseline first, then the model with images.
histogram_specs = (
    (base_percentage_error, 'blue', 'Baseline Model: Percentage Error Histogram'),
    (percentage_error, 'red', 'Model with Images: Percentage Error Histogram'),
)

for hist_errors, hist_color, hist_title in histogram_specs:
    plt.hist(hist_errors, bins=20, color=hist_color, alpha=0.5)
    plt.xlabel('Percentage Error')
    plt.ylabel('Frequency')
    plt.title(hist_title)
    plt.show()

### Box Plot

In [None]:
# Horizontal box plots of the per-sample percentage errors, outliers shown.

# Box Plot for Baseline Model
fig, ax = plt.subplots()
ax.boxplot(base_percentage_error, vert=False, labels=['Baseline GBM'], showfliers=True)
ax.set_xlabel('Percentage Error')
ax.set_title('Baseline Model: Percentage Error Box Plot')
plt.show()

# Box Plot for Model with Images
fig, ax = plt.subplots()
ax.boxplot(percentage_error, vert=False, labels=['GBM with Images'], showfliers=True)
ax.set_xlabel('Percentage Error')
ax.set_title('Model with Images: Percentage Error Box Plot')
plt.show()

### Residuals

In [None]:
# Residuals are actual minus predicted, so a positive residual means the
# model predicted below the actual price.
residuals_base = y_base_test - y_base_pred
residuals_model = y_test - y_pred

# One residual-vs-prediction scatter per model, with a dashed zero line
residual_specs = (
    (y_base_pred, residuals_base, 'blue', 'Baseline Model: Residual Plot'),
    (y_pred, residuals_model, 'red', 'Model with Images: Residual Plot'),
)

for resid_predicted, resid_values, resid_color, resid_title in residual_specs:
    plt.scatter(resid_predicted, resid_values, color=resid_color, alpha=0.5)
    plt.axhline(y=0, color='black', linestyle='--', lw=2)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title(resid_title)
    plt.show()

### Learning Curve

In [None]:
# learning_curve is not brought in by the notebook's import cell, so it is
# imported here; without this the cell fails with NameError.
from sklearn.model_selection import learning_curve

# Each call below refits the gradient boosting estimator for every
# training-set size and each of the 5 cross-validation folds, so this cell
# takes much longer than a single model fit.

# Learning Curve for Baseline Model
train_sizes, train_scores, test_scores = learning_curve(GB_model, X_base_train, y_base_train, cv=5, scoring='neg_mean_squared_error')
# The scorer returns negated MSE; flip the sign and average across folds
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_scores_mean, 'o-', color='blue', label='Training Error')
plt.plot(train_sizes, test_scores_mean, 'o-', color='orange', label='Validation Error')
plt.xlabel('Training Set Size')
plt.ylabel('Mean Squared Error')
plt.title('Baseline Model: Learning Curve')
plt.legend()
plt.show()

# Learning Curve for Model with Images
# Note: train_sizes / train_scores / test_scores and their means are reused,
# so after this cell they hold the values for the model with images.
train_sizes, train_scores, test_scores = learning_curve(GB_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_scores_mean, 'o-', color='red', label='Training Error')
plt.plot(train_sizes, test_scores_mean, 'o-', color='purple', label='Validation Error')
plt.xlabel('Training Set Size')
plt.ylabel('Mean Squared Error')
plt.title('Model with Images: Learning Curve')
plt.legend()
plt.show()