In [7]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
%matplotlib inline

In [8]:
# Source CSV for the housing data, hosted in the project's GitHub repository.
url = "https://raw.githubusercontent.com/budomike/C964Dataset/main/housing_price_dataset.csv"

# Indicator columns that get_dummies produces for the "Neighborhood" column.
dummy_columns = ['Neighborhood_Rural', 'Neighborhood_Suburb', 'Neighborhood_Urban']

# Read the data, expand "Neighborhood" into one indicator column per value,
# then store those indicators as integers (0/1).
df = (
    pd.read_csv(url)
    .pipe(pd.get_dummies, columns=["Neighborhood"], drop_first=False)
    .astype({column: int for column in dummy_columns})
)

In [9]:
# First and third quartile of 'Price', and the interquartile range between them
Q1, Q3 = df['Price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Outlier fences sit 1.5 * IQR outside the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask: True where 'Price' falls strictly outside the fences
outliers = df['Price'].lt(lower_bound) | df['Price'].gt(upper_bound)

# Keep only the rows that are not flagged as outliers
df_no_outliers = df.loc[~outliers]

# Aliases used later when reporting the before/after shapes
df_shape, df_no_outliers_shape = df, df_no_outliers

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define features and target variable
X = df_no_outliers.drop("Price", axis=1)
y = df_no_outliers["Price"]

# The neighborhood indicators are already 0/1 columns (created by get_dummies).
categorical_features = ['Neighborhood_Rural', 'Neighborhood_Suburb', 'Neighborhood_Urban']

# Numeric features are every numeric column EXCEPT the neighborhood indicators.
# The indicators are stored as integers, so a plain dtype filter would select
# them too and they would be both standard-scaled and encoded a second time.
# 'number' is used instead of explicit widths so the selection does not depend
# on the platform's default integer size.
numeric_features = (
    X.select_dtypes(include='number')
    .columns
    .difference(categorical_features, sort=False)
)

# Create preprocessing pipeline:
#   - scale the continuous features,
#   - pass the already-encoded indicator columns through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features)
    ],
    remainder='passthrough'
)

# Create linear regression model
model = LinearRegression()

# Create a pipeline that applies preprocessing and then fits the model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

In [13]:
import ipywidgets as widgets
from IPython.display import display

# Test-set residuals (actual minus predicted), plotted by the "Residuals" button.
residuals = y_test - y_pred

# Shared output area that every button callback renders into.
output = widgets.Output()
def predict_price(square_feet, bedrooms, bathrooms, neighborhood, year_built):
    """Print the fitted pipeline's price prediction for a single house.

    Parameters
    ----------
    square_feet, bedrooms, bathrooms, year_built
        Values placed in the "SquareFeet", "Bedrooms", "Bathrooms" and
        "YearBuilt" columns of the one-row input frame.
    neighborhood
        Compared against "Rural", "Suburb" and "Urban"; the matching
        "Neighborhood_*" indicator is set to 1 and the others to 0.

    Returns
    -------
    None
        The prediction is printed, not returned.
    """
    row = {
        "SquareFeet": square_feet,
        "Bedrooms": bedrooms,
        "Bathrooms": bathrooms,
    }
    # One indicator column per known neighborhood label.
    for label in ("Rural", "Suburb", "Urban"):
        row[f"Neighborhood_{label}"] = int(neighborhood == label)
    row["YearBuilt"] = year_built

    # Single-row frame in the same column layout the original code built.
    new_data = pd.DataFrame([row])

    prediction_array = pipeline.predict(new_data)
    print(f"The predicted price of this house is: ${prediction_array[0]:.0f}")
    

# Input controls for the house attributes fed to predict_price
square_feet_input = widgets.IntSlider(
    value=2000, min=0, max=5000, description="Square Feet:"
)
bedrooms_input = widgets.IntSlider(
    value=3, min=0, max=10, description="Bedrooms:"
)
bathrooms_input = widgets.IntSlider(
    value=2, min=0, max=5, description="Bathrooms:"
)
neighborhood_input = widgets.Dropdown(
    value="Rural", options=["Rural", "Suburb", "Urban"], description="Neighborhood:"
)
year_built_input = widgets.IntSlider(
    value=2000, min=1900, max=2023, description="Year Built:"
)

# One button per action; each is wired to its callback further down
(
    predict_button,
    heatmap_button,
    histogram_button,
    outliers_button,
    rep_button,
) = (
    widgets.Button(description=label)
    for label in ("Predict Price", "Heatmap", "Residuals", "Outliers", "Accuracy Metrics")
)

# Define callback function for button click
def on_predict_button_click(b):
    """Handle a click on the "Predict Price" button.

    Reads the current value of every input widget, clears the shared output
    area, and prints the prediction for those values into it. The button
    instance passed by ipywidgets (`b`) is not used.
    """
    # Snapshot the widget values before touching the output area.
    inputs = (
        square_feet_input.value,
        bedrooms_input.value,
        bathrooms_input.value,
        neighborhood_input.value,
        year_built_input.value,
    )

    with output:
        # wait=True defers the clear until new content arrives
        output.clear_output(wait=True)
        predict_price(*inputs)

def on_heatmap_button_click(b):
    """Handle a click on the "Heatmap" button.

    Clears the shared output area and draws an annotated correlation heatmap
    of every column in `df` into it. The button argument `b` is not used.
    """
    with output:
        output.clear_output(wait=True)
        _, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            df.corr(),
            annot=True,
            fmt=".2f",
            cmap="YlGnBu",
            linewidths=0.5,
            ax=ax,
        )
        ax.set_title("Correlation Heatmap")
        # Tilt the column labels so long names do not overlap
        plt.xticks(rotation=45)
        plt.show()

def on_histogram_button_click(b):
    """Handle a click on the "Residuals" button.

    Clears the shared output area and draws a 30-bin histogram of the
    module-level `residuals` into it. The button argument `b` is not used.
    """
    with output:
        output.clear_output(wait=True)
        _, ax = plt.subplots(figsize=(10, 6))
        ax.hist(residuals, bins=30, edgecolor='black')
        ax.set_title("Histogram of Residuals")
        ax.set_xlabel("Residuals")
        ax.set_ylabel("Frequency")
        plt.show()

def on_outliers_button_click(b):
    """Handle a click on the "Outliers" button.

    Clears the shared output area, draws a 2x3 grid of boxplots (five columns
    of `df`, plus 'Price' from `df_no_outliers`), then prints the frame shapes
    before and after outlier removal. The button argument `b` is not used.
    """
    with output:
        output.clear_output(wait=True)

        # (source frame, column, panel title), listed in row-major grid order
        panels = (
            (df, 'Price', 'Price Outliers'),
            (df, 'SquareFeet', 'Square Feet Outliers'),
            (df, 'Bedrooms', 'Bedroom Outliers'),
            (df, 'Bathrooms', 'Bathroom Outliers'),
            (df, 'YearBuilt', 'Year Built Outliers'),
            # Last panel shows 'Price' with the outliers removed
            (df_no_outliers, 'Price', 'Price Outliers Removed'),
        )

        fig, axs = plt.subplots(2, 3, figsize=(15, 8))
        for ax, (frame, column, _) in zip(axs.flat, panels):
            sns.boxplot(frame[column], ax=ax)
        for ax, (_, _, title) in zip(axs.flat, panels):
            ax.set_title(title)

        plt.show()
        print(f'Shape before removing price outliers: {df_shape.shape}')
        print(f'Shape after removing price outliers: {df_no_outliers_shape.shape}')

def on_rep_button_click(b):
    """Handle a click on the "Accuracy Metrics" button.

    Clears the shared output area and prints two figures into it: the mean
    absolute error of `y_pred` against `y_test`, and that error expressed as
    a percentage of the 'Price' range (max - min) in `df_no_outliers`.
    The button argument `b` is not used.
    """
    with output:
        output.clear_output(wait=True)
        mae = mean_absolute_error(y_test, y_pred)

        # Spread of the target variable after outlier removal
        prices = df_no_outliers['Price']
        range_price = prices.max() - prices.min()

        # MAE as a share of that spread, in percent
        relative_error_percentage = (mae / range_price) * 100

        print(f"Mean Absolute Error: {mae:.2f}")
        print(f"Relative Error Percentage: {relative_error_percentage:.2f}%")

# Register each button's click handler
for button, handler in (
    (predict_button, on_predict_button_click),
    (heatmap_button, on_heatmap_button_click),
    (histogram_button, on_histogram_button_click),
    (outliers_button, on_outliers_button_click),
    (rep_button, on_rep_button_click),
):
    button.on_click(handler)

# Render the inputs, then the buttons, then the shared output area
display(
    square_feet_input,
    bedrooms_input,
    bathrooms_input,
    neighborhood_input,
    year_built_input,
    predict_button,
    heatmap_button,
    histogram_button,
    outliers_button,
    rep_button,
    output,
)

IntSlider(value=2000, description='Square Feet:', max=5000)

IntSlider(value=3, description='Bedrooms:', max=10)

IntSlider(value=2, description='Bathrooms:', max=5)

Dropdown(description='Neighborhood:', options=('Rural', 'Suburb', 'Urban'), value='Rural')

IntSlider(value=2000, description='Year Built:', max=2023, min=1900)

Button(description='Predict Price', style=ButtonStyle())

Button(description='Heatmap', style=ButtonStyle())

Button(description='Residuals', style=ButtonStyle())

Button(description='Outliers', style=ButtonStyle())

Button(description='Accuracy Metrics', style=ButtonStyle())

Output()