<a href="https://colab.research.google.com/github/ZacharyFry1/DD-Science-Cohort15/blob/main/Project_2_Version7_RMSPE_14_27.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 2 - Housing Prices

## Problem Definition


The goal of this project is to design a linear regression model with the smallest amount of RMS percentage error that accurately predicts the sale price of the house. The sale price is the target, making this is a supervised problem. The regression model will be unidimensional because we only care about how different features affect the home's sale price.

## Data Collection/Sources


### Imports

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

### Reading In Data

In [None]:
url = 'https://ddc-datascience.s3.amazonaws.com/Projects/Project.2-Housing/Data/Housing.Data.csv'

In [None]:
housing_df = pd.read_csv(url)
housing_df

## Data Cleaning


### Dropping Null Values

In [None]:
housing_df.isnull().sum().sum()

In [None]:
null_sums = housing_df.isna().sum()
null_sums[null_sums > 0]

Creating a Threshold For Null Values. All columns above this about will be dropped.

In [None]:
housing_df_clean = housing_df.copy()

In [None]:
# Threshold for non-nulls.
threshold = 10  # Set your threshold
null_percentages = housing_df_clean.isnull().sum() / len(housing_df_clean) * 100
columns_to_keep = null_percentages[null_percentages <= threshold].index
housing_df_clean = housing_df_clean[columns_to_keep]

In [None]:
housing_df_clean = housing_df_clean.dropna()

In [None]:
housing_df_clean = housing_df_clean.drop('PID', axis = 1)

In [None]:
housing_df_clean.isna().sum().sum()

### Creating a DataFrame For the Categorical Columns

In [None]:
housing_df_categorical_cols = housing_df_clean.select_dtypes(include=['object']).columns
housing_df_categorical_cols

### Creating Numerical DataFrame

In [None]:
housing_df_numerical = housing_df_clean.select_dtypes(exclude=['object'])

In [None]:
housing_df_numerical

### Checking The Target Column

In [None]:
housing_df_clean['SalePrice'].value_counts()
housing_df_clean['SalePrice']

## Exploratory Data Analysis


In [None]:
# Let's take a look at a correlation plot
plt.figure(figsize=(30,30))
correlation_matrix = housing_df_numerical.corr().round(2)
sns.heatmap(data=correlation_matrix, annot=True) ;

In [None]:
corrs = housing_df_numerical.corr().round(2).unstack().abs()
corrs = corrs[corrs < 1]
corrs.sort_values(ascending = False)[::2]

## Processing


### One Hot Encoding

In [None]:
housing_df_categorical_cols = housing_df_clean.select_dtypes(include=['object']).columns

In [None]:
housing_df_encoded = pd.get_dummies(housing_df_categorical_cols)

In [None]:
housing_df_encoded.isna().sum().sum()

### Combining the Numerical and Encoded DataFrames

In [None]:
combined_housing_df =  pd.concat([housing_df_numerical, housing_df_encoded], axis = 1)

Fill the NA's with the means of each column.

In [None]:
column_means = combined_housing_df.mean()
combined_housing_df = combined_housing_df.fillna(column_means)

In [None]:
# First we will break up our data into training and testing sets
X = combined_housing_df.drop('SalePrice', axis = 1)
y = combined_housing_df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4)

In [None]:
# Fit a linear model using Sklearn
model = LinearRegression()
combined_housing_df_fit_SK = model.fit(X_train, y_train)

(combined_housing_df_fit_SK.intercept_, combined_housing_df_fit_SK.coef_ )


In [None]:
y_pred = combined_housing_df_fit_SK.predict(X_test)
list(zip(y_test, y_pred))[:5]

### RMSPE: 14.27

In [140]:
 rmspe = (np. sqrt(np. mean(np. square((y_test - y_pred) / y_test)))) * 100
 rmspe

14.278548833445894

### Filtering Data/ Feature Selection


In [None]:
housing_df_numerical.corr()['SalePrice'].abs().sort_values(ascending = False)

In [None]:
housing_df_numerical.info()

In [None]:
housing_df_numerical.corr()['SalePrice'].abs().sort_values(ascending = False).plot(kind = 'bar', figsize = (10,5)) ;

In [None]:
corrs = housing_df_numerical.corr()['SalePrice'].abs().sort_values(ascending = False)
keep = corrs[(corrs>.2) & (corrs <1)]
keep

In [None]:
X_corr = X[keep.index]
X_corr.info()

In [None]:
X = pd.concat([X_corr, housing_df_encoded], axis = 1)
X.info()

In [None]:
X = combined_housing_df.drop('SalePrice', axis = 1)
y = combined_housing_df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4)


model = LinearRegression()
combined_housing_df_fit_SK = model.fit(X_train, y_train)

(combined_housing_df_fit_SK.intercept_, combined_housing_df_fit_SK.coef_ )


In [None]:
y_pred = combined_housing_df_fit_SK.predict(X_test)
list(zip(y_test, y_pred))[:5]

In [None]:
 rmspe = (np. sqrt(np. mean(np. square((y_test - y_pred) / y_test)))) * 100
 rmspe

## Data Visualization/Communication of Results


In [None]:
y_pred = combined_housing_df_fit_SK.predict(X_test)
plt.scatter(y_test, y_pred)
plt.plot([min(y_pred), max(y_pred)],[min(y_pred), max(y_pred)], c='red')
plt.xlabel('Actual Sales Price')
plt.ylabel('Predicted Sales Price') ;

**QQ PLOT**

In [None]:
res = y_test - y_pred
sm.qqplot(res, fit=True, line="45") ;

In [None]:
# Residuals vs Fitted (Predicted) Values - Constant variance
plt.figure(figsize = (10,6))
plt.scatter(y_pred, res)
plt.xlabel("Fitted")
plt.ylabel("Residuals")
plt.hlines(0, min(y_pred), max(y_pred), colors = 'red', linestyles = 'dashed') ;

In [None]:
# Residuals vs Time - indepedence
plt.figure(figsize = (10,6))
plt.scatter(range(len(res)), res)
plt.plot(range(len(res)), res, 'b')
plt.xlabel("Time")
plt.ylabel("Residuals")
plt.hlines(0, min(range(len(res))), max(range(len(res))), colors = 'red', linestyles = 'dashed') ;

### SUMMARY


Dropped all null values. One hot encoded the categorical columns. Dropped columns where the non-null values where above 15% of the data. Replaced all 0 values with the mean of each column. RSMPE value of 14.24.