# Model building for house price index

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [40]:
# Read the house-price index CSV into a DataFrame and preview its first five rows
df = pd.read_csv("CSUSHPISA.csv")
df.head(5)

Unnamed: 0,DATE,Year,CSUSHPISA,CSUSHPISA Average,GDP growth,Unemployment rates,inflation,Interest Rates,Population growth,net migration rate
0,1/1/2003,2003,128.461,133.731333,0.03,0.06,0.02,0.04,0.01,3.7
1,2/1/2003,2003,129.355,,,,,,,
2,3/1/2003,2003,130.148,,,,,,,
3,4/1/2003,2003,130.884,,,,,,,
4,5/1/2003,2003,131.735,,,,,,,


In [41]:
# Drop the monthly columns that are not needed for modelling, then discard
# every row that still contains a missing value (this leaves one row per year).
df = df.drop(columns=['DATE', 'CSUSHPISA']).dropna()
df

Unnamed: 0,Year,CSUSHPISA Average,GDP growth,Unemployment rates,inflation,Interest Rates,Population growth,net migration rate
0,2003,133.731333,0.03,0.06,0.02,0.04,0.01,3.7
12,2004,150.44025,0.04,0.06,0.03,0.04,0.01,3.679
24,2005,171.737,0.03,0.05,0.03,0.04,0.01,3.658
36,2006,183.4475,0.03,0.05,0.03,0.05,0.01,3.637
48,2007,179.918917,0.02,0.05,0.03,0.05,0.01,3.616
60,2008,164.057417,0.0,0.06,0.04,0.04,0.01,3.595
72,2009,148.545083,-0.03,0.09,0.0,0.03,0.01,3.506
84,2010,144.6745,0.03,0.1,0.02,0.03,0.01,3.417
96,2011,139.2595,0.02,0.09,0.03,0.03,0.01,3.329
108,2012,140.993917,0.02,0.08,0.02,0.02,0.01,3.24


In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,Year,CSUSHPISA Average,GDP growth,Unemployment rates,inflation,Interest Rates,Population growth,net migration rate
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,2013.0,186.4392,0.021429,0.059524,0.025238,0.03,0.008571,3.224714
std,6.204837,48.718257,0.021044,0.018568,0.017782,0.010954,0.003586,0.337434
min,2003.0,133.731333,-0.03,0.04,0.0,0.01,0.0,2.748
25%,2008.0,150.44025,0.02,0.05,0.02,0.02,0.01,2.929
50%,2013.0,172.181833,0.02,0.05,0.02,0.03,0.01,3.151
75%,2018.0,202.47775,0.03,0.07,0.03,0.04,0.01,3.595
max,2023.0,302.066875,0.06,0.1,0.08,0.05,0.01,3.7


In [43]:
# Show the column labels that remain in df
df.columns

Index(['Year', 'CSUSHPISA Average', 'GDP growth', 'Unemployment rates',
       'inflation', 'Interest Rates', 'Population growth',
       'net migration rate'],
      dtype='object')

In [44]:
# Feature matrix: the six indicator columns used as predictors
X = df[[
    'GDP growth',
    'Unemployment rates',
    'Interest Rates',
    'inflation',
    'Population growth',
    'net migration rate',
]]

# Target: the CSUSHPISA Average column
y = df['CSUSHPISA Average']

# Multiple Regression 

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit an ordinary least-squares model on the training rows
# (fit() returns the estimator itself, so it can be chained)
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 740.4331121617663
R-squared: 0.7871006072075525


# Polynomial Regression

In [48]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into degree-2 polynomial features
# (change `degree` to try higher-order terms)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Hold out 20% of the expanded rows for evaluation, using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=0
)

# A linear model on polynomial features is a polynomial regression
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 146.31769053615875
R-squared: 0.9579287487846225


# Decision Tree

In [47]:
from sklearn.tree import DecisionTreeRegressor

# Split the raw (non-polynomial) features: 20% held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create and train the decision tree regression model.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits are broken randomly
# and the fitted tree (and the scores below) can change from run to run.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 186.67170871758327
R-squared: 0.9463256129625665


Achieving high R-squared values on the held-out test set, particularly in the range of 0.94 to 0.96, is a strong indication that the models are effectively explaining the variation in home prices.

The substantial increase in R-squared for both the decision tree (about 0.95) and polynomial regression (about 0.96) models over the multiple linear regression baseline (about 0.79) suggests that they provide a considerably more accurate representation of the relationships between the included factors and home prices.

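Both the decision tree and polynomial regression models have performed impressively. Note, however, that these scores come from a single train/test split of only 21 yearly observations (about five test rows), so they should be confirmed with cross-validation before drawing firm conclusions.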