<a href="https://colab.research.google.com/github/brendanpshea/data-science/blob/main/MakeShireHouseData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from scipy.stats import norm, gamma
import statsmodels.formula.api as smf

# Set random seed for reproducibility
np.random.seed(42)

# Number of houses
n_houses = 1000

# Generate attributes
square_footage = np.random.normal(800, 200, n_houses)
age = np.random.gamma(shape=2, scale=15, size=n_houses)
neighborhood = np.random.choice(['Hobbiton', 'Bywater', 'Tuckborough', 'Michel Delving', 'Buckland'], n_houses)
garden_size = np.random.normal(500, 100, n_houses)
distance_to_pub = np.random.gamma(shape=2, scale=0.5, size=n_houses)

# Create correlation matrix
corr_matrix = np.array([
    [1.0, -0.3, 0.0, 0.4, -0.2],
    [-0.3, 1.0, 0.0, -0.2, 0.1],
    [0.0, 0.0, 1.0, 0.0, 0.0],
    [0.4, -0.2, 0.0, 1.0, -0.1],
    [-0.2, 0.1, 0.0, -0.1, 1.0]
])

# Generate correlated data
data = np.column_stack((square_footage, age, garden_size, distance_to_pub))
data = np.random.multivariate_normal(mean=np.zeros(4), cov=corr_matrix[:4, :4], size=n_houses)

# Adjust means and scales
data[:, 0] = data[:, 0] * 200 + 800  # square_footage
data[:, 1] = np.abs(data[:, 1] * 15 + 30)  # age
data[:, 2] = data[:, 2] * 100 + 500  # garden_size
data[:, 3] = np.abs(data[:, 3] * 0.5 + 1)  # distance_to_pub

# Generate prices (in gold pieces)
coefficients = [1.5, -0.5, 0.8, 0.3, -0.2, 0.001, 0.002]
base_noise = np.random.normal(0, 200, n_houses)  # Increased standard deviation
additional_noise = np.random.exponential(scale=100, size=n_houses)  # Additional non-normal noise
total_noise = base_noise + additional_noise

# Adding polynomial and interaction terms
polynomial_term = data[:, 0] ** 2
interaction_term = data[:, 0] * data[:, 2]

prices = (
    coefficients[0] * data[:, 0] +
    coefficients[1] * data[:, 1] +
    coefficients[2] * data[:, 2] +
    coefficients[3] * data[:, 3] +
    coefficients[4] * (neighborhood == 'Hobbiton') * 100 +
    coefficients[5] * polynomial_term +
    coefficients[6] * interaction_term +
    total_noise
)
prices = np.abs(prices * 0.5 + 1000)

df = pd.DataFrame({
    'SquareFootage': data[:, 0].astype(int),
    'Age': data[:, 1].astype(int),
    'Neighborhood': neighborhood,
    'GardenSize': data[:, 2].astype(int),
    'DistanceToPub': data[:, 3].round(1),
    'Price': prices.astype(int)
})

# Save to CSV
df.to_csv('shire_house_prices.csv', index=False)

print("CSV file 'shire_house_prices.csv' has been generated.")
df.describe().round(2)

CSV file 'shire_house_prices.csv' has been generated.


Unnamed: 0,SquareFootage,Age,GardenSize,DistanceToPub,Price
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,798.65,29.84,498.57,1.02,2576.99
std,201.64,14.33,100.55,0.49,444.37
min,168.0,0.0,158.0,0.0,1306.0
25%,667.0,20.0,431.5,0.7,2279.0
50%,803.5,30.0,499.0,1.0,2567.0
75%,934.0,40.0,568.0,1.3,2847.5
max,1363.0,76.0,792.0,2.8,4251.0


In [5]:
import numpy as np
import pandas as pd
from scipy.stats import norm, gamma
import statsmodels.formula.api as smf

# Fit the OLS model using R-like formula
model = smf.ols(formula='Price ~ SquareFootage + I(SquareFootage**2) + Age + Neighborhood + GardenSize + DistanceToPub + SquareFootage:GardenSize', data=df).fit()

model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.935
Model:,OLS,Adj. R-squared:,0.935
Method:,Least Squares,F-statistic:,1432.0
Date:,"Wed, 10 Jul 2024",Prob (F-statistic):,0.0
Time:,01:48:06,Log-Likelihood:,-6145.5
No. Observations:,1000,AIC:,12310.0
Df Residuals:,989,BIC:,12370.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1219.2857,83.298,14.638,0.000,1055.824,1382.748
Neighborhood[T.Bywater],-4.3297,11.387,-0.380,0.704,-26.675,18.015
Neighborhood[T.Hobbiton],-4.6007,11.567,-0.398,0.691,-27.299,18.097
Neighborhood[T.Michel Delving],-5.1798,11.138,-0.465,0.642,-27.036,16.677
Neighborhood[T.Tuckborough],0.3382,11.309,0.030,0.976,-21.855,22.531
SquareFootage,0.5236,0.133,3.942,0.000,0.263,0.784
I(SquareFootage ** 2),0.0005,6.43e-05,7.819,0.000,0.000,0.001
Age,0.1157,0.266,0.435,0.663,-0.406,0.637
GardenSize,0.0215,0.145,0.148,0.882,-0.263,0.306

0,1,2,3
Omnibus:,16.599,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.006
Skew:,0.317,Prob(JB):,0.000203
Kurtosis:,3.085,Cond. No.,19800000.0
