In [1]:
# Load Data
import pandas as pd
from sklearn.preprocessing import StandardScaler

csv_file_path = '/Users/macbookpro/Desktop/COMP 562/(Use this) housing_price_dataset.csv' 
label_data = pd.read_csv(csv_file_path)

# Get labels and variables
labels = label_data.iloc[0:50001, 5].values
print(labels)
print(labels.shape)
variables = label_data.iloc[0:50001, 0:5]
print(variables)
print(variables.shape)

# Calculate means and standard deviations
means = variables.mean()
std_devs = variables.std()

print("Means:")
print(means)

print("\nStandard Deviations:")
print(std_devs)

ModuleNotFoundError: No module named 'sklearn'

In [2]:
# Linear Regression

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# train/test split
X_train, X_test, y_train, y_test = train_test_split(variables, labels, test_size=0.3, random_state=42)

# create linear regression model
linear_regressor = LinearRegression()

# evaluation
linear_regressor.fit(X_train, y_train)
y_pred = linear_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Test Mean Square Error: {mse}")
print(f"Test R² score: {r2}")

Test Mean Square Error: 2468771544.275607
Test R² score: 0.5728435816569826


In [3]:
# LASSO
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score

# train/test split
X_train, X_test, y_train, y_test = train_test_split(variables, labels, test_size=0.3, random_state=42)

# define param_grid parameters for LASSO
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10]
}

# create LASSO regressor
lasso_regressor = Lasso()

# use GridSearchCV to test all parameters
grid_search = GridSearchCV(lasso_regressor, param_grid, cv=2, scoring='neg_mean_squared_error')
grid_search.fit(variables, labels)

# get best parameters
best_params = grid_search.best_params_
print("Best Parameters: ", best_params)

# create a new regressor with best parameters
best_lasso_regressor = Lasso(**best_params)

# calculate mse in 5-folds using LASSO
cv = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = cross_val_score(best_lasso_regressor, variables, labels, cv=cv, scoring='neg_mean_squared_error')
mean_mse = np.mean(-mse_scores)
std_mse = np.std(-mse_scores)

# calculate R² in 5 folds using LASSO
r2_scores = cross_val_score(best_lasso_regressor, variables, labels, cv=cv, scoring='r2')
mean_r2 = np.mean(r2_scores)
std_r2 = np.std(r2_scores)

print(f'5-Fold Cross Validation MSE: {mean_mse} +/- {std_mse}')
print(f'5-Fold Cross Validation R²: {mean_r2} +/- {std_r2}')

best_lasso_regressor.fit(X_train, y_train)
y_pred = best_lasso_regressor.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print test set results
print(f'Test MSE: {test_mse}')
print(f'Test R²: {test_r2}')


Best Parameters:  {'alpha': 10}
5-Fold Cross Validation MSE: 2492982836.592742 +/- 29111871.97167061
5-Fold Cross Validation R²: 0.5699567726086732 +/- 0.0040731464261042115
Test MSE: 2468745888.303739
Test R²: 0.5728480207526445


In [4]:
# Random Forest
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score

# define param_grid parameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

n_splits = 5
# create RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(variables, labels, test_size=0.3, random_state=42)

# use KFold for cross-validation
skf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# create GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=skf, scoring='neg_mean_squared_error', n_jobs=-1)

# use GridSearchCV to test all parameters
grid_search.fit(variables, labels)

# get best parameters
best_params = grid_search.best_params_
print("Best parameters: ", best_params)

# use best parameters to evaluate rf
total_mse = []
total_r2 = []

for fold, (train_index, test_index) in enumerate(skf.split(variables, labels)):
    X_train, X_test = variables.iloc[train_index], variables.iloc[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    model = RandomForestRegressor(**best_params, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    total_mse.append(mse)
    total_r2.append(r2)

# get average accuracy
average_mse = np.mean(total_mse)
std_mse = np.std(total_mse)
average_r2 = np.mean(total_r2)
std_r2 = np.std(total_r2)
print(f"5-Fold Cross Validation MSE: {average_mse} +/- {std_mse}")
print(f"5-Fold Cross Validation R²: {average_r2} +/- {std_r2}")

model = RandomForestRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Print test set results
print(f'Test MSE: {test_mse}')
print(f'Test R²: {test_r2}')

Best parameters:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
5-Fold Cross Validation MSE: 2526867451.760862 +/- 27138793.35331006
5-Fold Cross Validation R²: 0.5641104831619396 +/- 0.0037409341851049856
Test MSE: 2532594262.239625
Test R²: 0.5639290760524656
