In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv('/content/kc_house_data.csv') # Use read_csv to read the CSV file

# Create the DataFrame using the appropriate columns
df = pd.DataFrame(data)
# Set the target variable
df['MedHouseVal'] = data['price'] # Set the target variable to 'price'

# Convert the 'date' column to datetime objects
df['date'] = pd.to_datetime(df['date'])

# Extract numerical features from the 'date' column
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Drop the original 'date' column
df = df.drop('date', axis=1)

# Split the data into features and target
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_test_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_test_pred)
print(f'Testing MSE: {mse}')

# Predict the price of a new house
# Ensure new_house has the same number of features as X_train
# The previous new_house array was missing a feature (likely 'sqft_living15').
# I have added a placeholder value of 1690 (you should replace this with the actual value)
new_house = np.array([[8, 2014, 10, 13, 4, 2.50, 2570, 7242, 2.0, 0, 0, 4, 4, 2570, 880, 1960, 0, 98028, 47.7210, -122.319, 1690, 1690, 7639]])
predicted_price = model.predict(new_house)
print(f'Predicted price for the new house: ${predicted_price[0]:.2f}')

Testing MSE: 3.8620528950017814e-21
Predicted price for the new house: $2014.00




In [5]:
from sklearn.metrics import mean_squared_error, r2_score

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Calculate R-squared (R2)
r2 = r2_score(y_test, y_test_pred)
print(f'R-squared: {r2:.2f}')

Mean Squared Error: 0.00
R-squared: 1.00
