In [1]:
#Imports
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Open the SQLite database file that holds the housing data.
# pandas runs its query directly against this connection, so no separate
# cursor object is created here (the previous one was never used or closed).
# The connection stays open until the notebook's final cell closes it.
conn = sqlite3.connect('../Data/new_database.db')

In [3]:
# Pull the modelling columns from the RhodeIsland table into a DataFrame.
# The column list is kept separate so the SELECT statement is easy to audit.
selected_columns = [
    'bed', 'bath', 'acre_lot', 'house_size', 'sold_previously',
    'city', 'state', 'zip_code', 'price',
]
query = 'SELECT ' + ', '.join(selected_columns) + ' FROM RhodeIsland'
housing_df = pd.read_sql_query(sql=query, con=conn)

In [4]:
# Label-encode the categorical columns: each distinct city / zip code value is
# replaced by an integer code. NOTE: this is label encoding, not one-hot
# encoding, so the regression below treats the codes as numeric magnitudes.
# The fitted encoders are kept so new rows can be transformed the same way.
city_encoder = LabelEncoder()
city_df = city_encoder.fit_transform(housing_df['city'])
zip_encoder = LabelEncoder()
zip_df = zip_encoder.fit_transform(housing_df['zip_code'])

# Build the encoded columns on housing_df's own index so that the axis=1
# concat below stays row-aligned even if that index is not the default 0..n-1
# (a mismatched index would otherwise introduce NaN rows silently).
city_zip_df = pd.DataFrame(
    {"city_encoded": city_df, "zip_encoded": zip_df},
    index=housing_df.index,
)

# Feature matrix: numeric columns followed by the two encoded columns.
# The column order matters: prediction inputs must use the same order.
numeric_columns = ['bed', 'bath', 'acre_lot', 'house_size', 'sold_previously']
X = pd.concat([housing_df[numeric_columns], city_zip_df], axis=1)

# Target: listing price.
y = housing_df['price']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Sanity check: shape of the price target, i.e. how many rows were loaded
y.shape

(29596,)

In [6]:
# Fit a linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out test rows
y_pred = model.predict(X_test)

# Evaluation metrics on the test split:
#   MAE  - mean absolute difference between actual and predicted price
#   RMSE - square root of the mean squared difference
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")

Mean Absolute Error: 222948.88217988083
Root Mean Squared Error: 404085.64829403156


In [7]:
# A single hypothetical listing to score.
# NOTE(review): zip_code is given as the string '2838.0' - presumably this
# matches how zip codes are stored in the source table; the encoder will
# raise on a value it did not see during fitting.
sample_listing = {
    'bed': 3,
    'bath': 2,
    'acre_lot': 1.0,
    'house_size': 1500,
    'sold_previously': 0,
    'city': 'Lincoln',
    'zip_code': '2838.0',
}
raw_listing = pd.DataFrame([sample_listing])

# Convert the categorical values with the encoders fitted on the training data
city_code = city_encoder.transform(raw_listing['city'])[0]
zip_code_encoded = zip_encoder.transform(raw_listing['zip_code'])[0]

# Swap the raw categorical columns for their encoded counterparts; the
# resulting column order matches the feature matrix the model was fitted on.
new_data = (
    raw_listing
    .drop(columns=['city', 'zip_code'])
    .assign(city_encoded=city_code, zip_encoded=zip_code_encoded)
)

# Predict the price for the new listing
predicted_price = model.predict(new_data)

print(f"Predicted Price: ${predicted_price[0]}")

Predicted Price: $494878.33739111526


In [8]:
# Release the SQLite connection opened at the start of the notebook
conn.close()