In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Resolve the dataset location.
#   1. If the NYC_ROLLING_SALES_CSV environment variable is set, it is used as-is,
#      so a wrong override fails loudly in read_csv instead of being ignored.
#   2. Otherwise the original absolute path is used when that file exists, which
#      keeps the notebook working unchanged on the machine it was written on.
#   3. Otherwise fall back to a copy of the CSV in the current working directory.
csv_override = os.environ.get('NYC_ROLLING_SALES_CSV')
if csv_override:
    file_path = csv_override
else:
    original_csv_path = r'C:\Users\burci\OneDrive\Masaüstü\Soostone Data Science Assignment\nyc-rolling-sales.csv'
    file_path = original_csv_path if os.path.isfile(original_csv_path) else 'nyc-rolling-sales.csv'

# Load the raw data; pd.read_csv raises FileNotFoundError if file_path does not exist.
data = pd.read_csv(file_path)

# Remove columns that are not used as model inputs.
columns_to_drop = ['Unnamed: 0', 'EASE-MENT', 'SALE DATE', 'ADDRESS', 'APARTMENT NUMBER']
data_cleaned = data.drop(columns=columns_to_drop)

# Convert 'SALE PRICE' to numeric; entries that cannot be parsed become NaN.
data_cleaned['SALE PRICE'] = pd.to_numeric(data_cleaned['SALE PRICE'], errors='coerce')

# Keep only rows with a strictly positive sale price. `NaN > 0` is False, so this
# single filter also removes the rows whose price failed to parse.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
data_cleaned = data_cleaned[data_cleaned['SALE PRICE'] > 0].copy()

# Convert the square-footage columns to numeric.
# Casting to str first keeps the .str accessor valid even if pandas parsed a
# column (or part of a mixed-type column) as numbers; thousands separators are
# stripped literally and anything unparseable becomes NaN.
# Missing values are then filled with zero.
# NOTE(review): zero-filling makes a missing area indistinguishable from a real
# 0 sq ft value; a different imputation may suit the model better, but it would
# change the fitted results, so it is left as-is here.
for sqft_col in ['LAND SQUARE FEET', 'GROSS SQUARE FEET']:
    data_cleaned[sqft_col] = pd.to_numeric(
        data_cleaned[sqft_col].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    ).fillna(0)

# Convert categorical columns to category dtype so that get_dummies encodes them,
# including any that were read from the CSV as numbers.
categorical_columns = ['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'TAX CLASS AT PRESENT',
                       'BUILDING CLASS AT PRESENT', 'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE']
for col in categorical_columns:
    data_cleaned[col] = data_cleaned[col].astype('category')

# One-hot encode categorical variables; drop_first=True omits the first level of
# each variable so the indicator columns of one variable do not sum to a constant.
data_encoded = pd.get_dummies(data_cleaned, drop_first=True)

# The sale price is the regression target; every other encoded column is a predictor.
y = data_encoded['SALE PRICE']
X = data_encoded.drop('SALE PRICE', axis=1)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training rows (fit returns the estimator itself),
# then predict sale prices for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions: mean absolute error, root mean squared error
# (square root of the MSE), and the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the evaluation metrics
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R²: {r2}')

# Tabulate the fitted coefficients, one row per feature, largest first.
# These are raw coefficients of features that were not scaled before fitting,
# so their magnitudes are not directly comparable across features.
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
coefficients_sorted = coefficients.sort_values('Coefficient', ascending=False)
print(coefficients_sorted)


MAE: 1221071.4341685264
RMSE: 15877117.296727758
R²: 0.41256022611950094
                                    Coefficient
NEIGHBORHOOD_BLOOMFIELD            1.929278e+08
BUILDING CLASS AT PRESENT_C5       7.118654e+07
BUILDING CLASS AT PRESENT_C7       6.644649e+07
BUILDING CLASS AT TIME OF SALE_O4  6.445091e+07
BUILDING CLASS AT PRESENT_O4       6.445091e+07
...                                         ...
BUILDING CLASS AT TIME OF SALE_C5 -5.341390e+07
BUILDING CLASS AT TIME OF SALE_HR -6.098554e+07
BUILDING CLASS AT TIME OF SALE_H4 -6.219597e+07
BUILDING CLASS AT PRESENT_R5      -7.661691e+07
BUILDING CLASS AT PRESENT_RB      -1.020037e+08

[630 rows x 1 columns]
