In [13]:
import pandas as pd

# Load the cleaned FY2023 property-assessment CSV into a DataFrame.
# The path is relative to the notebook's working directory.
parcel_data = pd.read_csv(
    '../data/cleaned/fy2023-property-assessment-data-cleaned.csv'
)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Expects the assessment data in a DataFrame named 'parcel_data'.
# TOTAL_VALUE is the regression target; every other column is a predictor.
features = parcel_data.drop('TOTAL_VALUE', axis=1)
y = parcel_data['TOTAL_VALUE']

# Object-dtype columns are one-hot encoded; all remaining columns are passed
# through to the model unchanged.
categorical_cols = [col for col in features.columns if features[col].dtype == 'object']
numerical_cols = [col for col in features.columns if features[col].dtype != 'object']

# Split the raw rows FIRST (30% held out for testing) so that the encoder
# below is fit on training rows only and the test rows cannot leak into the
# fitted preprocessing state.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42
)

# Create the preprocessing pipelines for both numeric and categorical data.
# handle_unknown='ignore' encodes a category that appears only in the held-out
# rows as an all-zero group instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Fit on the training rows only, then apply the fitted transform to the test rows.
X_train = preprocessor.fit_transform(features_train)
X_test = preprocessor.transform(features_test)

# Keep 'X' bound to the encoded full feature matrix for any later cell that
# reads it; it is produced by the train-fitted preprocessor.
X = preprocessor.transform(features)

In [15]:
# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm

# Polynomial degrees to compare on the same train/test split.
degrees = [1, 2]
for degree in tqdm(degrees):
    # Create polynomial features up to the specified degree. The expansion is
    # fit on the training matrix and only applied to the test matrix.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    # Ordinary least-squares fit on the expanded training features.
    lr_model = LinearRegression()
    lr_model.fit(X_train_poly, y_train)

    # Predict the target for the held-out rows.
    y_pred = lr_model.predict(X_test_poly)

    # Evaluate once and reuse the stored values when reporting, rather than
    # recomputing both metrics inside the print calls.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print('Degree:', degree)
    print('R2 score:', r2)
    print('Mean squared error:', mse)


 50%|█████     | 1/2 [00:00<00:00,  2.15it/s]

Degree: 1
R2 score: 0.6483626255654729
Mean squared error: 106284551387945.47


100%|██████████| 2/2 [00:05<00:00,  2.96s/it]

Degree: 2
R2 score: 0.49385947610991077
Mean squared error: 152984075163870.66



