In [1]:
import pandas as pd
import numpy as np
import joblib
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer

In [2]:
sklearn.__version__

'1.3.2'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

slo_re = pd.read_excel('/content/drive/My Drive/SLO_RE_Clean.xlsx')

Mounted at /content/drive


In [4]:
slo_re_anova = slo_re.dropna(subset=['Lot Size', 'Latitude', 'Longitude', 'Year Built', 'New Construction', 'Attached Garage',
                                     'Stories', 'Garage Spaces', 'HOA Fee', 'Lot Size', 'Square Footage', 'Baths', 'Beds',
                                     'Structure Type', 'List Price', 'Days On Market'])

def days_mkt_category_binary(days):
  if days <= 45:
    return 'average'
  else:
    return 'slow'

slo_re_anova['Days Mkt Category Binary'] = slo_re_anova['Days On Market'].apply(days_mkt_category_binary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slo_re_anova['Days Mkt Category Binary'] = slo_re_anova['Days On Market'].apply(days_mkt_category_binary)


In [5]:
slo_re_anova['Structure Type'].unique()

array(['House', 'Triplex', 'Multi-Family'], dtype=object)

In [9]:
X = slo_re_anova[['Lot Size', 'List Price', 'Baths', 'Structure Type', 'Beds', 'Square Footage', 'HOA Fee', 'Stories', 'Attached Garage', 'New Construction']]
y = slo_re_anova['Days Mkt Category Binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Define the column transformer and pipeline
ct = make_column_transformer(
    (StandardScaler(), ['Lot Size', 'List Price', 'Square Footage', 'HOA Fee']),  # Scale numerical features
    (OneHotEncoder(handle_unknown="ignore"), ['Baths', 'Structure Type', 'Beds', 'Stories', 'Attached Garage', 'New Construction']),  # Encode categorical features
    remainder="passthrough"
)

pipeline = make_pipeline(
    ct,
    RandomForestClassifier(n_estimators= 55, random_state=42)  # Random Forest model with default parameters
)


class_RFModel = pipeline.fit(X=X_train,
             y=y_train)

slo_re_anova['Predicted DOM'] = class_RFModel.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slo_re_anova['Predicted DOM'] = class_RFModel.predict(X)


In [10]:
slo_re_anova.to_excel('slo_re_anova.xlsx')

In [7]:
class_RFModel.predict(pd.DataFrame([[4000, 729000, 1, 'House', 2, 900, 0, 1, 'No', 0]],
                                columns=['Lot Size', 'List Price', 'Baths', 'Structure Type', 'Beds', 'Square Footage', 'HOA Fee', 'Stories', 'Attached Garage', 'New Construction']))

array(['average'], dtype=object)

In [8]:
joblib.dump(class_RFModel, 'model.pkl')

['model.pkl']