In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score, mean_absolute_percentage_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import KFold
pd.set_option('display.max_rows', None)


# 1. Data Analyze


In [2]:
df=pd.read_csv('/content/complete_cleaned_data (1).csv')
print(df.shape)
df.head()

(47848, 309)


Unnamed: 0,ClosePrice,ViewYN,PoolPrivateYN,Latitude,Longitude,LivingArea,AttachedGarageYN,ParkingTotal,BathroomsTotalInteger,BedroomsTotal,...,District__West Covina Unified School District,District__Western Placer Unified School District,District__Willits Unified School District,District__Willows Unified School District,District__Windsor Unified School District,District__Wiseburn Unified School District,District__Woodland Joint Unified School District,District__Yosemite Unified School District,District__Yuba City Unified School District,District__Yucaipa-Calimesa Joint Unified School District
0,890000.0,1,0,34.180411,-118.34202,1434.0,0,1,1.0,3.0,...,False,False,False,False,False,False,False,False,False,False
1,1138000.0,0,0,32.574359,-117.023836,2872.0,1,6,3.0,5.0,...,False,False,False,False,False,False,False,False,False,False
2,681877.0,1,0,33.72508,-117.222302,2824.0,1,2,3.0,5.0,...,False,False,False,False,False,False,False,False,False,False
3,900000.0,1,0,34.203479,-118.643567,2500.0,1,2,3.0,5.0,...,False,False,False,False,False,False,False,False,False,False
4,862000.0,0,0,34.460368,-118.490755,2363.0,1,2,3.0,5.0,...,False,False,False,False,False,False,False,False,False,False


Let's identify what data features we have.

In [3]:
df.dtypes

Unnamed: 0,0
ClosePrice,float64
ViewYN,int64
PoolPrivateYN,int64
Latitude,float64
Longitude,float64
LivingArea,float64
AttachedGarageYN,int64
ParkingTotal,int64
BathroomsTotalInteger,float64
BedroomsTotal,float64


We can identify there are many encoding features of **District** and **Flooring**. We will reverse those features to identify any implicit and explicit missing values.

In [4]:
flooring= []

for col in df.columns:
  if col.startswith("Flooring_"):
    flooring.append(col)

df["FlooringType"]=df[flooring].apply(lambda x: x.idxmax(), axis=1)
df["FlooringType"] = df["FlooringType"].str.replace("Flooring_", "")

df=df.drop(columns=flooring)

In [5]:
District = []

for col in df.columns:
  if col.startswith("District_"):
    District.append(col)

df["SchoolDistrict"]=df[District].apply(lambda x: x.idxmax(), axis=1)
df["SchoolDistrict"] = df["SchoolDistrict"].str.replace("Flooring_", "")

df=df.drop(columns=District)

In [6]:
df.dtypes

Unnamed: 0,0
ClosePrice,float64
ViewYN,int64
PoolPrivateYN,int64
Latitude,float64
Longitude,float64
LivingArea,float64
AttachedGarageYN,int64
ParkingTotal,int64
BathroomsTotalInteger,float64
BedroomsTotal,float64


# 2. Data Cleaning

We will identify any missing values in this dataset.

In [7]:
df.isna().sum()

Unnamed: 0,0
ClosePrice,0
ViewYN,0
PoolPrivateYN,0
Latitude,0
Longitude,0
LivingArea,0
AttachedGarageYN,0
ParkingTotal,0
BathroomsTotalInteger,0
BedroomsTotal,0


Due to data cleaning part of **Age** from Tara, we will drop that 11 explicit missing values.

In [8]:
df.dropna(subset=["Age"], inplace=True)
df.isna().sum()

Unnamed: 0,0
ClosePrice,0
ViewYN,0
PoolPrivateYN,0
Latitude,0
Longitude,0
LivingArea,0
AttachedGarageYN,0
ParkingTotal,0
BathroomsTotalInteger,0
BedroomsTotal,0


There are no explicit missing values, now we will check if there is implicit missing values. All the numerical features are not object that there is no implicit missing values. Let's check the categorical features.

In [9]:
df["SchoolDistrict"].value_counts()

Unnamed: 0_level_0,count
SchoolDistrict,Unnamed: 1_level_1
District__Unknown,11385
District__Los Angeles Unified School District,4097
District__San Diego City Unified School District,1226
District__Capistrano Unified School District,994
District__Desert Sands Unified School District,963
District__Corona-Norco Unified School District,795
District__Palm Springs Unified School District,729
District__Riverside Unified School District,713
District__Hemet Unified School District,673
District__Long Beach Unified School District,628


In [10]:
#Take off "District_"
df["SchoolDistrict"] = df["SchoolDistrict"].str.replace("District__", '')

count_district=df["SchoolDistrict"].value_counts()
count_district

Unnamed: 0_level_0,count
SchoolDistrict,Unnamed: 1_level_1
Unknown,11385
Los Angeles Unified School District,4097
San Diego City Unified School District,1226
Capistrano Unified School District,994
Desert Sands Unified School District,963
Corona-Norco Unified School District,795
Palm Springs Unified School District,729
Riverside Unified School District,713
Hemet Unified School District,673
Long Beach Unified School District,628


There are too many unique value in school district that would potentionally lead overfitting and noise for machine learning performance. We will group it into **Top Tier District**, **Mid Tier District**, **Low Tier District**, and **Rare District** without including Unknown.

In [14]:
#call all unique values without "Unknown"
count_district = df[df["SchoolDistrict"] != "Unknown"]["SchoolDistrict"].value_counts()
count_district

#Threshold for grouping the unique values
top = count_district.quantile(0.9)
mid= count_district.quantile(0.7)
low= count_district.quantile(0.5)

#Call out the grouping names including threshold into 4 different groups
def categorize_district(district):
  if district == "Unknown":
    return "Unknown"

  count = count_district[district]
  if count >= top:
    return "Top Tier District"
  elif count >= mid:
    return "Mid Tier District"
  elif count >= low:
    return "Low Tier District"
  else:
    return "Rare District"

#Apply the function
df["SchoolDistrict"] = df["SchoolDistrict"].apply(categorize_district)

#Check
df["SchoolDistrict"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
SchoolDistrict,Unnamed: 1_level_1
Top Tier District,0.404018
Low Tier District,0.241508
Unknown,0.237996
Rare District,0.116479


Let's identify the **Flooring Type** features.

In [13]:
df["FlooringType"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
FlooringType,Unnamed: 1_level_1
Unknown,0.368522
Carpet,0.302088
Laminate,0.105609
Tile,0.092
Wood,0.052783
Vinyl,0.043126
SeeRemarks,0.016243
Stone,0.009031
Concrete,0.005832
Bamboo,0.003867


There are implicit missing values in **Flooring Type** as well. We will use Random Forest Classifier to predict the implicit missing values of **Flooring Type** and **School District**.

# 3. Random Forest Classifier

In [None]:
#Call out the copy for test and replace it to original data
df2 = df.copy()

#Call predictors for Flooring Type
predictors=["ClosePrice", "LivingArea", "BathroomsTotalInteger", "BathroomsTotalInteger", "GarageSpaces", "Longitude", "Latitude"]

le = LabelEncoder()

#Change Unknown values into explicit missing value (Since we know there are no explicit missing values, it is okay to use it.)
df2["FlooringType"]= df2["FlooringType"].replace("Unknown", np.nan)

#Divide into no explicit missing values and missing values(Unknown).
known = df2[df2["FlooringType"].notna()]
unknown=df2[df2["FlooringType"].isna()]

#Change into numerical form
known["FlooringType"]=le.fit_transform(known["FlooringType"])


x_train,x_test,y_train,y_test=train_test_split(known[predictors], known["FlooringType"], test_size=0.2, random_state=42)

#Randomforest perform predicting Flooring Type missing values using predictors of no explicit missing values of Flooring Type.
clf=RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

y_pred=clf.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)

#Predict the missing values of Flooring Type
predict_flooring=clf.predict(unknown[predictors])

#Reverse numerical into categorical feature.
predict_flooring_labels=le.inverse_transform(predict_flooring)

#Replace it in original data of Flooring Type of Unknown
df.loc[df["FlooringType"] == "Unknown", "FlooringType"]=predict_flooring_labels

#Check
accuracy,df["FlooringType"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known["FlooringType"]=le.fit_transform(known["FlooringType"])


(0.46226415094339623,
 FlooringType
 Carpet        27167
 Laminate       7554
 Tile           5618
 Wood           3277
 Vinyl          2399
 SeeRemarks      824
 Stone           461
 Concrete        292
 Bamboo          198
 Brick            47
 Name: count, dtype: int64)

In [None]:
predictors=["ClosePrice", "LivingArea", "BathroomsTotalInteger", "BathroomsTotalInteger", "GarageSpaces", "Longitude", "Latitude"]

df2["SchoolDistrict"]= df2["SchoolDistrict"].replace("Unknown", np.nan)

known = df2[df2["SchoolDistrict"].notna()]
unknown=df2[df2["SchoolDistrict"].isna()]

known["SchoolDistrict"]=le.fit_transform(known["SchoolDistrict"])

x_train,x_test,y_train,y_test=train_test_split(known[predictors], known["SchoolDistrict"], test_size=0.2, random_state=42)

clf.fit(x_train, y_train)


y_pred=clf.predict(x_test)
accuracy=accuracy_score(y_test,y_pred)

predict_sd=clf.predict(unknown[predictors])
predict_sd_labels=le.inverse_transform(predict_sd)
df.loc[df["SchoolDistrict"] == "Unknown", "SchoolDistrict"]=predict_sd_labels

accuracy,df["SchoolDistrict"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known["SchoolDistrict"]=le.fit_transform(known["SchoolDistrict"])


(0.9532300096008778,
 SchoolDistrict
 Top Tier District    23071
 Mid Tier District    15560
 Low Tier District     6441
 Rare District         2765
 Name: count, dtype: int64)

In [None]:
df_dum = pd.get_dummies(df, drop_first=True)

for i in df_dum.columns:
  print(i)

ClosePrice
ViewYN
PoolPrivateYN
Latitude
Longitude
LivingArea
AttachedGarageYN
ParkingTotal
BathroomsTotalInteger
BedroomsTotal
FireplaceYN
Stories
NewConstructionYN
GarageSpaces
LotSizeSquareFeet
SalesTaxRate
UnemploymentRate
MortgageRate30Fixed
FedInterestRate
CPI
Age
LotDensity
FlooringType_Brick
FlooringType_Carpet
FlooringType_Concrete
FlooringType_Laminate
FlooringType_SeeRemarks
FlooringType_Stone
FlooringType_Tile
FlooringType_Vinyl
FlooringType_Wood
SchoolDistrict_Mid Tier District
SchoolDistrict_Rare District
SchoolDistrict_Top Tier District


In [None]:
y=df_dum['ClosePrice']
x=df_dum.drop('ClosePrice', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=50)


In [None]:
#Call Random Forest
rfr=RandomForestRegressor(random_state=42, n_estimators=100)
rfr.fit(x_train, y_train)

y_pred=rfr.predict(x_test)


print("Train MAE:", mean_absolute_error(y_test, y_pred))
print("Train R2 score:", r2_score(y_test, y_pred))
print("Train MAPE:" , mean_absolute_percentage_error(y_test, y_pred)*100,"%")


Train MAE: 133947.2324080668
Train R2 score: 0.8653037093860331
Train MAPE: 11.962155277833801 %


In [None]:
# Call for hyper parameter tunning
param={
    'n_estimators': randint(100, 300),
    'max_depth':randint(1, 20),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 5)
}

In [None]:
#Call MAPE score
mape_score = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

#Call K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

#Train and apply K-Fold to RandomSearch Cross-Validation with scoring MAPE
rfr_cv=RandomizedSearchCV(estimator=rfr, param_distributions=param, n_iter=15, cv=kf,scoring=mape_score, n_jobs=-1)
rfr_cv.fit(x_train, y_train)

#Call the best parameter
best_param=rfr_cv.best_params_
print("Best Parameters", best_param)


Best Parameters {'max_depth': 15, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 220}


In [None]:
#Apply the best parameter to Random Forest Regressor
final_model = RandomForestRegressor(**best_param, random_state=50)
final_model.fit(x_train, y_train)

#Call both train and test prediction to identify overfitting
y_pred_train = final_model.predict(x_train)
y_pred_test = final_model.predict(x_test)

print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
print("Train R2 score:", r2_score(y_train, y_pred_train))
print("Train MAPE:" , mean_absolute_percentage_error(y_train, y_pred_train)*100,"%")
print()
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Test R2 score:", r2_score(y_test, y_pred_test))
print("Test MAPE:" , mean_absolute_percentage_error(y_test, y_pred_test)*100,"%")

Train MAE: 97148.94909393135
Train R2 score: 0.9395268555728947
Train MAPE: 9.237295799452983 %

Test MAE: 141114.5708826923
Test R2 score: 0.8559702620043156
Test MAPE: 12.776124747702239 %


In [15]:
#Call the feature importances value used from Random Forest Regressor with sorting it.
importances = pd.Series(rfr_cv.best_estimator_.feature_importances_, index=x_train.columns)
importances = importances.sort_values(ascending=False)

# Plot
plt.figure(figsize=(14, 8))  # Larger size to fit all labels
importances.plot(kind='bar')

plt.title('Feature Importance (All Features Sorted)', fontsize=14)
plt.xlabel("Features", fontsize=12)
plt.ylabel("Importance Score", fontsize=12)
plt.xticks(rotation=75, ha='right')  # rotate labels for readability
plt.tight_layout()
plt.show()

# Set threshold
threshold = 0.002

# Identify low-importance features
low_importance_features = importances[importances < threshold].index.tolist()

# Drop them from your training and testing sets
x_train_reduced = x_train.drop(columns=low_importance_features)
x_test_reduced = x_test.drop(columns=low_importance_features)

#Show what we dropped the features that has low importances
print("Dropped Features:", low_importance_features)

NameError: name 'rfr_cv' is not defined

In [16]:
#Train again
param={
    'n_estimators': randint(100, 300),
    'max_depth':randint(1, 20),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 5)
}

mape_score = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)


rfr_cv=RandomizedSearchCV(estimator=rfr, param_distributions=param, n_iter=15, cv=kf,scoring=mape_score, n_jobs=-1)
rfr_cv.fit(x_train, y_train)

# Get feature importances
importances = pd.Series(rfr_cv.best_estimator_.feature_importances_, index=x_train.columns)

# Set threshold
threshold = 0.002

# Identify low-importance features
low_importance_features = importances[importances < threshold].index.tolist()

# Drop them from your training and testing sets
x_train_reduced = x_train.drop(columns=low_importance_features)
x_test_reduced = x_test.drop(columns=low_importance_features)

#Show what we dropped the features that has low importances
print("Dropped Features:", low_importance_features)

NameError: name 'rfr' is not defined

In [None]:
#Train and apply K-Fold to RandomSearch Cross-Validation with scoring MAPE
rfr_cv=RandomizedSearchCV(estimator=rfr, param_distributions=param, n_iter=15, cv=kf,scoring=mape_score, n_jobs=-1)
rfr_cv.fit(x_train_reduced, y_train)

#Call the best parameter
best_param=rfr_cv.best_params_
print("Best Parameters", best_param)

#Call Random Forest Regressor only using good importances values
final_model = RandomForestRegressor(**best_param, random_state=50)
final_model.fit(x_train_reduced, y_train)

y_pred_train = final_model.predict(x_train_reduced)
y_pred_test = final_model.predict(x_test_reduced)

print("Train MAE:", mean_absolute_error(y_train, y_pred_train))
print("Train R2 score:", r2_score(y_train, y_pred_train))
print("Train MAPE:" , mean_absolute_percentage_error(y_train, y_pred_train)*100,"%")
print()
print("Test MAE:", mean_absolute_error(y_test, y_pred_test))
print("Test R2 score:", r2_score(y_test, y_pred_test))
print("Test MAPE:" , mean_absolute_percentage_error(y_test, y_pred_test)*100,"%")

Train MAE: 92374.83521294624
Train R2 score: 0.9351783509638825
Train MAPE: 8.305160807912532 %

Test MAE: 137703.34572806975
Test R2 score: 0.8582423809991196
Test MAPE: 12.303136232232873 %


In [None]:

param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.05, 0.25),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3)
}

# Call XGBoost model
xgb = XGBRegressor(random_state=42)

# Call RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=30, scoring='neg_mean_absolute_percentage_error', cv=5,verbose=1, n_jobs=-1, random_state=42)

#Fit the model
random_search.fit(x_train, y_train)

#Evaluate best model
best_param = random_search.best_estimator_

best_model = XGBRegressor(**best_param.get_params())
best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", random_search.best_params_)
print(f"Test MAPE: {mape:.4f}")
print(f"Test R² Score: {r2:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'colsample_bytree': np.float64(0.8869894380482674), 'learning_rate': np.float64(0.13272450621316229), 'max_depth': 9, 'n_estimators': 200, 'subsample': np.float64(0.9006523757990821)}
Test MAPE: 0.1178
Test R² Score: 0.8779
