In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [47]:
df=pd.read_csv('resources/Cleaned_data.csv')
df.head()

Unnamed: 0,PRICE,BEDS,BATH,PROPERTYSQFT,PROPERTY_CATEGORY,TYPE,SUBLOCALITY_ext,POSTCODE,STATE_ext,LATITUDE,LONGITUDE,PRICE_PER_SQFT
0,315000,2,2.0,1400.0,Commercial,condo,Manhattan,10022,New York,40.761255,-73.974483,225.0
1,260000,4,2.0,2015.0,Residential,house,Staten Island,10312,New York,40.541805,-74.196109,129.032258
2,69000,3,1.0,445.0,Commercial,condo,Manhattan,10022,New York,40.761398,-73.974613,155.05618
3,899500,2,2.0,2184.207862,Residential,condo,Manhattan,10027,New York,40.809448,-73.946777,411.819779
4,265000,1,1.0,750.0,Residential,co-op,The Bronx,10473,New York,40.821586,-73.874089,353.333333


# Features Engineering


In [48]:
# 1. Create new features
df['PRICE_PER_BED'] = df.apply(lambda row: row['PRICE'] / row['BEDS'] if row['BEDS'] > 0 else 0, axis=1)
df['PRICE_PER_BATH'] = df.apply(lambda row: row['PRICE'] / row['BATH'] if row['BATH'] > 0 else 0, axis=1)
df['TOTAL_ROOMS'] = df['BEDS'] + df['BATH']


In [49]:

# 2. Bin PROPERTYSQFT into categories
def sqft_category(sqft):
    if sqft < 1000:
        return 'Small'
    elif 1000 <= sqft < 2000:
        return 'Medium'
    else:
        return 'Large'

df['SQFT_CATEGORY'] = df['PROPERTYSQFT'].apply(sqft_category)


In [50]:
# 3. Define price ranges and categorize PRICE
bins = [0, 500000, 1000000, 2000000, float('inf')]  # Example price ranges
labels = ['Low', 'Medium', 'High', 'Very High']
df['PRICE_ZONE'] = pd.cut(df['PRICE'], bins=bins, labels=labels, right=False)

Prepare features (X) and target variable (y)


In [51]:
# 4. Prepare features and target variable again
X = df.drop(columns=['PRICE', 'PRICE_ZONE', 'STATE_ext', 'LATITUDE', 'LONGITUDE'])  
y = df['PRICE_ZONE']  # Use 'PRICE_ZONE' as the target variable


In [52]:
# Ensure that ZIPCODE is treated as a string (you have already done this)
X['ZIPCODE'] = X['POSTCODE'].astype(str)

In [8]:
X.head()

Unnamed: 0,BEDS,BATH,PROPERTYSQFT,PROPERTY_CATEGORY,TYPE,SUBLOCALITY_ext,POSTCODE,PRICE_PER_SQFT,PRICE_PER_BED,PRICE_PER_BATH,TOTAL_ROOMS,SQFT_CATEGORY,ZIPCODE
0,2,2.0,1400.0,Commercial,condo,Manhattan,10022,225.0,157500.0,157500.0,4.0,Medium,10022
1,4,2.0,2015.0,Residential,house,Staten Island,10312,129.032258,65000.0,130000.0,6.0,Large,10312
2,3,1.0,445.0,Commercial,condo,Manhattan,10022,155.05618,23000.0,69000.0,4.0,Small,10022
3,2,2.0,2184.207862,Residential,condo,Manhattan,10027,411.819779,449750.0,449750.0,4.0,Large,10027
4,1,1.0,750.0,Residential,co-op,The Bronx,10473,353.333333,265000.0,265000.0,2.0,Small,10473


In [None]:
#check thetarget column
y

0          Low
1          Low
2          Low
3       Medium
4          Low
         ...  
2711       Low
2712    Medium
2713    Medium
2714      High
2715       Low
Name: PRICE_ZONE, Length: 2716, dtype: category
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']

In [73]:
# . Label encode the target variable 'y' if it's categorical
label_encoder = LabelEncoder()

# Fit the encoder on the target variable and transform both train and test sets
y_encoded = label_encoder.fit_transform(y)

In [74]:
# 2. Split the data into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [75]:
# 2. Apply OneHotEncoder to categorical features
categorical_columns = ['ZIPCODE', 'PROPERTY_CATEGORY', 'SUBLOCALITY_ext','TYPE','SQFT_CATEGORY'] 
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Fit and transform the training data
X_train_encoded = pd.DataFrame(ohe.fit_transform(X_train[categorical_columns]), columns=ohe.get_feature_names_out(categorical_columns))

# Transform the test data
X_test_encoded = pd.DataFrame(ohe.transform(X_test[categorical_columns]), columns=ohe.get_feature_names_out(categorical_columns))

# Reset index for both train and test encoded datasets
X_train_encoded.reset_index(drop=True, inplace=True)
X_test_encoded.reset_index(drop=True, inplace=True)

In [55]:
 #3. Select Numerical Features (including any additional ones, like SQFT_CATEGORY if needed)
numerical_columns = ['BEDS', 'BATH', 'PROPERTYSQFT', 'PRICE_PER_SQFT', 'PRICE_PER_BED', 'PRICE_PER_BATH', 'TOTAL_ROOMS']
X_train_numerical = X_train[numerical_columns]
X_test_numerical = X_test[numerical_columns]



In [56]:
# 4. Combine Encoded and Numerical Features for both train and test data
X_train_final = pd.concat([X_train_encoded, X_train_numerical.reset_index(drop=True)], axis=1)
X_test_final = pd.concat([X_test_encoded, X_test_numerical.reset_index(drop=True)], axis=1)

# Check for any missing values in the final datasets
print(X_train_final.isnull().sum())
print(X_test_final.isnull().sum())



ZIPCODE_10001     0
ZIPCODE_10002     0
ZIPCODE_10003     0
ZIPCODE_10004     0
ZIPCODE_10005     0
                 ..
PROPERTYSQFT      0
PRICE_PER_SQFT    0
PRICE_PER_BED     0
PRICE_PER_BATH    0
TOTAL_ROOMS       0
Length: 207, dtype: int64
ZIPCODE_10001     0
ZIPCODE_10002     0
ZIPCODE_10003     0
ZIPCODE_10004     0
ZIPCODE_10005     0
                 ..
PROPERTYSQFT      0
PRICE_PER_SQFT    0
PRICE_PER_BED     0
PRICE_PER_BATH    0
TOTAL_ROOMS       0
Length: 207, dtype: int64


In [57]:
# check the first few rows
print(display(X_train_final.head()))
print(display(X_test_final.head()))

Unnamed: 0,ZIPCODE_10001,ZIPCODE_10002,ZIPCODE_10003,ZIPCODE_10004,ZIPCODE_10005,ZIPCODE_10009,ZIPCODE_10010,ZIPCODE_10011,ZIPCODE_10012,ZIPCODE_10013,...,SQFT_CATEGORY_Large,SQFT_CATEGORY_Medium,SQFT_CATEGORY_Small,BEDS,BATH,PROPERTYSQFT,PRICE_PER_SQFT,PRICE_PER_BED,PRICE_PER_BATH,TOTAL_ROOMS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,3,3.0,1855.0,781.671159,483333.333333,483333.3,6.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,3,1.0,2184.207862,526.048834,383000.0,1149000.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1,1.0,2184.207862,128.192927,280000.0,280000.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1,1.0,725.0,627.586207,455000.0,455000.0,2.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4,2.0,2080.0,624.519231,324750.0,649500.0,6.0


None


Unnamed: 0,ZIPCODE_10001,ZIPCODE_10002,ZIPCODE_10003,ZIPCODE_10004,ZIPCODE_10005,ZIPCODE_10009,ZIPCODE_10010,ZIPCODE_10011,ZIPCODE_10012,ZIPCODE_10013,...,SQFT_CATEGORY_Large,SQFT_CATEGORY_Medium,SQFT_CATEGORY_Small,BEDS,BATH,PROPERTYSQFT,PRICE_PER_SQFT,PRICE_PER_BED,PRICE_PER_BATH,TOTAL_ROOMS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,3,2.373861,1618.0,326.946848,176333.333333,222843.726592,5.373861
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,2,1.0,1249.0,360.288231,225000.0,450000.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2,1.0,2184.207862,118.578458,129500.0,259000.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2,2.0,968.0,810.950413,392500.0,392500.0,4.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1,1.0,971.0,1024.716787,995000.0,995000.0,2.0


None


# CLassification Model

## Random Forest Classifier

In [58]:

# 1. Initialize the model
clf = RandomForestClassifier(random_state=42)

In [59]:
# 2. Train the model
clf.fit(X_train_final, y_train)


In [20]:
# 3. Make predictions
y_train_pred = clf.predict(X_train_final)
y_test_pred = clf.predict(X_test_final)


In [60]:
# 4. Evaluate the model
# Training set performance
print("Training Performance:")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred))
print("Classification Report:\n", classification_report(y_train, y_train_pred))


Training Performance:
Accuracy: 1.0
Confusion Matrix:
 [[ 469    0    0    0]
 [   0  593    0    0]
 [   0    0 1053    0]
 [   0    0    0   57]]
Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00       469
         Low       1.00      1.00      1.00       593
      Medium       1.00      1.00      1.00      1053
   Very High       1.00      1.00      1.00        57

    accuracy                           1.00      2172
   macro avg       1.00      1.00      1.00      2172
weighted avg       1.00      1.00      1.00      2172



In [61]:
# Test set performance
print("\nTest Performance:")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))



Test Performance:
Accuracy: 0.9669117647058824
Confusion Matrix:
 [[108   0   2   0]
 [  0 161   4   0]
 [  0   1 250   0]
 [ 11   0   0   7]]
Classification Report:
               precision    recall  f1-score   support

        High       0.91      0.98      0.94       110
         Low       0.99      0.98      0.98       165
      Medium       0.98      1.00      0.99       251
   Very High       1.00      0.39      0.56        18

    accuracy                           0.97       544
   macro avg       0.97      0.84      0.87       544
weighted avg       0.97      0.97      0.96       544



### Smote implementation
implement SMOTE to address the class imbalance in the dataset, and adjust class weights to rebalance the training data.

In [62]:

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_final, y_train)

# Check the class distribution after SMOTE
from collections import Counter
print("Class distribution before SMOTE:", Counter(y_train))
print("Class distribution after SMOTE:", Counter(y_train_balanced))


Class distribution before SMOTE: Counter({'Medium': 1053, 'Low': 593, 'High': 469, 'Very High': 57})
Class distribution after SMOTE: Counter({'High': 1053, 'Low': 1053, 'Medium': 1053, 'Very High': 1053})


In [63]:
# Train the model
clf_balanced = RandomForestClassifier(random_state=42)
clf_balanced.fit(X_train_balanced, y_train_balanced)

# Predict on the test data
y_test_pred_balanced = clf_balanced.predict(X_test_final)


In [64]:
# Evaluate the balanced model
print("Test Performance After SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_test_pred_balanced))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_balanced))
print("Classification Report:\n", classification_report(y_test, y_test_pred_balanced))


Test Performance After SMOTE:
Accuracy: 0.9816176470588235
Confusion Matrix:
 [[108   0   1   1]
 [  0 163   2   0]
 [  1   1 249   0]
 [  4   0   0  14]]
Classification Report:
               precision    recall  f1-score   support

        High       0.96      0.98      0.97       110
         Low       0.99      0.99      0.99       165
      Medium       0.99      0.99      0.99       251
   Very High       0.93      0.78      0.85        18

    accuracy                           0.98       544
   macro avg       0.97      0.93      0.95       544
weighted avg       0.98      0.98      0.98       544



To improve the recall for the Very High category, we can perform hyperparameter tuning for the Random Forest model. 

In [65]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

In [66]:
# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,
    scoring='f1_macro',  # Optimize for balanced performance
    cv=5,  # 5-fold cross-validation
    random_state=42,
    verbose=2,
    n_jobs=-1
)


In [67]:
# Fit the model to the training data
random_search.fit(X_train_final, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [31]:
# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30, 'class_weight': 'balanced'}


In [32]:
# Evaluate the best model on the test data
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_final)

In [33]:
# Evaluate test performance
print("Test Performance After Tuning:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Test Performance After Tuning:
Confusion Matrix:
 [[106   0   2   2]
 [  0 164   1   0]
 [  0   1 250   0]
 [  5   0   0  13]]
Classification Report:
               precision    recall  f1-score   support

        High       0.95      0.96      0.96       110
         Low       0.99      0.99      0.99       165
      Medium       0.99      1.00      0.99       251
   Very High       0.87      0.72      0.79        18

    accuracy                           0.98       544
   macro avg       0.95      0.92      0.93       544
weighted avg       0.98      0.98      0.98       544



## XGBoost Classifier

In [None]:
# 1. Initialize the XGBoost classifier
xgb_model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

In [77]:
# 2. Train the model
xgb_model.fit(X_train_final, y_train)


In [78]:
#3. Make Predictions
y_pred = xgb_model.predict(X_test_final)

In [None]:
# 4. Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [80]:
# Output the results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.9871323529411765

Confusion Matrix:
 [[110   0   0   0]
 [  0 165   0   0]
 [  0   1 250   0]
 [  6   0   0  12]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       110
           1       0.99      1.00      1.00       165
           2       1.00      1.00      1.00       251
           3       1.00      0.67      0.80        18

    accuracy                           0.99       544
   macro avg       0.99      0.92      0.94       544
weighted avg       0.99      0.99      0.99       544

