# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Importing Preprocessed Dataset

In [2]:
# Load data into dataframe
df_bath = pd.read_csv('preprocessed_dataset_bath.csv')
df_sqft = pd.read_csv('preprocessed_dataset_sqft.csv')

# Model Training - Bath

In [3]:
# Separate rows with NaN values
df_bath_nan = df_bath[df_bath.isna().any(axis=1)]

# Separate rows without NaN values
df_bath_without_nan = df_bath[df_bath.notna().all(axis=1)]

# KNN Model

In [4]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test)
y_pred

f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.5733333333333334
F1 Score: 0.5461869513677541
Recall: 0.5733333333333334
Precision: 0.5366242112431929


# Random Forest Model

In [5]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
y_pred

f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.6577777777777778
F1 Score: 0.6593987039716467
Recall: 0.6577777777777778
Precision: 0.6654351256647547


# Support Vector Machine (SVM)

In [6]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.6133333333333333
F1 Score: 0.5848345244676091
Recall: 0.6133333333333333
Precision: 0.6116510687728469


# Model Training - Square Feet

In [7]:
# Separate rows with NaN values
df_sqft_nan = df_sqft[df_sqft.isna().any(axis=1)]

# Separate rows without NaN values
df_sqft_without_nan = df_sqft[df_sqft.notna().all(axis=1)]

# KNN Model

In [8]:
# Split into features (X) and target variable (y)
X = df_sqft_without_nan.drop(columns=['PROPERTYSQFT'])
y = df_sqft_without_nan['PROPERTYSQFT']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN classifier with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Log transformation (using log1p to handle potential 0 values)
log_actuals = np.log1p(y_test)
log_predictions = np.log1p(y_pred)

# Define tolerance within 5%
tolerances = .05 * log_actuals

# Calculate "true positives", "false positives", and "false negatives"
within_tolerance = np.abs(log_actuals - log_predictions) <= tolerances

# Create binary classifications
binary_actuals = np.ones_like(log_actuals)  # All actuals are "positive"
binary_predictions = within_tolerance.astype(int)  # 1 if within tolerance, 0 otherwise

# Evaluate the model's accuracy
f1 = f1_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(binary_actuals, binary_predictions)
recall = recall_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)
precision = precision_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.6446540880503144
F1 Score: 0.7839388145315488
Recall: 0.6446540880503144
Precision: 1.0


# Random Forest Model

In [9]:
# Split into features (X) and target variable (y)
X = df_sqft_without_nan.drop(columns=['PROPERTYSQFT'])
y = df_sqft_without_nan['PROPERTYSQFT']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train_scaled, y_train)

y_pred = rf_model.predict(X_test_scaled)

# Log transformation (using log1p to handle potential 0 values)
log_actuals = np.log1p(y_test)
log_predictions = np.log1p(y_pred)

# Define tolerance within 5%
tolerances = .05 * log_actuals

# Calculate "true positives", "false positives", and "false negatives"
within_tolerance = np.abs(log_actuals - log_predictions) <= tolerances

# Create binary classifications
binary_actuals = np.ones_like(log_actuals)  # All actuals are "positive"
binary_predictions = within_tolerance.astype(int)  # 1 if within tolerance, 0 otherwise

# Evaluate the model's accuracy
f1 = f1_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(binary_actuals, binary_predictions)
recall = recall_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)
precision = precision_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.7861635220125787
F1 Score: 0.8802816901408451
Recall: 0.7861635220125787
Precision: 1.0


# Support Vector Machine (SVM)

In [10]:
# Split into features (X) and target variable (y)
X = df_sqft_without_nan.drop(columns=['PROPERTYSQFT'])
y = df_sqft_without_nan['PROPERTYSQFT']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Log transformation (using log1p to handle potential 0 values)
log_actuals = np.log1p(y_test)
log_predictions = np.log1p(y_pred)

# Define tolerance within 5%
tolerances = .05 * log_actuals

# Calculate "true positives", "false positives", and "false negatives"
within_tolerance = np.abs(log_actuals - log_predictions) <= tolerances

# Create binary classifications
binary_actuals = np.ones_like(log_actuals)  # All actuals are "positive"
binary_predictions = within_tolerance.astype(int)  # 1 if within tolerance, 0 otherwise

# Evaluate the model's accuracy
f1 = f1_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(binary_actuals, binary_predictions)
recall = recall_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)
precision = precision_score(binary_actuals, binary_predictions,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.5974842767295597
F1 Score: 0.7480314960629921
Recall: 0.5974842767295597
Precision: 1.0


# Predict missing data using best scoring model
Random Forest performed best for both # of baths and sqft

# Bathrooms

In [11]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.067, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

test = df_bath_nan.drop(columns=['BATH'])

y_pred = rf_model.predict(test)

df_bath_nan['BATH'] = y_pred
df_bath_nan

Unnamed: 0,PRICE,BEDS,BATH,SUBLOCALITY_ENCODED,TYPE_ENCODED
4,55000000,7,9.0,1.0,2.0
11,689000,3,2.0,2.0,2.0
27,2250000,12,6.0,4.0,4.0
68,1489000,6,4.0,2.0,4.0
69,65000000,3,6.0,1.0,2.0
...,...,...,...,...,...
4776,555000,3,2.0,2.0,9.0
4780,2000000,8,4.0,3.0,4.0
4783,799000,6,4.0,11.0,4.0
4788,4750000,3,4.0,4.0,3.0


In [12]:
# Concatenate along rows (axis=0)
df_bath_concat = pd.concat([df_bath_nan, df_bath_without_nan], axis=0)

# Square feet

In [13]:
# Split into features (X) and target variable (y)
X = df_sqft_without_nan.drop(columns=['PROPERTYSQFT'])
y = df_sqft_without_nan['PROPERTYSQFT']
test = df_sqft_nan.drop(columns=['PROPERTYSQFT'])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.067, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(test)

df_sqft_nan['PROPERTYSQFT'] = y_pred
df_sqft_nan

Unnamed: 0,PRICE,BEDS,PROPERTYSQFT,SUBLOCALITY_ENCODED,TYPE_ENCODED
6,899500,2,951.0,4.0,0.0
12,259000,3,590.0,8.0,3.0
13,430000,2,900.0,8.0,3.0
14,895000,3,1700.0,4.0,3.0
26,325000,1,713.0,8.0,3.0
...,...,...,...,...,...
4791,370000,3,440.0,4.0,3.0
4793,1950000,2,960.0,4.0,3.0
4796,599000,1,800.0,4.0,3.0
4797,245000,1,750.0,10.0,3.0


In [14]:
# Concatenate along rows (axis=0)
df_sqft_concat = pd.concat([df_sqft_nan, df_sqft_without_nan], axis=0)

# Finalize Data and Clean up 

In [24]:
# Join based on index
df_join = df_sqft_concat.join(df_bath_concat['BATH'], how='inner')

# Sublocality decoding using original mapping because encoding was done in preprocessing notebook
encoded_data = df_join['SUBLOCALITY_ENCODED']
# Original mapping used for encoding
category_to_number = {
 'Manhattan': 0,
 'New York County': 1,
 'Richmond County': 2,
 'Kings County': 3,
 'New York': 4,
 'East Bronx': 5,
 'Brooklyn': 6,
 'The Bronx': 7,
 'Queens': 8,
 'Staten Island': 9,
 'Queens County': 10,
 'Bronx County': 11,
 'Coney Island': 12,
 'Brooklyn Heights': 13,
 'Jackson Heights': 14,
 'Riverdale': 15,
 'Rego Park': 16,
 'Fort Hamilton': 17,
 'Flushing': 18,
 'Dumbo': 19,
 'Snyder Avenue': 20
}
# Create a reverse mapping
number_to_category = {v: k for k, v in category_to_number.items()}
# Decode manually using the reverse mapping
decoded_data = [number_to_category[num] for num in encoded_data]
# Change New York County:1 to New york, same for Bronx County:11, and Queens County:10
decoded_data = pd.Series(decoded_data).replace('New York County', 'New York').replace('Queens County', 'Queens').replace('Bronx County', 'Bronx').tolist()
# Input decoded data
df_join['SUBLOCALITY_ENCODED'] = decoded_data

# Type decoding
encoded_data = df_join['TYPE_ENCODED']
category_to_number = {
 'Condo for sale': 0,
 'House for sale': 1,
 'Townhouse for sale': 2,
 'Co-op for sale': 3,
 'Multi-family home for sale': 4,
 'For sale': 5,
 'Contingent': 6,
 'Land for sale': 7,
 'Foreclosure': 8,
 'Pending': 9,
 'Coming Soon': 10,
 'Mobile house for sale': 11
}
# Create a reverse mapping
number_to_category = {v: k for k, v in category_to_number.items()}
# Decode manually using the reverse mapping
decoded_data = [number_to_category[num] for num in encoded_data]
# Input decoded data
df_join['TYPE_ENCODED'] = decoded_data

# Change header names 
df_join = df_join.rename(columns={'SUBLOCALITY_ENCODED': 'SUBLOCALITY'})
df_join = df_join.rename(columns={'TYPE_ENCODED': 'TYPE'})

# Reorder columns and sort rows for easier readability
new_order = ['TYPE','SUBLOCALITY','PRICE','PROPERTYSQFT','BEDS','BATH']
df_join = df_join[new_order]
df_join = df_join.sort_index()
#df_join.to_csv('Final.csv', index=False)

Unnamed: 0,TYPE,SUBLOCALITY,PRICE,PROPERTYSQFT,BEDS,BATH
0,Condo for sale,Manhattan,315000,1400.0,2,2.0
1,Condo for sale,New York,195000000,17545.0,7,10.0
2,House for sale,Richmond County,260000,2015.0,4,2.0
3,Condo for sale,New York,69000,445.0,3,1.0
4,Townhouse for sale,New York,55000000,14175.0,7,9.0
5,House for sale,Kings County,690000,4004.0,5,2.0
6,Condo for sale,New York,899500,951.0,2,2.0
7,House for sale,Richmond County,16800000,33000.0,8,16.0
8,Co-op for sale,East Bronx,265000,750.0,1,1.0
9,Co-op for sale,Brooklyn,440000,978.0,2,1.0


# Graphs and Plots and Neat Stuff