In [21]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
file_path = 'clean_NY_Housing_data.csv'
try:
    df = pd.read_csv(file_path)
    print("CSV file loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file {file_path} was not found.")
    exit(1)

# Display the first few rows of the dataset to understand its structure
print(df.head())

CSV file loaded successfully.
             TYPE   PRICE  BEDS  BATH  PROPERTYSQFT         LOCALITY
0  Co-op for sale  440000     2     1           978     Kings County
1  Co-op for sale  375000     2     1           850     Bronx County
2  Condo for sale  549000     2     2          1000  Richmond County
3  Co-op for sale  199000     3     1           325     Kings County
4  Co-op for sale  350000     1     1           700     Bronx County


In [23]:
# Define the threshold price
threshold_price = 500000

# Add a new column 'Price_Above_Threshold' to classify properties based on the threshold price
if 'PRICE' in df.columns:
    df['Price_Above_Threshold'] = (df['PRICE'] > threshold_price).astype(int)
else:
    print("Error: 'PRICE' column not found in the dataset.")
    exit(1)

# Define the target variable (above/below threshold) and features
y = df['Price_Above_Threshold']
X = df.drop(['Price_Above_Threshold', 'TYPE', 'LOCALITY'], axis=1)

# Ensure there are no missing values
X = X.fillna(0)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create a Logistic Regression model with regularization
lr_model = LogisticRegression()

# Hyperparameter tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Cross-validation on the best model
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Train the best model on the training data
best_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Print the evaluation results
print("Accuracy on test data:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(confusion_mat)

Cross-validation scores: [0.99521531 0.99521531 0.98564593 0.98564593 0.99521531]
Mean cross-validation score: 0.9913875598086126
Accuracy on test data: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       162

    accuracy                           1.00       262
   macro avg       1.00      1.00      1.00       262
weighted avg       1.00      1.00      1.00       262

Confusion Matrix:
[[100   0]
 [  0 162]]


In [25]:
# Add predictions to the original DataFrame
df['Prediction'] = best_model.predict(scaler.transform(df.drop(['Price_Above_Threshold', 'TYPE', 'LOCALITY'], axis=1)))

# Prepare the relevant columns for the JSON output
relevant_columns = ['TYPE', 'PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT', 'LOCALITY', 'Prediction']

# Check if all relevant columns are present in the dataset
if not all(col in df.columns for col in relevant_columns):
    missing_cols = [col for col in relevant_columns if col not in df.columns]
    print(f"Error: Missing columns in the dataset: {missing_cols}")
    exit(1)

# Filter the dataframe to keep only the relevant columns
data_for_json = df[relevant_columns]

# Convert the DataFrame to a list of lists
data_list = data_for_json.values.tolist()

# Define the path for the JSON file
json_file_path = 'filtered_heatmap_data_with_predictions.json'

# Write the data to a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(data_list, json_file)

print(f'Data successfully written to {json_file_path}')

Data successfully written to filtered_heatmap_data_with_predictions.json
