# In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# --- Training pipeline -------------------------------------------------------
# Read the training listings from disk.
data = pd.read_csv('Main Data.csv')

# Columns that are not used as features: identifiers, school names and a set of
# descriptive fields. 'Price' is the regression target, so it is removed from
# the feature frame as well.
_non_feature_columns = [
    'URL',
    'Elementary School Name',
    'Middle School Name',
    'High School Name',
    'Address',
    'Property Type',
    'Style',
    'Floor Type',
    'Heat Type',
    'Cool Type',
    'Noise Level',
    'Flood Factor',
    'Fire Factor',
    'Heat Factor',
    'Wind Factor',
]
y = data['Price']
X = data.drop(columns=_non_feature_columns + ['Price'])

# One-hot encode whatever categorical columns are still present.
X_encoded = pd.get_dummies(X)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Mean-impute missing values. The imputer learns its statistics from the
# training rows only and is then applied unchanged to both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Fit a 100-tree random forest on the imputed training rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_imputed, y_train)

# --- Score new listings and flag potential good deals ------------------------
# 'New Data.csv' is expected to have the same structure as 'Main Data.csv',
# including a 'Price' column (it is needed for the price comparison below).
new_data = pd.read_csv('New Data.csv')

# Keep the URL and address columns for later use
url_address_data = new_data[['URL', 'Address']]

# Drop the same non-feature columns that were removed before training, then
# one-hot encode the remaining categorical columns.
new_data_prep = new_data.drop(
    columns=[
        'URL', 'Elementary School Name', 'Middle School Name', 'High School Name',
        'Address', 'Property Type', 'Style', 'Floor Type', 'Heat Type', 'Cool Type',
        'Noise Level', 'Flood Factor', 'Fire Factor', 'Heat Factor', 'Wind Factor',
    ]
)
new_data_encoded = pd.get_dummies(new_data_prep)

# The dummy columns depend on which categories occur in each file, so the new
# frame can lack training columns or carry extra ones ('Price' is one of the
# extras, since it is not dropped above). These two sets are kept for
# inspection only; the alignment itself is done by the reindex below.
missing_cols = set(X_train.columns) - set(new_data_encoded.columns)
extra_cols = set(new_data_encoded.columns) - set(X_train.columns)

# Align the new frame to the training columns in one step: columns missing
# from the new data are added and filled with 0, columns the model never saw
# are dropped, and the result is put in the same column ORDER as X_train.
# The order matters: the imputer and the forest were fitted on X_train's
# layout, so appending missing columns at the end would mismatch features.
new_data_encoded = new_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Impute any missing values in the new data with the statistics learned
# from the training split.
new_data_imputed = imputer.transform(new_data_encoded)

# Make predictions on the new data using the trained RandomForest model
predicted_values = rf.predict(new_data_imputed)

# Put URL and Address first in the output. new_data still contains both
# columns, so they are dropped before concatenating; otherwise the result
# would carry duplicate 'URL' and 'Address' columns.
new_data = pd.concat(
    [url_address_data, new_data.drop(columns=['URL', 'Address'])],
    axis=1,
)

# Add the predicted values back to the DataFrame
new_data['Predicted Price'] = predicted_values

# Listed price minus predicted price (requires the 'Price' column).
new_data['Price Difference'] = new_data['Price'] - new_data['Predicted Price']

# Properties with a negative 'Price Difference' are priced lower than
# predicted, potentially indicating a good deal.
good_deals = new_data[new_data['Price Difference'] < 0]

# Save the good deals to a CSV file
good_deals.to_csv('Good Deals.csv', index=False)
