In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the two source tables. Paths are relative to the notebook's working
# directory; adjust them if the CSVs live elsewhere.
rainfall_data = pd.read_csv('./dataset/rainfall.csv')
housing_data = pd.read_csv('./dataset/AmesHousing.csv')

# Seeded generator: without it the synthetic columns below (and therefore the
# trained model, its metrics and its predictions) change on every run, even
# though the split and the forest are already pinned with random_state=42.
rng = np.random.default_rng(42)

# Synthetic water table level: rainfall scaled by a random factor in [0.5, 1.5)
rainfall_data['water_table_level'] = rainfall_data['rainfall'] * rng.uniform(0.5, 1.5, len(rainfall_data))

# Synthetic market trend: SalePrice scaled by a random factor in [0.9, 1.1)
housing_data['market_trend'] = housing_data['SalePrice'] * rng.uniform(0.9, 1.1, len(housing_data))

# Combine the synthetic features into a single frame.
# NOTE: the Series are aligned on row index, not on any shared key, so rows of
# the two unrelated tables are paired purely by their index label. Any index
# present in only one table yields NaN in the other table's columns.
combined_data = pd.DataFrame({
    'rainfall': rainfall_data['rainfall'],
    'water_table_level': rainfall_data['water_table_level'],
    'market_trend': housing_data['market_trend'],
})

# Final score: fixed weighted sum of two of the features (rainfall itself is
# not part of the formula).
combined_data['final_score'] = (
    0.4 * combined_data['water_table_level'] +
    0.3 * combined_data['market_trend']
)

# Report how many values are missing per column before cleaning
print(combined_data.isnull().sum())

# Keep only rows where every column is present (this discards the rows that
# exist in just one of the two tables).
combined_data = combined_data.dropna()
# Alternative: impute instead of dropping, e.g. with the column means
# combined_data = combined_data.fillna(combined_data.mean())

# Target is the synthetic score; every remaining column is a predictor
y = combined_data['final_score']
X = combined_data.drop(columns='final_score')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest regressor on the training portion
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and show the raw predictions
y_pred = model.predict(X_test)
print("Predicted final scores: ", y_pred)



rainfall             2877
water_table_level    2877
market_trend            0
final_score          2877
dtype: int64
Predicted final scores:  [ 66741.05696109  86491.44632595  99587.02438955  53218.00426949
  66704.39698838  57529.99927714 110707.21892227  47824.85902043
  78236.80360924  37906.92327963  52729.29116908]


In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Score the predictions against the true held-out targets
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed

# Report each metric on its own line
for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R-squared (R2):", r2),
):
    print(label, value)


Mean Absolute Error (MAE): 1748.0740280538269
Mean Squared Error (MSE): 8497899.8877843
Root Mean Squared Error (RMSE): 2915.115758899516
R-squared (R2): 0.9834183463642933


In [3]:
# Importance values exposed by the fitted forest
feature_importance = model.feature_importances_

# Pair each importance with the column of X at the same position and print
# one line per feature
report_lines = [
    f"Feature: {name}, Importance: {score}"
    for name, score in zip(X.columns, feature_importance)
]
print("\n".join(report_lines))


Feature: rainfall, Importance: 0.02456501928050934
Feature: water_table_level, Importance: 0.02596468154237983
Feature: market_trend, Importance: 0.9494702991771108


In [4]:
# One hypothetical observation, with the same three columns used for training:
# example rainfall, example water table level, example market trend (SalePrice)
new_data = pd.DataFrame(
    [[50, 30, 200000]],
    columns=['rainfall', 'water_table_level', 'market_trend'],
)

# Score the example row with the trained model
final_score_prediction = model.predict(new_data)
print("Predicted final score:", final_score_prediction)


Predicted final score: [60769.18502881]


In [5]:
import joblib

# Persist the trained model to disk, then read it straight back
model_file = 'final_score_model.pkl'
joblib.dump(model, model_file)

# CAUTION: joblib files are pickle-based; only load files you trust
loaded_model = joblib.load(model_file)

# Predict the example row with the reloaded model
loaded_model_prediction = loaded_model.predict(new_data)
print("Predicted final score using loaded model:", loaded_model_prediction)


Predicted final score using loaded model: [60769.18502881]
