In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load dataset
data = pd.read_csv('daily_aggregated_data.csv')

# Separate features and target variable
X = data.drop(columns=['precipMM', 'date'])  # Drop 'precipMM' (target) and 'date' column
y = data['precipMM']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [2]:
import pickle

# Save the scaler to a file
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle

# Train Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Save the model
with open('linear_regression_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# Predict on the test set
y_pred_lr = lr_model.predict(X_test_scaled)

# Calculate performance metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Display performance metrics
print("Linear Regression Performance Metrics:")
print(f"Mean Squared Error (MSE): {mse_lr:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"R-squared (R2): {r2_lr:.4f}")
print("-" * 40)


Linear Regression Performance Metrics:
Mean Squared Error (MSE): 96.3639
Root Mean Squared Error (RMSE): 9.8165
R-squared (R2): 0.4087
----------------------------------------


In [4]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Save the model
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate performance metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Display performance metrics
print("Random Forest Regressor Performance Metrics:")
print(f"Mean Squared Error (MSE): {mse_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"R-squared (R2): {r2_rf:.4f}")
print("-" * 40)


Random Forest Regressor Performance Metrics:
Mean Squared Error (MSE): 84.8228
Root Mean Squared Error (RMSE): 9.2099
R-squared (R2): 0.4795
----------------------------------------


In [5]:
from xgboost import XGBRegressor

# Train XGBoost Regressor
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Save the model
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Calculate performance metrics
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Display performance metrics
print("XGBoost Regressor Performance Metrics:")
print(f"Mean Squared Error (MSE): {mse_xgb:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_xgb:.4f}")
print(f"R-squared (R2): {r2_xgb:.4f}")
print("-" * 40)


XGBoost Regressor Performance Metrics:
Mean Squared Error (MSE): 81.6511
Root Mean Squared Error (RMSE): 9.0361
R-squared (R2): 0.4990
----------------------------------------


In [6]:
import pandas as pd

# Assuming df is your original DataFrame
correlation_matrix = df.corr()

# Visualize the correlation matrix
plt.figure(figsize=(12,8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt='.2f')
plt.title("Correlation Matrix")
plt.show()


NameError: name 'df' is not defined