# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

# Load the transformed dataset from a CSV file. The path is relative, so it is
# resolved against the current working directory.
transformed_data_path = 'transformed_rainfall_data.csv'
transformed_data = pd.read_csv(transformed_data_path)

# Check the column names: print the column index so the available names can be
# inspected before any column is selected by name.
print("Column Names in Dataset:", transformed_data.columns)

# Ensure all feature columns are numeric and handle missing values
def preprocess_data(df):
    """Return the numeric columns of ``df`` with missing values mean-imputed.

    Non-numeric columns are discarded. Each remaining NaN is replaced by the
    mean of its own column. The input frame is not modified.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame holding only the numeric columns, NaNs filled.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    # Means are taken after the numeric filter, so they line up one-to-one
    # with the columns being filled.
    column_means = numeric_only.mean()
    return numeric_only.fillna(column_means)

# Preprocess the data: keep numeric columns only and mean-impute missing values.
# NOTE(review): imputation runs on the full dataset before the split below, so
# the column means used to fill training rows are computed with the test rows
# included — confirm this leakage is acceptable for the reported metrics.
transformed_data = preprocess_data(transformed_data)

# Split the data into train and test sets: 20% of the rows are held out for
# testing, and random_state is fixed so the split is reproducible across runs.
train_data, test_data = train_test_split(transformed_data, test_size=0.2, random_state=42)

# Function to train Decision Tree Regressor model and plot results
def train_and_plot_dtr(data, column):
    """Tune, evaluate and plot a Decision Tree Regressor for one target column.

    Every column other than ``column`` is used as a feature. Hyper-parameters
    are chosen by 5-fold ``GridSearchCV`` scored with negative mean squared
    error; the best estimator then predicts the test split and the actual and
    predicted values are drawn on a single matplotlib figure.

    Args:
        data: Mapping with ``'train'`` and ``'test'`` DataFrames.
        column: Name of the target column present in both frames.

    Returns:
        ``(mse, rmse, r2)`` computed on the test split, or
        ``(None, None, None)`` when the target column is missing from either
        frame or the grid search fails.

    Raises:
        KeyError: If ``data`` lacks the ``'train'`` or ``'test'`` key.
    """
    train_df, test_df = data['train'], data['test']

    # Dropping a non-existent column raises KeyError, which would abort the
    # run before the guarded fit below. Report it through the same "failed"
    # return value that a fitting error produces instead.
    if column not in train_df.columns or column not in test_df.columns:
        print(f"Column '{column}' not found in the train/test data; skipping.")
        return None, None, None

    # Define features and target
    X_train = train_df.drop(columns=[column])
    y_train = train_df[column]
    X_test = test_df.drop(columns=[column])
    y_test = test_df[column]

    # Debugging: Check the shapes of the datasets
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Train Decision Tree Regressor model with GridSearchCV
    param_grid = {'max_depth': [3, 5, 7, 10], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
    dtr = DecisionTreeRegressor(random_state=42)
    grid_search = GridSearchCV(dtr, param_grid, cv=5, scoring='neg_mean_squared_error')

    # Best-effort fit: any failure is reported and signalled with the
    # all-None tuple rather than propagated.
    try:
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
    except Exception as e:
        print(f"An error occurred during GridSearchCV fitting: {e}")
        return None, None, None

    # Make predictions
    predictions = best_model.predict(X_test)

    # Calculate Mean Squared Error, its root, and the R² score
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    # Plot actual vs predicted data. The index is reset so both series share
    # the same 0..n-1 x positions.
    # NOTE(review): points are drawn in test-set row order; if the split was
    # shuffled, the 'Time' axis is not chronological — confirm.
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.reset_index(drop=True), label='Actual', color='blue')
    plt.plot(predictions, label='Predicted', color='orange')
    plt.xlabel('Time')
    plt.ylabel('Rainfall')
    plt.title(f'Rainfall Prediction for {column} with Decision Tree ')
    plt.legend()
    plt.show()

    return mse, rmse, r2

# Collect the metrics tuple returned for each location, keyed by its name
results = {}

# Bundle both splits so the training function can look them up by key
data_dict = {'train': train_data, 'test': test_data}

# Train and plot for each location in turn. Adjust a name here if the
# corresponding dataset column is spelled differently.
for location in ('Vavuniya', 'Anuradhapura', 'Maha Illuppallama'):
    results[location] = train_and_plot_dtr(data_dict, location)

# Print results; a leading None marks a run whose training failed
for location, metrics in results.items():
    if metrics[0] is None:
        print(f"{location} - Model training failed")
        continue
    print(f"{location} - MSE: {metrics[0]}, RMSE: {metrics[1]}, R²: {metrics[2]}")
