# Load Packages

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.linear_model
from sklearn.preprocessing import PolynomialFeatures
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output

# Data Loading

In [2]:
# Read the dissolved-oxygen, rainfall and turbidity sheets from the workbook.
# Each sheet's Piermont column is dropped right after it is read.
file_path = 'data/Hurricane_Irene_Hudson_River.xlsx'

# This sheet is selected by position (integer 5); the other two are selected by name.
do_data = pd.read_excel(file_path, sheet_name = 5).drop(
    columns = ['Piermont D.O. (ppm)']
)
# The double space after 'Piermont' is part of the label being dropped.
rainfall_data = pd.read_excel(file_path, sheet_name = "Rainfall").drop(
    columns = ['Piermont  Rainfall Daily Accumulation (Inches)']
)
turbidity_data = pd.read_excel(file_path, sheet_name = "Turbidity").drop(
    columns = ['Piermont Turbidity in NTU']
)

# Data Cleaning

Since we read our data in from multiple Excel sheets, we need to merge the resulting dataframes. Every sheet has a date column, so we can merge on it. To make the columns easier to reference in our analysis, we will rename them. Finally, we will convert the date column to a datetime type and set it as the index.

In [3]:
# Merge the three datasets (D.O., rainfall, turbidity) on their shared timestamp column.
# The key is named explicitly so the join cannot silently pick up any other
# column the sheets might have in common. An inner join keeps only the
# timestamps that are present in all three frames.
merge_key = 'Date Time (ET)'
data = do_data.merge(rainfall_data, how = 'inner', on = merge_key)
data = data.merge(turbidity_data, how = 'inner', on = merge_key)
data.head()

Unnamed: 0,Date Time (ET),Port of Albany D.O. (ppm),Norrie Point D.O. (ppm),Port of Albany Rainfall Daily Accumulation (Inches),Norrie Point Rainfall Daily Accumulation (Inches),Port of Albany Turbidity in NTU,Norrie Point Turbidity in NTU
0,2011-08-25 00:00:00,7.68,7.81,0.0,0.0,4.0,9.3
1,2011-08-25 00:15:00,7.6,7.73,0.0,0.0,3.9,8.4
2,2011-08-25 00:30:00,7.57,7.63,0.0,0.0,4.3,7.9
3,2011-08-25 00:45:00,7.72,7.67,0.0,0.0,4.7,8.1
4,2011-08-25 01:00:00,7.74,7.63,0.0,0.0,4.4,8.4


In [4]:
# Swap the long spreadsheet headers for short snake_case names.
# The assignment is positional: the list must have one entry per column, in the
# merged frame's column order (timestamp first, then the Albany / Norrie pair
# for D.O., rainfall and turbidity).
short_names = [
    'date',
    'albany_DO', 'norrie_DO',
    'albany_rainfall', 'norrie_rainfall',
    'albany_turbidity', 'norrie_turbidity',
]
data.columns = short_names
data.head()

Unnamed: 0,date,albany_DO,norrie_DO,albany_rainfall,norrie_rainfall,albany_turbidity,norrie_turbidity
0,2011-08-25 00:00:00,7.68,7.81,0.0,0.0,4.0,9.3
1,2011-08-25 00:15:00,7.6,7.73,0.0,0.0,3.9,8.4
2,2011-08-25 00:30:00,7.57,7.63,0.0,0.0,4.3,7.9
3,2011-08-25 00:45:00,7.72,7.67,0.0,0.0,4.7,8.1
4,2011-08-25 01:00:00,7.74,7.63,0.0,0.0,4.4,8.4


In [5]:
# Parse the 'date' column into datetime values, then list the dtypes to confirm the conversion
data = data.assign(date = pd.to_datetime(data['date']))
data.dtypes

date                datetime64[ns]
albany_DO                  float64
norrie_DO                  float64
albany_rainfall            float64
norrie_rainfall            float64
albany_turbidity           float64
norrie_turbidity           float64
dtype: object

In [6]:
# Move 'date' out of the columns and use it as the row index
data = data.set_index('date')
data.head()

Unnamed: 0_level_0,albany_DO,norrie_DO,albany_rainfall,norrie_rainfall,albany_turbidity,norrie_turbidity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-08-25 00:00:00,7.68,7.81,0.0,0.0,4.0,9.3
2011-08-25 00:15:00,7.6,7.73,0.0,0.0,3.9,8.4
2011-08-25 00:30:00,7.57,7.63,0.0,0.0,4.3,7.9
2011-08-25 00:45:00,7.72,7.67,0.0,0.0,4.7,8.1
2011-08-25 01:00:00,7.74,7.63,0.0,0.0,4.4,8.4


# Multiple Linear Regression
Now that our data is cleaned, we can fit a multiple linear regression. We will predict the turbidity at the Albany sample site, using the rainfall and dissolved oxygen (D.O.) at the same site as our predictors. Let's check whether our model improves at all when a second predictor is added.

In [8]:
# Pick the model inputs (Albany D.O. and rainfall) and the response (Albany turbidity).
# Both selections use a list of column names, so X and Y are each DataFrames.
predictor_columns = ['albany_DO', 'albany_rainfall']
target_columns = ['albany_turbidity']
X = data[predictor_columns]
Y = data[target_columns]

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
(X_train, X_test,
 Y_train, Y_test) = train_test_split(
    X,
    Y,
    test_size = 0.3,
    random_state = 42,
)

In [11]:
# Build a linear regression and fit it on the training split.
# fit() hands back the estimator itself, so the trailing expression shows the fitted model.
model = LinearRegression().fit(X_train, Y_train)
model

In [12]:
# Predict on the held-out rows, then report RMSE and R-squared for those predictions
Y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
r_squared = r2_score(Y_test, Y_pred)

print(f"RMSE: {rmse}")
print(f"R-Squared: {r_squared}")

RMSE: 221.9143474905527
R-Squared: 0.4907389518457509


# Create a widget to evaluate different models

In [18]:
# Column names offered as choices by both selection widgets
column_names = list(data.columns)

# Multi-select list for the predictors; the first column starts out selected
predictor_selector = widgets.SelectMultiple(
    options = column_names,
    value = [column_names[0]],
    description = 'Predictors',
)

# Dropdown for the target variable; the second column starts out selected
target_selector = widgets.Dropdown(
    options = column_names,
    value = column_names[1],
    description = 'Target',
)

# Button that triggers a model evaluation
evaluate_button = widgets.Button(description = 'Evaluate')

# Output area where the evaluation results are shown
output = widgets.Output()

# Define the function to handle button clicks
def evaluate_model(b):
    """Fit and score a linear regression for the current widget selections.

    Click handler for ``evaluate_button``. Reads the predictor columns from
    ``predictor_selector`` and the target column from ``target_selector``,
    fits a ``LinearRegression`` on a 70/30 train/test split of ``data``, and
    prints R^2, MSE and RMSE for the test split inside ``output``.

    Parameters
    ----------
    b
        Argument supplied by the click callback; not used.
    """
    with output:
        clear_output(wait = True) # Clear output of display area

        selected_predictors = list(predictor_selector.value)

        # An empty selection would otherwise reach the model fit with zero
        # feature columns, so report it here in plain words instead.
        if not selected_predictors:
            print("Select at least one predictor.")
            return

        # Make sure the target is not in the predictors
        if target_selector.value in selected_predictors:
            print("Target variable must not be in the predictors.")
            return

        # Define predictors and the target variable
        X = data[selected_predictors]
        Y = data[target_selector.value]

        # Split the data into training and testing sets
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

        # Create and fit the model
        model = LinearRegression()
        model.fit(X_train, Y_train)

        # Predict and evaluate
        Y_pred = model.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        # Display the values. RMSE is printed as well so the result can be
        # compared directly with the RMSE reported for the fixed Albany model.
        print(f"R^2: {r2:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {np.sqrt(mse):.4f}")
        
# Hook the click handler up to the button, then render the controls followed by the results area
evaluate_button.on_click(evaluate_model)
display(predictor_selector, target_selector, evaluate_button, output)

SelectMultiple(description='Predictors', index=(0,), options=('albany_DO', 'norrie_DO', 'albany_rainfall', 'no…

Dropdown(description='Target', index=1, options=('albany_DO', 'norrie_DO', 'albany_rainfall', 'norrie_rainfall…

Button(description='Evaluate', style=ButtonStyle())

Output()