In [None]:
import datetime

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
def convert_date(df):
    """Attach a ``Corrected_Dates`` datetime column built from the numeric ``Date`` column.

    Each ``Date`` value has 693963 subtracted from it, and the remainder is
    interpreted as a number of days after 1900-01-01.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a numeric ``Date`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``Corrected_Dates`` column.
    """
    epoch = datetime.datetime(1900, 1, 1)
    # NOTE(review): 693963 is presumably the source calendar's day number for
    # 1900-01-01 -- confirm against the dataset's documentation.
    days_after_epoch = df['Date'] - 693963
    df['Corrected_Dates'] = pd.to_datetime(days_after_epoch, unit='D', origin=epoch)
    return df

In [None]:
# Load the baseflow table and replace the numeric Date column with the
# timestamps produced by convert_date, dropping the helper column afterwards.
df = (
    convert_date(pd.read_csv('RRCA_baseflow.csv'))
    .assign(Date=lambda frame: frame['Corrected_Dates'])
    .drop(columns='Corrected_Dates')
)
df

In [None]:
# Three side-by-side scatter panels of Observed against each driver,
# all sharing one y-axis.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
left_ax, middle_ax, right_ax = axs
df.plot.scatter(x='Evapotranspiration', y='Observed', s=2, ax=left_ax)
df.plot.scatter(x='Precipitation', y='Observed', s=2, ax=middle_ax)
df.plot.scatter(x='Irrigation_pumping', y='Observed', s=2, ax=right_ax)

In [None]:
# Single-feature design matrix and target.
feature_cols = ['Precipitation']
X = df[feature_cols]
y = df.Observed

# LinearRegression is already imported in the notebook's import cell, so it is
# not re-imported here: instantiate, then fit.
lm = LinearRegression()
lm.fit(X, y)

# Fitted intercept, then the coefficient array (one entry per feature column).
print(lm.intercept_)
print(lm.coef_)

In [None]:
# Two-row frame holding the smallest and largest observed precipitation values,
# i.e. the end points needed to draw the fitted line.
precip_extremes = [df['Precipitation'].min(), df['Precipitation'].max()]
X_new = pd.DataFrame({'Precipitation': precip_extremes})
X_new

In [None]:
# Predictions at the two end points in X_new. `lm` must still be the fitted
# scikit-learn model here; later cells rebind the same name.
preds = lm.predict(X=X_new)
preds

In [None]:
# Scatter of the observations ...
ax = df.plot.scatter(x='Precipitation', y='Observed')

# ... with the least squares line drawn on the same axes.
ax.plot(X_new, preds, c='red', linewidth=2)

In [None]:
# Ordinary least squares of Observed on Precipitation through the statsmodels
# formula API (`smf` is imported in the notebook's import cell).
# NOTE: this rebinds `lm`, which earlier cells bound to a scikit-learn estimator.
lm = smf.ols(formula='Observed ~ Precipitation', data=df).fit()

# Fitted coefficients.
display(lm.params)

# Confidence intervals for the model coefficients.
display(lm.conf_int())

In [None]:
# R-squared reported by statsmodels for Observed ~ Precipitation.
lm = smf.ols(formula='Observed ~ Precipitation', data=df).fit()
display(lm.rsquared)

# R-squared reported by scikit-learn for the same single-feature model.
precip_features, observed_target = df[['Precipitation']], df.Observed
lm = LinearRegression().fit(precip_features, observed_target)
display(lm.score(precip_features, observed_target))

In [None]:
# Two-feature design matrix and target.
feature_cols = ['Precipitation', 'Irrigation_pumping']
X = df[feature_cols]
y = df.Observed

# LinearRegression is already imported in the notebook's import cell, so it is
# not re-imported here: instantiate, then fit.
lm = LinearRegression()
lm.fit(X, y)

# Fitted intercept, then each coefficient paired with its feature name.
display(lm.intercept_)
display(list(zip(feature_cols, lm.coef_)))


In [None]:
# Fit an OLS model with two predictors (Precipitation and Irrigation_pumping).
two_predictor_formula = 'Observed ~ Precipitation + Irrigation_pumping'
lm = smf.ols(formula=two_predictor_formula, data=df).fit()

# Coefficient estimates.
display(lm.params)

# Summary of the fitted model, left as the cell's output.
lm.summary()

## Charts and Correlation Matrix

In [None]:
# Per-segment means of every column.
grouped_data = df.groupby('Segment_id').mean()

# Grouped bar chart of the three series of interest, one cluster per segment.
bar_columns = ['Evapotranspiration', 'Precipitation', 'Observed']
ax = grouped_data[bar_columns].plot.bar()
ax.set_xlabel('Segment ID')
ax.set_ylabel('Mean Values')
ax.set_title('Mean Evapotranspiration, Precipitation, and Observed by Segment ID')
plt.show()

In [None]:
# Observed baseflow against evapotranspiration, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df["Evapotranspiration"], df["Observed"], s=3)
ax.set(
    xlabel="Evapotranspiration",
    ylabel="Observed baseflow",
    title="Observed baseflow vs. Evapotranspiration",
)
plt.show()

In [None]:
# Observed baseflow against irrigation pumping, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df["Irrigation_pumping"], df["Observed"], s=3)
ax.set(
    xlabel="Irrigation_pumping",
    ylabel="Observed baseflow",
    title="Observed baseflow vs. Irrigation_pumping",
)
plt.show()

In [None]:
# Observed baseflow against precipitation, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df["Precipitation"], df["Observed"], s=3)
ax.set(
    xlabel="Precipitation",
    ylabel="Observed baseflow",
    title="Observed baseflow vs. Precipitation",
)
plt.show()

In [None]:
# Observed baseflow along the converted Date axis, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df["Date"], df["Observed"], s=3)
ax.set(
    xlabel="Date",
    ylabel="Observed baseflow",
    title="Observed baseflow over time",
)
plt.show()

In [None]:
%matplotlib inline

data = pd.read_csv('RRCA_baseflow.csv')

data.isnull().sum()
data.describe()

X = data.drop(['Observed', 'Segment_id'], axis=1)
y = data['Observed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R-squared value: {r2}")
print(f"Root mean squared error: {rmse}")

In [None]:
# Held-out observations against the model's predictions for them.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, s=3)
ax.set_xlabel('Observed Baseflow')
ax.set_ylabel('Predicted Baseflow')
plt.show()

In [None]:
# Correlate numeric columns only. After the date conversion `df['Date']` is a
# datetime column, and how DataFrame.corr() treats non-numeric columns has
# changed between pandas versions, so the selection is made explicit.
corr_matrix = df.select_dtypes(include='number').corr()

ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
ax.set_title('Correlation matrix of numeric columns')
plt.show()

In [None]:
# Descriptive statistics for df, rendered with Jupyter's rich table display
# rather than a plain-text print of the frame.
summary_stats = df.describe()
display(summary_stats)

In [None]:
# Observed baseflow against precipitation (no title on this figure).
fig, ax = plt.subplots()
ax.scatter(df['Precipitation'], df['Observed'], s=3)
ax.set_xlabel('Precipitation')
ax.set_ylabel('Observed Baseflow')
plt.show()

## Linear Regression

In [None]:
# Three-predictor linear model scored on the same rows it was fitted on.
predictor_cols = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
X = df[predictor_cols]
y = df['Observed']
model = LinearRegression().fit(X, y)

print('R-squared:', model.score(X, y))

In [None]:
# Define the features and target variable
X = df[['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']]
y = df['Observed']

# Fit a linear regression model on all rows
regressor = LinearRegression()
regressor.fit(X, y)

# In-sample error: predictions are made on the same rows used for fitting
y_pred = regressor.predict(X)
mse = mean_squared_error(y, y_pred)

# 5-fold cross-validation. With scoring='neg_mean_squared_error' each score is
# the NEGATED mean squared error of one fold.
cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='neg_mean_squared_error')

# These two numbers summarise the per-fold MSE: its mean and its spread across
# folds. They are not a bias/variance decomposition of the model. The variable
# names are kept unchanged so any code reading them keeps working; only the
# printed labels are corrected.
bias = -np.mean(cv_scores)
variance = np.var(cv_scores)

print('Mean Squared Error:', mse)
print('Mean CV MSE:', bias)
print('Variance of CV MSE across folds:', variance)
print('Cross-Validation Scores:', cv_scores)