In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import folium
import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.formula.api as smf

In [None]:
def convert_date(df):
    """Add a ``Corrected_Dates`` datetime column computed from ``Date``.

    Every ``Date`` value has 693963 subtracted from it; the remainder is
    interpreted as a count of days measured from 1900-01-01.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place), now
        carrying the extra ``Corrected_Dates`` column.
    """
    epoch = datetime.datetime(1900, 1, 1)
    day_offsets = df['Date'] - 693963
    df['Corrected_Dates'] = pd.to_datetime(day_offsets, unit='D', origin=epoch)
    return df

In [None]:
# Load the baseflow CSV, compute the datetime version of 'Date', then replace
# the numeric 'Date' column with it and discard the helper column.
df = convert_date(pd.read_csv('RRCA_baseflow.csv'))
df = df.assign(Date=df['Corrected_Dates']).drop(columns='Corrected_Dates')
df

## Class code

In [None]:
# One scatter panel per candidate predictor, all sharing the 'Observed' y-axis.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
for ax, col in zip(axs, ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']):
    df.plot.scatter(x=col, y='Observed', ax=ax, s=2)
plt.savefig('scatter-plot.pdf')

In [None]:
# Simple linear regression of 'Observed' on 'Evapotranspiration'.
feature_cols = ['Evapotranspiration']
X, y = df[feature_cols], df['Observed']

lm = LinearRegression().fit(X, y)

# Fitted intercept, then the coefficient array (one entry per feature).
print(lm.intercept_)
print(lm.coef_)

In [None]:
# Predict at the smallest and largest evapotranspiration values in the data;
# two points are enough to draw the fitted straight line.
et = df['Evapotranspiration']
X_new = pd.DataFrame({'Evapotranspiration': [et.min(), et.max()]})
preds = lm.predict(X_new)

In [None]:
# Scatter of the raw data with the fitted regression line drawn on top.
df.plot.scatter(x='Evapotranspiration', y='Observed', s=2)
plt.plot(X_new, preds, color='red', lw=2)
plt.savefig('Evapotranspiration-scatter.pdf')

In [None]:
# R-squared of Observed ~ Precipitation using the statsmodels formula API.
lm = smf.ols('Observed ~ Precipitation', data=df).fit()
display(lm.rsquared)

# R-squared for the same single-predictor regression using scikit-learn.
precip, observed = df[['Precipitation']], df['Observed']
lm = LinearRegression().fit(precip, observed)
display(lm.score(precip, observed))

In [None]:
# Two-predictor OLS: show the fitted parameters, then the full summary table
# as the cell's output.
two_predictor_formula = 'Observed ~ Precipitation + Irrigation_pumping'
lm = smf.ols(two_predictor_formula, data=df).fit()
display(lm.params)
lm.summary()

## Charts and Correlation Matrix

In [None]:
# Mean of every column within each Segment_id group.
grouped_data = df.groupby('Segment_id').mean()

grouped_data[['Evapotranspiration', 'Precipitation', 'Observed']].plot.bar()
plt.xlabel('Segment ID')
plt.ylabel('Mean Values')
plt.title('Mean Evapotranspiration, Precipitation, and Observed by Segment ID')
# Save BEFORE showing: plt.show() renders and then releases the current
# figure, so a savefig() issued afterwards would write an empty page.
plt.savefig('group-segmentid.pdf')
plt.show()

In [None]:
# Observed baseflow plotted against the converted dates.
plt.scatter(df['Date'], df['Observed'], s=3)
plt.gca().set(
    xlabel="Date",
    ylabel="Observed baseflow",
    title="Observed baseflow over time",
)
plt.show()

In [None]:
# Restrict the correlation to numeric columns explicitly. 'Date' holds
# datetimes after the load step, and DataFrame.corr() stopped dropping
# non-numeric columns implicitly in pandas 2.0, so selecting the numeric
# columns here keeps the result the same across pandas versions.
corr_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Descriptive statistics for the columns of df, as produced by
# DataFrame.describe(), printed as plain text.
summary_stats = df.describe()
print(summary_stats)

## Linear Regression

In [None]:
%matplotlib inline

# Fresh read of the CSV; no date conversion is applied to this frame.
data = pd.read_csv('RRCA_baseflow.csv')

# Target is 'Observed'; predictors are every remaining column except
# 'Evapotranspiration' and 'Irrigation_pumping'.
y = data['Observed']
X = data.drop(columns=['Observed', 'Evapotranspiration', 'Irrigation_pumping'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Metrics on the held-out 20% split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"R-squared value: {r2}")
print(f"Root mean squared error: {rmse}")

### Remove any Observed value above 200

In [None]:
# Split `data` on an Observed threshold of 200. The boolean masks are built
# from `data` itself, the frame being filtered, so the selection never relies
# on a different frame happening to share the same row index.
outliers = data.loc[data['Observed'] > 200]
noOutliers = data.loc[data['Observed'] <= 200]
noOutliers

In [None]:
# Same three-panel scatter as before, restricted to rows with Observed <= 200.
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))
for ax, col in zip(axs, ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']):
    noOutliers.plot.scatter(x=col, y='Observed', ax=ax, s=2)
plt.savefig('nooutlier-scatter-plot.pdf')

In [None]:
# Refit Observed ~ Evapotranspiration on the outlier-free rows.
feature_cols = ['Evapotranspiration']
X, y = noOutliers[feature_cols], noOutliers['Observed']

lm = LinearRegression().fit(X, y)

# Two endpoints (min and max of the predictor) define the fitted line.
et = noOutliers['Evapotranspiration']
X_new = pd.DataFrame({'Evapotranspiration': [et.min(), et.max()]})
preds = lm.predict(X_new)

# Scatter of the filtered data with the fitted line on top.
noOutliers.plot.scatter(x='Evapotranspiration', y='Observed', s=2)
plt.plot(X_new, preds, color='red', lw=2)
plt.savefig('nooutlier-Evapotranspiration.pdf')

In [None]:
# Three-predictor linear regression on the outlier-free rows; the score is
# computed on the same rows the model was fitted on.
y = noOutliers['Observed']
X = noOutliers[['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']]
model = LinearRegression().fit(X, y)

print('R-squared:', model.score(X, y))

In [None]:
%matplotlib inline

# Target is 'Observed'; predictors are every remaining column of the
# outlier-free frame except 'Evapotranspiration' and 'Irrigation_pumping'.
y = noOutliers['Observed']
X = noOutliers.drop(columns=['Observed', 'Evapotranspiration', 'Irrigation_pumping'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Metrics on the held-out 20% split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"R-squared value: {r2}")
print(f"Root mean squared error: {rmse}")