In [None]:
import datetime

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
def convert_date(df):
    """Add a 'Corrected_Dates' datetime column derived from the numeric 'Date' column.

    Each raw 'Date' value is treated as a day count: 693963 is subtracted and
    the remainder is interpreted as whole days elapsed since 1900-01-01, so a
    raw value of 693963 maps to 1900-01-01.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Date' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, now carrying the 'Corrected_Dates' column.

    Raises
    ------
    KeyError
        If `df` has no 'Date' column.
    """
    day_offset = 693963  # raw day number that corresponds to the origin date
    # The origin is built with pandas itself so the helper does not rely on the
    # standard-library `datetime` module being imported by the notebook.
    origin = pd.Timestamp(year=1900, month=1, day=1)
    df['Corrected_Dates'] = pd.to_datetime(df['Date'] - day_offset, unit='D', origin=origin)
    return df

In [None]:
# Load the baseflow table and replace the numeric 'Date' column with real
# datetimes.
df = pd.read_csv('RRCA_baseflow.csv')
# convert_date() is what creates 'Corrected_Dates'; without this call the
# assignment below fails with a KeyError on a fresh kernel.
df = convert_date(df)
df['Date'] = df['Corrected_Dates']
df = df.drop('Corrected_Dates', axis=1)
df

In [None]:
# Precipitation against date, one small marker per row of df.
ax = plt.gca()
ax.scatter(df['Date'], df['Precipitation'], s=5)
ax.set_title('Date vs Precipitation')
ax.set_xlabel('Date')
ax.set_ylabel('Precipitation')
plt.show()

In [None]:
# Evapotranspiration against date, one small marker per row of df.
_, ax = plt.subplots()
ax.scatter(df['Date'], df['Evapotranspiration'], s=5)
ax.set_title('Date vs Evapotranspiration')
# The x axis shows df['Date'], so it is labelled the same way as the other
# date plots in this notebook.
ax.set_xlabel('Date')
ax.set_ylabel('Evapotranspiration')
plt.show()

In [None]:
# Irrigation pumping against date, one small marker per row of df.
ax = plt.gca()
ax.scatter(df['Date'], df['Irrigation_pumping'], s=5)
ax.set(title='Date vs Irrigation pumping', xlabel='Date', ylabel='Irrigation pumping')
plt.show()

In [None]:
# Observed baseflow (y) against evapotranspiration (x), default marker size.
ax = plt.gca()
ax.scatter(df["Evapotranspiration"], df["Observed"])
ax.set_xlabel("Evapotranspiration")
ax.set_ylabel("Observed baseflow")
ax.set_title("Observed baseflow vs. evapotranspiration")
plt.show()

In [None]:
# Irrigation pumping (y) against evapotranspiration (x), small markers.
x_values = df['Evapotranspiration']
y_values = df['Irrigation_pumping']
ax = plt.gca()
ax.scatter(x_values, y_values, s=5)
ax.set(
    title='Evapotranspiration vs Irrigation pumping',
    xlabel='Evapotranspiration',
    ylabel='Irrigation pumping',
)
plt.show()

In [None]:
# Observed baseflow against date as a scatter, default marker size.
ax = plt.gca()
ax.scatter(df["Date"], df["Observed"])
ax.set(xlabel="Date", ylabel="Observed baseflow", title="Observed baseflow over time")
plt.show()

In [None]:
# Distribution of the observed baseflow values, split into 20 bins.
observed = df["Observed"]
ax = plt.gca()
ax.hist(observed, bins=20)
ax.set_xlabel("Observed baseflow")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of observed baseflow")
plt.show()

In [None]:
# One box per segment id showing the spread of observed baseflow.
#
# Boxes and tick labels are both taken from the same groupby iteration so they
# are guaranteed to line up. groupby yields keys in sorted order, whereas
# df["Segment_id"].unique() yields them in order of first appearance, so
# mixing the two can attach a label to the wrong box.
segment_ids = []
samples = []
for segment_id, observed in df.groupby("Segment_id")["Observed"]:
    segment_ids.append(segment_id)
    samples.append(observed.to_numpy())

_, ax = plt.subplots()
ax.boxplot(samples)
# boxplot places the boxes at positions 1..n; relabel those ticks in place.
ax.set_xticklabels(segment_ids)
ax.set_xlabel("Segment id")
ax.set_ylabel("Observed baseflow")
ax.set_title("Observed baseflow by segment id")
plt.show()

In [None]:
%matplotlib inline

df.isnull().sum()
df.describe()

X = df.drop(['Observed', 'Segment_id'], axis=1)
y = df['Observed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R-squared value: {r2}")
print(f"Root mean squared error: {rmse}")

In [None]:
# Held-out observed values (x) against the model's predictions (y).
ax = plt.gca()
ax.scatter(y_test, y_pred, s=3)
ax.set(xlabel='Observed Baseflow', ylabel='Predicted Baseflow')
plt.show()

In [None]:
# Pairwise correlation between the numeric columns of df.
# Non-numeric columns (for example a datetime 'Date') are dropped first:
# recent pandas versions no longer skip them silently inside corr() and raise
# instead.
corr_matrix = df.select_dtypes(include='number').corr()
print(corr_matrix)

In [None]:
# Annotated heatmap of the correlation matrix on a diverging colour map.
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Descriptive statistics of df as produced by DataFrame.describe(), kept in a
# variable and echoed as plain text.
summary_stats = pd.DataFrame.describe(df)
print(summary_stats)

In [None]:
# Per-column histograms (20 bins each) on a 10x10 inch figure.
df.hist(bins=20, figsize=(10, 10))
# Pairwise scatter plots on a 15x15 inch figure; being the last expression,
# the returned axes array is what the cell displays.
pd.plotting.scatter_matrix(frame=df, figsize=(15, 15))

In [None]:
# Interactive map with one marker per distinct (Segment_id, x, y) combination.
# NOTE(review): 'y' and 'x' are passed to folium as latitude and longitude;
# confirm the columns really hold degrees and not projected coordinates.
#
# The map object is not called `map`, so the builtin of that name stays usable.
# The centre is still the mean over all rows of df.
station_map = folium.Map(location=[df['y'].mean(), df['x'].mean()], zoom_start=10)

# Rows that repeat the same segment and coordinates would only stack identical
# markers on top of each other, so each combination is added once.
stations = df[['Segment_id', 'x', 'y']].drop_duplicates()
for lat, lon, name in zip(stations['y'], stations['x'], stations['Segment_id']):
    folium.Marker(location=[lat, lon], popup=str(name)).add_to(station_map)

# Map.save() writes an HTML document, so the file gets an .html extension;
# under a .pdf name the result cannot be opened by a PDF viewer.
station_map.save('stations_map.html')

In [None]:
# Observed baseflow against date drawn as a line, points joined in row order.
ax = plt.gca()
ax.plot(df['Date'], df['Observed'])
ax.set(xlabel='Date', ylabel='Observed Baseflow')
plt.show()

In [None]:
# Observed baseflow (y) against precipitation (x), default marker size.
ax = plt.gca()
ax.scatter(df['Precipitation'], df['Observed'])
ax.set_xlabel('Precipitation')
ax.set_ylabel('Observed Baseflow')
plt.show()

In [None]:
# Two derived columns are added to df in place.
# Water_Balance: precipitation minus evapotranspiration, observed baseflow and
# irrigation pumping, subtracted in that order.
water_balance = (
    df['Precipitation']
    - df['Evapotranspiration']
    - df['Observed']
    - df['Irrigation_pumping']
)
df['Water_Balance'] = water_balance

# Groundwater_Contribution: observed baseflow as a percentage of
# (observed baseflow + water balance).
denominator = df['Observed'] + df['Water_Balance']
df['Groundwater_Contribution'] = df['Observed'] / denominator * 100
df

In [None]:
# Linear model of observed baseflow on three explanatory columns.
feature_columns = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
X = df[feature_columns]
y = df['Observed']
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X, y)

# score() is evaluated on the same rows the model was fitted on.
print('R-squared:', model.score(X, y))