# Final Project Title

# Introduction

Proposal:

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas vel mi dui. Aliquam erat volutpat. Ut imperdiet sapien eget molestie pellentesque. Nulla ullamcorper iaculis nulla, vel maximus dolor interdum sit amet. Maecenas malesuada fermentum neque, ut rhoncus diam rhoncus ac. Etiam erat nisi, blandit porttitor consequat a, tincidunt eu ligula. Cras ante felis, tincidunt a pulvinar et, pretium non felis. Sed in ligula feugiat, consequat tellus non, sodales nisi. Phasellus ex urna, faucibus non mattis vitae, sagittis sollicitudin magna. Etiam ut mauris diam. Nam laoreet ante leo, sit amet dapibus sapien tincidunt ac.

Data:

[LA County Boundary](https://egis-lacounty.hub.arcgis.com/datasets/county-boundaries/explore?location=34.439893%2C-118.182786%2C8.50)

# Methods

## Preprocessing

### Import Packages

In [1]:
import pandas as pd
import numpy as np
from datetime import date
import scipy.stats as stats

### Create Functions

In [2]:
def create_WID(df, col_name):
    """Add a well ID (WID) column to ``df`` and return the same frame.

    Each ID is the row's ``GLOBAL_ID`` joined to its ``col_name`` value with a
    hyphen. ``df`` is modified in place.
    """
    global_ids = df['GLOBAL_ID']
    suffixes = df[col_name]
    df['WID'] = global_ids + '-' + suffixes
    return df


def get_date(row):
    """Build a ``YYYY-MM-DD`` style date string for weather data.

    ``row['DATE']`` is converted with ``str`` and split by position: the first
    four characters, the next two, and whatever remains.
    """
    raw = str(row['DATE'])
    pieces = (raw[:4], raw[4:6], raw[6:])
    return '-'.join(pieces)

## Exploratory Data Analysis

### Import Packages and Processed Data 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns

In [None]:
# Forward slashes keep the relative path valid on every OS; a backslash is
# only treated as a path separator on Windows.
data = pd.read_csv('data/full.csv')

# Plain column assignment replaces the column, so it really becomes datetime64.
# `data.loc[:, col] = ...` writes into the existing object column under
# pandas >= 2.0 and leaves its dtype unchanged.
data['GW_MEAS_DATE'] = pd.to_datetime(data['GW_MEAS_DATE'])

In [None]:
# Keep only measurements dated after 2010-01-01 so the frame stays small.
after_cutoff = data['GW_MEAS_DATE'] > '2010-01-01'
data = data.loc[after_cutoff].reset_index(drop=True)

# The models need a numeric date feature, so convert the column with
# pd.to_numeric.
numeric_dates = pd.to_numeric(data['GW_MEAS_DATE'])
data['GW_MEAS_DATE'] = numeric_dates

In [None]:
# Forward slashes keep the shapefile path portable; a backslash is only a
# path separator on Windows.
bounds = gpd.read_file('data/County_Boundaries/County_Boundaries.shp')

In [None]:
# Show only the boundary rows whose TYPE is 'LA County'.
is_la_county = bounds['TYPE'] == 'LA County'
bounds.loc[is_la_county]

### Data Visualization

In [None]:
# Average every numeric column per measurement date. numeric_only=True skips
# text columns, which make mean() raise TypeError in pandas >= 2.0.
by_date = data.groupby('GW_MEAS_DATE').mean(numeric_only=True).reset_index()

# GW_MEAS_DATE is converted with pd.to_numeric earlier in this section, so turn
# it back into timestamps; otherwise the axis labelled 'Year' shows raw integers.
measured_on = pd.to_datetime(by_date['GW_MEAS_DATE'])

fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
ax.plot(measured_on, by_date['DEPTH'])
ax.invert_yaxis()  # larger depth values are drawn lower on the figure
ax.set_xlabel('Year')
ax.set_ylabel('Depth')
ax.set_title('Depth by Date')
plt.show()

In [None]:
# One row per well with the mean of each numeric column. numeric_only=True
# skips text columns, which make mean() raise TypeError in pandas >= 2.0.
wells = data.groupby('WID').mean(numeric_only=True).reset_index()

fig, ax = plt.subplots(figsize=(12, 12), dpi=100)
points = ax.scatter(
    wells['LONGITUDE'],
    wells['LATITUDE'],
    c=wells['DEPTH'],
    cmap='viridis',
    marker='o',
    s=1,
)
# The colours encode DEPTH, so a colorbar is needed to read them.
fig.colorbar(points, ax=ax, label='Mean depth')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Mean Depth by Well')
ax.grid()

plt.show()

In [None]:
ces

In [None]:
plt.scatter(wells['LONGITUDE'], wells['LATITUDE'])
ces.plot()
plt.show()


In [None]:
import geopandas as gpd

ces = gpd.read_file(r'data\calenviroscreen40shpf2021shp\CES4 Final Shapefile.shp')

In [None]:
# Keep only the rows whose County is 'Los Angeles'. reset_index() without
# drop=True also stores the previous index in an 'index' column.
in_los_angeles = ces['County'] == 'Los Angeles'
la = ces.loc[in_los_angeles].reset_index()

In [None]:
# Merge all Los Angeles geometries into a single shape. GeoPandas 1.0
# deprecated `unary_union` in favour of `union_all()`, so use the new method
# when the installed version provides it and fall back otherwise.
union = la.union_all() if hasattr(la, 'union_all') else la.unary_union

In [None]:
# Check which geometry class the union produced.
type(union)

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

# `la[0]` selects a column labelled 0 rather than a geometry, so it cannot
# supply an `.exterior`.
# NOTE(review): assuming the goal is the outline of the merged shape in
# `union` - confirm. GeoSeries.boundary works for both Polygon and
# MultiPolygon, whereas `.exterior` exists only on a single Polygon.
gpd.GeoSeries([union], crs=la.crs).boundary.plot(ax=ax, color='black')
plt.show()

## Modeling

### Random Forest

#### Packages

In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics
import sklearn.model_selection as ms

#### Functions

In [18]:
def rfr_optimize(x, y, hyper_grid, test_size=0.2, random_state=None):
    '''
    Grid-search Random Forest Regressor hyperparameters on a training split.

    Input:
        x: independent variables
        y: dependent variable
        hyper_grid: a dictionary of hyperparameters and ranges to test for optimization
        test_size: fraction of rows left out of the search (default 0.2)
        random_state: seed for the train/test split and the forest. None (the
            default) leaves both unseeded; pass an int for repeatable results.
    Output:
        The fitted GridSearchCV object (e.g. read best_params_ from it).
    '''

    # Only the training portion is searched; the held-out rows are not used
    # in this function, so they are discarded instead of being bound to names.
    x_train, _, y_train, _ = ms.train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Base model whose hyperparameters are varied by the grid search.
    rfr = RandomForestRegressor(
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
        )

    # 5-fold cross-validated search scored by negative mean squared error.
    grid = ms.GridSearchCV(
        rfr,
        hyper_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
        )

    # Perform grid search for optimal hyperparameter fit.
    grid.fit(x_train, y_train)

    return grid

In [19]:
def test_model(x, y, grid, test_size=0.2, random_state=None):
    '''
    Refit a Random Forest Regressor with the grid search's best hyperparameters
    on a new train/test split and print its scores on the test rows.

    Input:
        x: independent variables
        y: dependent variable
        grid: fitted grid search object exposing best_params_
        test_size: fraction of rows held out for scoring (default 0.2)
        random_state: seed for the split and the forest. None (the default)
            leaves both unseeded; pass an int for repeatable scores.
    Output:
        None. The best parameters, explained variance and RMSE are printed.
    '''

    x_train, x_test, y_train, y_test = ms.train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Merge into one dict so a tuned parameter that matches one of these
    # defaults overrides it instead of raising a duplicate-keyword TypeError.
    params = {'n_jobs': -1, 'random_state': random_state, **grid.best_params_}
    rfr = RandomForestRegressor(**params)

    rfr.fit(x_train, y_train)

    y_pred = rfr.predict(x_test)

    print('Random Forest Regressor: ')
    print(grid.best_params_, '\n')

    evs = metrics.explained_variance_score(y_test, y_pred)
    print(f'Explained Variance: {evs}')

    # RMSE is the square root of the mean squared error.
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f'Root Mean Squared Error: {rmse}')
    

#### Load Data

In [4]:
# Forward slashes keep the relative path valid on every OS; a backslash is
# only treated as a path separator on Windows.
data = pd.read_csv('data/full.csv')

# Parse the dates before filtering so the cutoff below is a real date
# comparison (as in the exploratory section) rather than a string comparison,
# which is only correct when the text happens to sort like dates.
data['GW_MEAS_DATE'] = pd.to_datetime(data['GW_MEAS_DATE'])

# Trim data by date to reduce size.
data = data[data['GW_MEAS_DATE'] > '2010-01-01'].reset_index(drop=True)

# Keep only non-negative water elevations (rows equal to 0 are kept).
data = data[data['WATER_ELEVATION'] >= 0].reset_index(drop=True)

# Date to numeric for modeling.
data['GW_MEAS_DATE'] = pd.to_numeric(data['GW_MEAS_DATE'])

# Define x and y columns.
x_cols = ['GW_MEAS_DATE','LATITUDE','LONGITUDE', 'PRCP', 'TMAX', 'TMIN', 'ELEVATION']
y_cols = ['DEPTH', 'WATER_ELEVATION']


### Water Elevation

In [5]:
# Split dataframe into x and y.
x = data[x_cols]
y = data[y_cols[1]]

#### Single Hyperparameter Optimization

##### bootstrap

In [None]:
# Real booleans are required here: the strings 'True' and 'False' are both
# truthy, so they never switch bootstrapping off, and recent scikit-learn
# versions reject them during parameter validation.
bootstrap = [True, False]

hyper_grid = {
    'bootstrap': bootstrap,
    }

grid = rfr_optimize(x,y,hyper_grid)

In [None]:
# Score a forest refit with the best parameters found by the search above.
test_model(x=x, y=y, grid=grid)

##### max_depth

In [17]:
# Candidate tree depths: 20, 30, ..., 100.
max_depth = [depth for depth in range(20, 110, 10)]
hyper_grid = dict(max_depth=max_depth)

grid = rfr_optimize(x, y, hyper_grid)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.8s finished


In [20]:
# Score a forest refit with the best max_depth found by the search above.
test_model(x=x, y=y, grid=grid)

Random Forest Regressor: 
{'max_depth': 100} 

Explained Variance: 0.9999169531468335
Root Mean Squared Error: 2.6547345150880086


##### n_estimators

In [6]:
# Candidate forest sizes: 40, 50, ..., 100 trees.
n_estimators = [count for count in range(40, 110, 10)]
hyper_grid = dict(n_estimators=n_estimators)

grid = rfr_optimize(x, y, hyper_grid)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    7.9s finished


In [9]:
# test_model is defined as test_model(x, y, grid); calling it with only the
# grid raises TypeError, so pass the features and target as well.
test_model(x, y, grid)

Random Forest Regressor: 
{'n_estimators': 90} 

Explained Variance: 0.9999005453108044
Root Mean Squared Error: 2.8959455058197596


#### Multiple Hyperparameter Optimization

In [None]:
# Search both hyperparameters together over a narrow window:
# n_estimators in {94, 95} and max_depth in {56, 57}.
n_estimators = [94, 95]
max_depth = [56, 57]
hyper_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

grid = rfr_optimize(x, y, hyper_grid)

### Depth

In [9]:
# Split dataframe into x and y.
x = data[x_cols]
y = data[y_cols[0]]

In [12]:
# Candidate tree depths: 50, 60, 70, 80, 90. The earlier list that started
# with None was overwritten before use, so only this assignment is kept.
max_depth = list(range(50, 100, 10))

hyper_grid = {
    'max_depth': max_depth,
    }

grid = rfr_optimize(x,y,hyper_grid)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.4s


{'max_depth': 60}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.5s finished


In [13]:
# test_model is defined as test_model(x, y, grid); calling it with only the
# grid raises TypeError, so pass the features and target as well.
test_model(x, y, grid)

Random Forest Regressor: 
{'max_depth': 60} 

Explained Variance: 0.9982556555694703
Root Mean Squared Error: 2.1915102656718815
