# This code will be used to create app.py for Flask APIs

### This API calculates the train and test F1 scores for the chosen model
### Four models are available: Random Forest, Gradient Boosting Classifier, K-Nearest Neighbours and Deep Neural Net

In [1]:
# import pandas to display the database as a dataframe
import pandas as pd
import numpy as np

# import os for filenames
import os
home = os.path.abspath("")
model_files = os.path.join(home, "model_files")
json_files = os.path.join(home, "json")

# import json to turn dictionaries to json files
import json

# import joblib to load models, scaler and imputer
from joblib import load

# For .load_model() of .h5 files and to .predict()
from tensorflow.keras.models import load_model

# In order to calculate F1 scores
from sklearn.metrics import classification_report

# sqlalchemy dependencies in order to access FAOSTAT database
# import sqlalchemy
from sqlalchemy import create_engine

# Password and user for AWS postgreSQL
from postgres import username, password

In [None]:
# Connect to the AWS RDS PostgreSQL instance; `username` and `password`
# come from the local `postgres` module imported above
rds_connection_string = f"postgresql://{username}:{password}@awsgreatlakes.cdb9inonioij.us-east-2.rds.amazonaws.com:5432/awsgreatlakes"

# Engine handed to pandas for the SQL queries in the following cells
engine = create_engine(rds_connection_string)

In [None]:
# List the tables available in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works on old and new versions.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Load the training rows, the test rows and the lake label table from the database
train_df = pd.read_sql_query('select * from train_lakes', con=engine)
test_df = pd.read_sql_query('select * from test_lakes', con=engine)
label_df = pd.read_sql_query('select * from encoded_lakes', con=engine)

In [None]:
train_df.head()

In [None]:
# Preview the feature columns without the "lake" target.
# `X_train = train_df` only creates an alias, so an in-place drop would also
# remove "lake" from train_df and break the next cell, which reads
# train_df["lake"]. drop() without inplace returns a new frame instead.
X_train = train_df.drop(columns=["lake"])
X_train.head()

In [None]:
# Split each table into labels (y) and features (X), both as pandas objects
# and as independent NumPy arrays
y_train, y_test = train_df["lake"].copy(), test_df["lake"].copy()
y_array_train, y_array_test = y_train.values.copy(), y_test.values.copy()

# drop() without inplace returns new frames, leaving train_df/test_df intact
X_train = train_df.drop(["lake"], axis=1)
X_test = test_df.drop(["lake"], axis=1)

X_array_train, X_array_test = X_train.values.copy(), X_test.values.copy()

X_array_train

In [None]:
y_array_train

In [None]:
# Load the saved min-max scaler from model_files
scaler_minmax = os.path.join(model_files, 'min_max_scaler.scaler')
scaler = load(scaler_minmax)

# Apply the same transform to the train and the test feature arrays
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_array_train, X_array_test)
)

In [None]:
def class_report(X_train, y_train, X_test, y_test, lake_labels_df, model):
    """Build train and test classification reports for a classifier.

    Parameters
    ----------
    X_train, X_test
        Feature sets handed to ``model.predict``.
    y_train, y_test
        True labels compared against the predictions.
    lake_labels_df
        DataFrame whose "lake" column supplies the ``target_names`` shown
        in the reports.
    model
        Object exposing ``predict``.

    Returns
    -------
    dict
        ``{"train": ..., "test": ...}`` where each value is the dict
        returned by ``classification_report(..., output_dict=True)``.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    target_names = lake_labels_df["lake"].tolist()

    def build_report(truth, predicted):
        # Same report settings for both splits
        return classification_report(
            truth, predicted, target_names=target_names, output_dict=True
        )

    return {
        "train": build_report(y_train, train_predictions),
        "test": build_report(y_test, test_predictions),
    }

In [None]:
def class_report_deep(X_train, y_train, X_test, y_test, lake_labels_df, model):
    """Build train and test classification reports for a model whose
    ``predict`` returns one score per class.

    Each prediction row is reduced to a class index with ``np.argmax`` over
    the last axis before it is compared with the true labels.

    Parameters
    ----------
    X_train, X_test
        Feature sets handed to ``model.predict``.
    y_train, y_test
        True labels compared against the arg-maxed predictions.
    lake_labels_df
        DataFrame whose "lake" column supplies the ``target_names`` shown
        in the reports.
    model
        Object exposing ``predict``.

    Returns
    -------
    dict
        ``{"train": ..., "test": ...}`` where each value is the dict
        returned by ``classification_report(..., output_dict=True)``.
    """
    train_scores = model.predict(X_train)
    test_scores = model.predict(X_test)

    # Highest-scoring class per row
    train_classes = np.argmax(train_scores, axis=-1)
    test_classes = np.argmax(test_scores, axis=-1)

    target_names = lake_labels_df["lake"].tolist()

    def build_report(truth, predicted):
        # Same report settings for both splits
        return classification_report(
            truth, predicted, target_names=target_names, output_dict=True
        )

    return {
        "train": build_report(y_train, train_classes),
        "test": build_report(y_test, test_classes),
    }

In [None]:
def f1_score(report):
    """Extract the rounded F1 scores from a train/test report pair.

    Parameters
    ----------
    report
        Dict with "train" and "test" entries, each mapping "erie", "huron",
        "ontario", "superior" and "weighted avg" to a dict that holds an
        "f1-score" value.

    Returns
    -------
    dict
        ``{name: {"train": f1, "test": f1}}`` for "erie", "huron",
        "ontario", "superior" and "weighted", each score rounded to
        5 decimal places.
    """
    # output key -> key used inside the classification report
    report_keys = {
        "erie": "erie",
        "huron": "huron",
        "ontario": "ontario",
        "superior": "superior",
        "weighted": "weighted avg",
    }

    def rounded_scores(split):
        return {
            name: round(report[split][key]["f1-score"], 5)
            for name, key in report_keys.items()
        }

    train_scores = rounded_scores("train")
    test_scores = rounded_scores("test")

    return {
        name: {"train": train_scores[name], "test": test_scores[name]}
        for name in report_keys
    }

In [None]:
# Pick the model to evaluate (leave exactly one line uncommented)
# chosen_model = "Random_Forest"
# chosen_model = "K_Nearest_Neighbours"
# chosen_model = "Gradient_Boosting_Classifier"
chosen_model = "Deep_Neural_Net"

# model name -> (saved model file, loader, report builder).
# The scikit-learn models are loaded with joblib and predict class labels;
# the Keras model is loaded with load_model and needs the arg-max variant.
model_registry = {
    "Random_Forest": ('rf_est-100.joblib', load, class_report),
    "K_Nearest_Neighbours": ('knn_k-9.joblib', load, class_report),
    "Gradient_Boosting_Classifier": ('Gradient_Boosted.joblib', load, class_report),
    "Deep_Neural_Net": ('deep_neural_7_hidden_1000_epoch.h5', load_model,
                        class_report_deep),
}

# Fail loudly on a typo: otherwise jsonify_dict would be undefined (or, in a
# notebook, silently keep the value of a previous run and be written to disk).
if chosen_model not in model_registry:
    raise ValueError(
        f"Unknown model {chosen_model!r}; expected one of {sorted(model_registry)}"
    )

model_file, model_loader, build_report = model_registry[chosen_model]

model_path = os.path.join(model_files, model_file)
model = model_loader(model_path)

report = build_report(X_train_scaled, y_array_train, X_test_scaled,
                      y_array_test, label_df, model)

jsonify_dict = f1_score(report)

print(jsonify_dict)
# from flask import jsonify
# jsonify(f1_dict)

In [None]:
# Write the F1 summary to models.json inside the json folder
model_json = os.path.join(json_files, "models.json")

# json.dump escapes non-ASCII characters by default, so the written text is plain ASCII
with open(model_json, 'w', encoding="latin-1") as outfile:
    json.dump(jsonify_dict, outfile)

### This API is simple: it returns the average of each parameter per lake, plus the overall average across all lakes

In [None]:
# Connect to the AWS RDS PostgreSQL instance; `username` and `password`
# come from the local `postgres` module imported at the top of the notebook
rds_connection_string = f"postgresql://{username}:{password}@awsgreatlakes.cdb9inonioij.us-east-2.rds.amazonaws.com:5432/awsgreatlakes"

# Engine handed to pandas for the SQL queries in the following cells
engine = create_engine(rds_connection_string)

In [None]:
# List the tables available in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works on old and new versions.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Load the whole `data` table and keep its column names for building the
# averaging queries below
data_df = pd.read_sql_query('select * from data', con=engine)
data_columns = data_df.columns

In [None]:
data_columns

In [None]:
data_columns[1:]

In [None]:
data_columns[0]

In [None]:
# Build one query that groups by the first column and averages every other column
group_column = data_columns[0]
avg_clauses = "".join(
    f", AVG({column}) AS {column}" for column in data_columns[1:]
)
SQL_string = f"SELECT {group_column}{avg_clauses} FROM data GROUP BY {group_column}"
SQL_string

In [None]:
# Run the grouped-average query built in the previous cell
mean_df = pd.read_sql_query(SQL_string, con=engine)
mean_df

In [None]:
# Index by lake so that to_dict(orient="index") yields {lake: {column: average}}
mean_df.set_index('lake', inplace=True)
mean_dict = mean_df.to_dict(orient="index")

In [None]:
# Build one query that averages every column except the first over the whole table
first_measure = data_columns[1]
select_parts = [f"AVG({first_measure}) AS {first_measure}"]
select_parts.extend(
    f"AVG({column}) AS {column}" for column in data_columns[2:]
)
SQL_string = "SELECT " + ", ".join(select_parts) + " FROM data"
SQL_string

In [None]:
# Run the whole-table average query built in the previous cell
total_mean_df = pd.read_sql_query(SQL_string, con=engine)
total_mean_df

In [None]:
# Label the single result row 'total' so it becomes the key after to_dict
total_mean_df.index = ['total']
total_mean_df

In [None]:
# Convert to {'total': {column: average}}
total_mean_dict = total_mean_df.to_dict(orient="index")

In [None]:
# Add the overall averages next to the per-lake averages
mean_dict['total'] = total_mean_dict['total']
# from flask import jsonify
# jsonify(mean_dict)

In [None]:
# Write the per-lake and total averages to mean.json inside the json folder
mean_json = os.path.join(json_files, "mean.json")

# json.dump escapes non-ASCII characters by default, so the written text is plain ASCII
with open(mean_json, 'w', encoding="latin-1") as outfile:
    json.dump(mean_dict, outfile)

### Create a static json with all the metadata for LEAFLET plotting

In [None]:
# Connect to the AWS RDS PostgreSQL instance; `username` and `password`
# come from the local `postgres` module imported at the top of the notebook
rds_connection_string = f"postgresql://{username}:{password}@awsgreatlakes.cdb9inonioij.us-east-2.rds.amazonaws.com:5432/awsgreatlakes"

# Engine handed to pandas for the SQL queries in the following cells
engine = create_engine(rds_connection_string)

In [None]:
# List the tables available in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works on old and new versions.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Columns needed for the map, and the SELECT statement that fetches them
data_columns = [
    'lake',
    'date_collect',
    'station_descr',
    'latitude',
    'longitude',
]
SQLstring = "SELECT " + ", ".join(data_columns) + " FROM metadata"
SQLstring

In [None]:
# Load the selected metadata columns and keep the resulting column names
metadata_df = pd.read_sql_query(SQLstring, con=engine)
metadata_columns = metadata_df.columns

In [None]:
metadata_columns

In [None]:
# Work on an independent copy holding only the five columns used for plotting
metadata_slice_df = metadata_df[['lake', 'date_collect', 'station_descr',\
                                 'latitude', 'longitude']].copy()
metadata_slice_df.head()

In [None]:
# Keep only rows where station, latitude and longitude are not the "-" placeholder
has_real_values = (
    (metadata_slice_df["station_descr"] != "-")
    & (metadata_slice_df["latitude"] != "-")
    & (metadata_slice_df["longitude"] != "-")
)
metadata_slice_df = metadata_slice_df[has_real_values]

In [None]:
# Remove every row that still has a missing value in any column
metadata_slice_df.dropna(inplace=True)

In [None]:
# Lowercase every unique station name and capitalise the first letter of each
# word, keeping the order returned by unique().
# Joining the capitalised words also copes with a blank station name, which
# made the earlier `split_text[0]` lookup raise IndexError.
redone_stations = [
    " ".join(word.capitalize() for word in station.lower().split())
    for station in metadata_slice_df["station_descr"].unique()
]

In [None]:
# Summarise every station: lake, number of rows, first and last collection
# date, and coordinates. One single-key dict {display name: summary} per station.
total_count = []
for index, x in enumerate(metadata_slice_df["station_descr"].unique()):

    # Filter once per station instead of once per field
    station_rows = metadata_slice_df.loc[metadata_slice_df['station_descr'] == x]

    # Lake and coordinates are taken from the station's first row
    first_row = station_rows.iloc[0]

    # Non-null count of the first column. `.iloc[0]` is explicit positional
    # access; plain `[0]` on a label-indexed Series is deprecated in pandas 2.x.
    count = station_rows.count().iloc[0]
    min_date = station_rows["date_collect"].min()
    max_date = station_rows["date_collect"].max()

    overall_dict = {
        # redone_stations is built from the same unique() order, so `index`
        # lines the display name up with the raw station name
        redone_stations[index]: {
            "lake": str(first_row["lake"]),
            "count": int(count),
            "date": {
                "min": {
                    "year": int(min_date.year),
                    "month": int(min_date.month),
                    "day": int(min_date.day),
                },
                "max": {
                    "year": int(max_date.year),
                    "month": int(max_date.month),
                    "day": int(max_date.day),
                },
            },
            "coord": {
                "latitude": float(first_row["latitude"]),
                "longitude": float(first_row["longitude"]),
            },
        }
    }

    total_count.append(overall_dict)

In [None]:
# Write the station summaries to metadata.json inside the json folder
metadata_json = os.path.join(json_files, "metadata.json")

# json.dump escapes non-ASCII characters by default, so the written text is plain ASCII
with open(metadata_json, 'w', encoding="latin-1") as outfile:
    json.dump(total_count, outfile)

### API for plotting: choose two variables and plot how their values change over time

In [2]:
# Connect to the AWS RDS PostgreSQL instance; `username` and `password`
# come from the local `postgres` module imported at the top of the notebook
rds_connection_string = f"postgresql://{username}:{password}@awsgreatlakes.cdb9inonioij.us-east-2.rds.amazonaws.com:5432/awsgreatlakes"

# Engine handed to pandas for the SQL queries in the following cells
engine = create_engine(rds_connection_string)

In [3]:
# List the tables available in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works on old and new versions.
from sqlalchemy import inspect

inspect(engine).get_table_names()

['master_data',
 'metadata',
 'data',
 'encoded_lakes',
 'train_lakes',
 'test_lakes']

In [None]:
# Load the whole master_data table and keep its column names
data_df = pd.read_sql_query('select * from master_data', con=engine)
data_columns = data_df.columns

In [None]:
data_df["lake"].unique()

In [4]:
# Column names written out by hand as a plain list; this rebinds data_columns,
# replacing the value taken from data_df.columns in the earlier cell
data_columns = ['lake', 'water_body', 'date_collect', 'station_num', 'sample_num',
       'station_descr', 'latitude', 'longitude', 'conductivity', 'hardness',
       'turbidity', 'chlorophyll', 'ammonia', 'nitrate_ite', 'aluminum',
       'barium', 'calcium', 'carbon', 'chloride', 'chromium', 'copper',
       'magnesium', 'manganese', 'mercury', 'molybdenum', 'phosphorus',
       'potassium', 'silicon', 'sodium', 'strontium', 'sulphate', 'vanadium',
       'zinc']
len(data_columns)

33

In [5]:
# Skip the first eight identifying columns ('lake' ... 'longitude'); what
# remains are the 25 measured parameters, 'conductivity' through 'zinc'
possible_columns = data_columns[8:]
possible_columns


['conductivity',
 'hardness',
 'turbidity',
 'chlorophyll',
 'ammonia',
 'nitrate_ite',
 'aluminum',
 'barium',
 'calcium',
 'carbon',
 'chloride',
 'chromium',
 'copper',
 'magnesium',
 'manganese',
 'mercury',
 'molybdenum',
 'phosphorus',
 'potassium',
 'silicon',
 'sodium',
 'strontium',
 'sulphate',
 'vanadium',
 'zinc']

In [None]:
# The two parameters to plot against time
first_value = "conductivity"
second_value = "chlorophyll"

# `columns_list` is used here and in the following cells but was never
# defined in this notebook (NameError). The selectable parameters are the
# measured columns gathered in possible_columns.
columns_list = list(possible_columns)

# Positions of the chosen parameters inside columns_list
x = columns_list.index(first_value)
y = columns_list.index(second_value)

In [None]:
# Yearly average of the two chosen parameters for every lake
date_column = data_columns[2]
first_column, second_column = columns_list[x], columns_list[y]
SQL_string = (
    f"SELECT EXTRACT(YEAR FROM {date_column}) AS YYYY, lake"
    f", AVG({first_column}) AS {first_column}"
    f", AVG({second_column}) AS {second_column}"
    " FROM master_data GROUP BY YYYY, lake"
    " ORDER BY lake, YYYY"
)
SQL_string

In [None]:
# Run the yearly-average query and discard rows that contain missing values
visualize_df = pd.read_sql_query(SQL_string, con=engine)
visualize_df.dropna(inplace=True)
visualize_df.head()

In [None]:
visualize_df["lake"].unique()

In [None]:
# One single-key dict per lake: {lake: {"year": [...], <param 1>: [...], <param 2>: [...]}}
master_list = []
first_name, second_name = columns_list[x], columns_list[y]

for z in visualize_df["lake"].unique():

    # Rows belonging to this lake, selected once and reused for every series
    lake_rows = visualize_df.loc[visualize_df['lake'] == z]

    master_list.append({
        z: {
            "year": lake_rows["yyyy"].tolist(),
            first_name: lake_rows[first_name].tolist(),
            second_name: lake_rows[second_name].tolist(),
        }
    })
master_list

In [None]:
# Write the per-lake time series to visualization.json inside the json folder
visualization_json = os.path.join(json_files, "visualization.json")

# json.dump escapes non-ASCII characters by default, so the written text is plain ASCII
with open(visualization_json, 'w', encoding="latin-1") as outfile:
    json.dump(master_list, outfile)

### There are 25 potential variables for which a user can supply an input value
### Create an API that accepts them

In [None]:
# The 25 parameter names, 'conductivity' through 'zinc', written out by hand
columns_possible = ['conductivity', 'hardness',
       'turbidity', 'chlorophyll', 'ammonia', 'nitrate_ite', 'aluminum',
       'barium', 'calcium', 'carbon', 'chloride', 'chromium', 'copper',
       'magnesium', 'manganese', 'mercury', 'molybdenum', 'phosphorus',
       'potassium', 'silicon', 'sodium', 'strontium', 'sulphate', 'vanadium',
       'zinc']

In [None]:
from flask import Flask, request, jsonify #import main Flask class and request object
from flask_cors import CORS

app = Flask(__name__) #create the Flask app

# ensure that flask server enables CORS
CORS(app)

@app.route('/')
def query_example():
    """Pass the query-string parameters to ``etl_flask.etl_user_dynamic``
    and return its result as JSON.

    Query parameters
    ----------------
    model
        Forwarded as the first argument and echoed back under "model".
    lake
        Not forwarded; echoed back under "user_lake".
    con01, har02, ..., zin25
        The 25 parameter values, forwarded positionally in that order.
        A parameter that is absent is forwarded as ``None``.

    Returns
    -------
    flask.Response
        ``jsonify`` of the dict returned by ``etl_user_dynamic`` with the
        "model" and "user_lake" entries added.
    """
    # One query-string key per value, in the positional order of the call below
    param_keys = (
        'con01', 'har02', 'tur03', 'chl04', 'amm05',
        'nit06', 'alu07', 'bar08', 'cal09', 'car10',
        'chl11', 'chr12', 'cop13', 'mag14', 'man15',
        'mer16', 'mol17', 'pho18', 'pot19', 'sil20',
        'sod21', 'str22', 'sul23', 'van24', 'zin25',
    )
    # The first two values used to be read from 'con1' and 'har2', which do
    # not follow the <abbr><NN> naming of the other 23 keys. The consistent
    # names are now accepted; the old ones remain as a fallback so existing
    # callers keep working.
    legacy_keys = {'con01': 'con1', 'har02': 'har2'}

    model = request.args.get('model')
    lake = request.args.get('lake')

    values = []
    for key in param_keys:
        value = request.args.get(key)
        if value is None and key in legacy_keys:
            value = request.args.get(legacy_keys[key])
        values.append(value)

    # NOTE(review): `etl_flask` is not imported anywhere in this file; it has
    # to be imported before this route can run.
    data = etl_flask.etl_user_dynamic(model, *values)
    data["model"] = model
    data["user_lake"] = lake

    return jsonify(data)

In [11]:
# Example input: 5 missing values, 5 non-numeric strings and 15 numbers
list_ex = [None] * 5 + ["hi"] * 5 + [1] * 15
list_ex

[None,
 None,
 None,
 None,
 None,
 'hi',
 'hi',
 'hi',
 'hi',
 'hi',
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [14]:
# Convert every raw value to float; values that cannot be converted become
# None so the imputer can fill them in afterwards.
new_param_list = []
for x in list_ex:
    try:
        new_param_list.append(float(x))
    except (TypeError, ValueError):
        # TypeError: the value is None; ValueError: non-numeric text.
        # Catching only these keeps unrelated errors (e.g. KeyboardInterrupt)
        # from being swallowed as a bare `except:` would.
        new_param_list.append(None)
new_param_list

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [15]:
# Turn the 25 values into a single-row, 25-column array
list_array = np.array(new_param_list)
list_reshape = np.reshape(list_array, (1, 25))
list_reshape.shape

(1, 25)

In [16]:
# Load the saved imputer from the model_files folder
imputer_simple = os.path.join(home, "model_files", 'SimpleImputer.imputer')

imp = load(imputer_simple)

# Replace the missing entries of the single input row
X_finite = imp.transform(list_reshape)
X_finite

array([[2.63099277e+02, 9.79347905e+07, 2.68536344e+00, 2.75335819e+03,
        2.40971108e+04, 5.17926216e+05, 2.31998158e+04, 1.67195036e+04,
        2.74723253e+07, 2.24293070e+07, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00]])

In [17]:
# Pick the model used for the prediction (leave exactly one line uncommented)
chosen_model = "Random_Forest"
# chosen_model = "K_Nearest_Neighbours"
# chosen_model = "Gradient_Boosting_Classifier"
# chosen_model = "Deep_Neural_Net"

# model name -> (saved model file, loader, predict() returns per-class scores)
prediction_models = {
    "Random_Forest": ('rf_est-100.joblib', load, False),
    "K_Nearest_Neighbours": ('knn_k-9.joblib', load, False),
    "Gradient_Boosting_Classifier": ('Gradient_Boosted.joblib', load, False),
    "Deep_Neural_Net": ('deep_neural_7_hidden_1000_epoch.h5', load_model, True),
}

# Fail loudly on a typo: otherwise `prediction` would be undefined (or, in a
# notebook, silently keep the value of a previous run).
if chosen_model not in prediction_models:
    raise ValueError(
        f"Unknown model {chosen_model!r}; expected one of {sorted(prediction_models)}"
    )

model_file, model_loader, returns_scores = prediction_models[chosen_model]

model_path = os.path.join(model_files, model_file)
model = model_loader(model_path)

# The evaluation cells feed every one of these models min-max scaled
# features, so the imputed input gets the same scaling before predicting.
# The scaler is loaded here so this cell does not depend on the evaluation
# cells having been run.
scaler = load(os.path.join(model_files, 'min_max_scaler.scaler'))
X_model_input = scaler.transform(X_finite)

prediction = model.predict(X_model_input)

if returns_scores:
    # Reduce the per-class scores to the index of the best class
    prediction = np.argmax(prediction, axis=-1)

print(prediction)

[0]


In [None]:
# Predicted class index -> lake name
lake_by_class = {
    0: "erie",
    1: "huron",
    2: "ontario",
    3: "superior",
}

predicted_class = prediction[0]

# An index outside 0-3 used to leave lake_predict undefined (NameError on the
# last line, or a stale value from an earlier run); report it explicitly.
if predicted_class not in lake_by_class:
    raise ValueError(f"Unexpected class index {predicted_class!r}")

lake_predict = lake_by_class[predicted_class]
lake_predict