# Create several tables in a PostgreSQL database that will back the Flask API endpoints used for visualization and for model predictions

In [None]:
# Dependencies
import os

import numpy as np
import pandas as pd

# Show every column and every row when a DataFrame is displayed
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

# Resolve the working directory and the clean_data folder beneath it
home = os.path.abspath("")
clean_data = os.path.join(home, "clean_data")
print(home, clean_data, sep="\n")

In [None]:
# Read clean_data/total_ML_data.csv into a pandas DataFrame
path = os.path.join(clean_data, "total_ML_data.csv")

df = pd.read_csv(path)
# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

In [None]:
# List every column name loaded from the CSV
# NOTE(review): the original note says there are 43 columns — confirm against df.shape
df.columns

In [None]:
# Data type pandas inferred for each column
df.dtypes

In [None]:
# Measurement columns averaged over every sample, regardless of lake
feature_columns = [
    'conductivity', 'hardness', 'turbidity', 'chlorophyll',
    'ammonia', 'nitrate_ite', 'aluminum', 'barium', 'calcium', 'carbon',
    'chloride', 'chromium', 'copper', 'magnesium', 'manganese', 'mercury',
    'molybdenum', 'phosphorus', 'potassium', 'silicon', 'sodium',
    'strontium', 'sulphate', 'vanadium', 'zinc',
]
df_mean = df[feature_columns].mean()

# Prepend a "lake" entry labelled "Total" so the overall means carry the same
# keys as a per-lake row. pd.concat is used because Series.append was removed
# in pandas 2.0.
pd_series = pd.Series(["Total"], index=["lake"])
total = pd.concat([pd_series, df_mean])
total

In [None]:
# Average every column within each lake; "lake" becomes the index of the result
lake_groups = df.groupby("lake")
df_groupby = lake_groups.mean().copy()

In [None]:
# Move "lake" from the index back into an ordinary column
df_groupby = df_groupby.reset_index()
df_groupby

In [None]:
# Add the overall "Total" means as one extra row beneath the per-lake means.
# DataFrame.append was removed in pandas 2.0, so the series is turned into a
# one-row frame and concatenated instead. infer_objects() restores numeric
# dtypes, because the mixed str/float series is stored as object.
total_row = total.to_frame().T.infer_objects()
df_total = pd.concat([df_groupby, total_row], ignore_index=True)
df_total

In [None]:
# Check the column dtypes of the combined per-lake + "Total" table
df_total.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Distinct lake names found in the data
y = df["lake"].unique().copy()

# Learn the name -> integer mapping and apply it in a single call
label_encoder = LabelEncoder()
encode_y = label_encoder.fit_transform(y)

encode_y

In [None]:
# Lookup table pairing each lake name with its integer code
dic_df = dict(lake=y, lake_encode=encode_y)
df2 = pd.DataFrame(data=dic_df)
df2

In [None]:
# Check the column dtypes of the lake-name / lake-code lookup table
df2.dtypes

# Connect to AWS Database

In [None]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine
from config.postgres import username, password

database_name = "Great_Lake_ML"

# Percent-encode the credentials: characters such as "@", ":" or "/" in a raw
# password would otherwise be parsed as URL delimiters and corrupt the URL.
# NOTE(review): the host is localhost although this section refers to AWS —
# presumably reached through a tunnel or a local copy; confirm.
rds_connection_string = (
    f"postgresql://{quote_plus(username)}:{quote_plus(password)}"
    f"@localhost:5432/{database_name}"
)

engine = create_engine(rds_connection_string)

In [None]:
# get table names
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is its replacement.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Query the whole 'encoded_lakes' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from encoded_lakes', con=engine)

In [None]:
# Write the lookup DataFrame to the PostgreSQL 'encoded_lakes' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
df2.to_sql(name='encoded_lakes', con=engine, if_exists='append', index=False)

In [None]:
# Query the whole 'lake_means' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from lake_means', con=engine)

In [None]:
# Write the per-lake and overall means to the PostgreSQL 'lake_means' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
df_total.to_sql(name='lake_means', con=engine, if_exists='append', index=False)

In [None]:
# Load clean_data/total.csv; it is exported to the PostgreSQL database further below
# Read the .csv into a pandas DataFrame
path_all = os.path.join(clean_data, "total.csv")

df_all = pd.read_csv(path_all)
# Preview the first rows
df_all.head()

In [None]:
# Parse the collection-date strings into pandas datetimes before the frame is
# written to the database. The infer_datetime_format flag is deprecated as of
# pandas 2.0 (format inference is now the default), so it is no longer passed.
df_all['date_collect'] = pd.to_datetime(df_all['date_collect'])
df_all.dtypes

In [None]:
# get table names
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is its replacement.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Query the whole 'master_data' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from master_data', con=engine)

In [None]:
# Write the total.csv DataFrame to the PostgreSQL 'master_data' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
df_all.to_sql(name='master_data', con=engine, if_exists='append', index=False)

In [None]:
# Load clean_data/total_metadata.csv; it is exported to the PostgreSQL database further below
# Read the .csv into a pandas DataFrame
path_meta = os.path.join(clean_data, "total_metadata.csv")

df_meta = pd.read_csv(path_meta)
# Preview the first rows
df_meta.head()

In [None]:
# Parse the collection-date strings into pandas datetimes before the frame is
# written to the database. The infer_datetime_format flag is deprecated as of
# pandas 2.0 (format inference is now the default), so it is no longer passed.
df_meta['date_collect'] = pd.to_datetime(df_meta['date_collect'])
df_meta.dtypes

In [None]:
# get table names
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is its replacement.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Query the whole 'metadata' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from metadata', con=engine)

In [None]:
# Write the metadata DataFrame to the PostgreSQL 'metadata' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
df_meta.to_sql(name='metadata', con=engine, if_exists='append', index=False)

In [None]:
# Load clean_data/total_ML_data.csv; it is exported to the PostgreSQL database further below
# Read the .csv into a pandas DataFrame
path_data = os.path.join(clean_data, "total_ML_data.csv")

df_data = pd.read_csv(path_data)
# Preview the first rows
df_data.head()

In [None]:
# Data type pandas inferred for each column of the ML dataset
df_data.dtypes

In [None]:
# get table names
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is its replacement.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Query the whole 'data' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from data', con=engine)

In [None]:
# Write the ML dataset DataFrame to the PostgreSQL 'data' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
df_data.to_sql(name='data', con=engine, if_exists='append', index=False)

# Put the training and testing data into the AWS PostgreSQL database

In [None]:
# Dependencies
import os

import numpy as np
import pandas as pd

# Display settings: never truncate columns or rows in notebook output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Working directory and the clean_data folder inside it
home = os.path.abspath("")
clean_data = os.path.join(home, "clean_data")
for location in (home, clean_data):
    print(location)

In [None]:
# Read clean_data/total_ML_data.csv into a pandas DataFrame
path = os.path.join(clean_data, "total_ML_data.csv")

df = pd.read_csv(path)
# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# List the column names available for building features and labels
df.columns

In [None]:
# create an X (data) and y (labels)
from sklearn.preprocessing import LabelEncoder

# Labels: the lake name of every row, as a plain NumPy array
y = df["lake"].copy()
y_array = y.values

# Replace the lake-name strings with integer codes
label_encoder = LabelEncoder()
encode_y = label_encoder.fit_transform(y_array)

# Features: every column except the label
X = df.drop(columns=["lake"])
X_array = X.values

encode_y

In [None]:
# Inspect the raw feature matrix (all columns except "lake")
X_array

In [None]:
# random_state=42 for all models
from sklearn.model_selection import train_test_split

# 75% / 25% split, stratified on the encoded lake label
split_parts = train_test_split(
    X_array,
    encode_y,
    train_size=0.75,
    test_size=0.25,
    stratify=encode_y,
    random_state=42,
)
X_train_temp, X_test_temp, y_train, y_test = split_parts

In [None]:
from sklearn.impute import SimpleImputer

# Fit the imputer (default settings) on the training split only, then apply
# the same fitted imputer to the test split.
imp = SimpleImputer()
X_train = imp.fit_transform(X_train_temp)
X_test = imp.transform(X_test_temp)
X_train

In [None]:
# Name assigned to each column position (0..24) of the imputed matrices
feature_names = [
    'conductivity', 'hardness', 'turbidity', 'chlorophyll',
    'ammonia', 'nitrate_ite', 'aluminum', 'barium',
    'calcium', 'carbon', 'chloride', 'chromium',
    'copper', 'magnesium', 'manganese', 'mercury',
    'molybdenum', 'phosphorus', 'potassium', 'silicon',
    'sodium', 'strontium', 'sulphate', 'vanadium',
    'zinc',
]

# Build each frame column by column: column i of the matrix is labelled with
# the i-th feature name.
train_df = pd.DataFrame(
    {name: X_train[:, i] for i, name in enumerate(feature_names)}
)

test_df = pd.DataFrame(
    {name: X_test[:, i] for i, name in enumerate(feature_names)}
)

In [None]:
# Preview the training features before the label column is added
train_df.head()

In [None]:
# Put the integer-encoded lake label in the first column of the training frame
train_df.insert(loc=0, column="lake", value=y_train.tolist())
train_df.head()

In [None]:
# Put the integer-encoded lake label in the first column of the testing frame
test_df.insert(loc=0, column="lake", value=y_test.tolist())
test_df.head()

In [None]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine
from config.postgres import username, password

database_name = "Great_Lake_ML"

# Percent-encode the credentials: characters such as "@", ":" or "/" in a raw
# password would otherwise be parsed as URL delimiters and corrupt the URL.
# NOTE(review): the host is localhost although this section refers to AWS —
# presumably reached through a tunnel or a local copy; confirm.
rds_connection_string = (
    f"postgresql://{quote_plus(username)}:{quote_plus(password)}"
    f"@localhost:5432/{database_name}"
)

engine = create_engine(rds_connection_string)

In [None]:
# get table names
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is its replacement.
from sqlalchemy import inspect

inspect(engine).get_table_names()

In [None]:
# Query the whole 'test_lakes' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from test_lakes', con=engine)

In [None]:
# Write the testing DataFrame to the PostgreSQL 'test_lakes' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
test_df.to_sql(name='test_lakes', con=engine, if_exists='append', index=False)

In [None]:
# Query the whole 'train_lakes' table to check its columns before loading data
# (presumably the table was created beforehand — TODO confirm)
pd.read_sql_query('select * from train_lakes', con=engine)

In [None]:
# Write the training DataFrame to the PostgreSQL 'train_lakes' table.
# Rows are appended (the DataFrame index is not written), so re-running this
# cell inserts the same rows again.
train_df.to_sql(name='train_lakes', con=engine, if_exists='append', index=False)