# Create several tables in a PostgreSQL database that back the Flask API endpoints used for visualization and model predictions

In [None]:
# Dependencies
import pandas as pd
import numpy as np
import os

# Lift pandas' display truncation on both axes so frames render in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Resolve the current working directory and the "clean_data" folder beneath it,
# then echo both so the reader can confirm where files are read from.
home = os.path.abspath("")
clean_data = os.path.join(home, "clean_data")
print(home, clean_data, sep="\n")

In [None]:
# Load total_ML_data.csv from the clean_data folder and preview its first rows.
path = os.path.join(clean_data, "total_ML_data.csv")
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
# Dimensions of the loaded dataset as a (rows, columns) tuple.
df.shape

In [None]:
# List every column name read from total_ML_data.csv.
# NOTE(review): the previous comment said there are 43 columns, but the executed
# df.columns output later in this notebook lists 26 for the same file — confirm.
df.columns

In [None]:
# Data type pandas assigned to each column of df.
df.dtypes

In [None]:
# Mean of every measurement column across all rows of df (all lakes together).
# DataFrame.mean() already returns a new Series, so no extra .copy() is needed.
df_mean = df[['conductivity', 'hardness', 'turbidity', 'chlorophyll',
       'ammonia', 'nitrate_ite', 'aluminum', 'barium', 'calcium', 'carbon',
       'chloride', 'chromium', 'copper', 'magnesium', 'manganese', 'mercury',
       'molybdenum', 'phosphorus', 'potassium', 'silicon', 'sodium',
       'strontium', 'sulphate', 'vanadium', 'zinc']].mean()

# Put a "lake" entry labelled "Total" in front of the means so this Series has
# the same keys as a row of the per-lake means table.
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
pd_series = pd.Series(["Total"], index=["lake"])
total = pd.concat([pd_series, df_mean])
total

In [None]:
# Per-lake mean of every other column; the result is indexed by lake.
df_groupby = df.groupby(by="lake").mean()

In [None]:
# Turn the "lake" group index back into an ordinary column.
# Guarded so that re-running the cell is a no-op: an unconditional in-place
# reset_index() on an already-reset frame would add a spurious "index" column,
# which would then flow into df_total and the lake_means insert.
if df_groupby.index.name == "lake":
    df_groupby = df_groupby.reset_index()
df_groupby

In [None]:
# Add the overall "Total" row beneath the per-lake means.
# DataFrame.append was removed in pandas 2.0, so build a one-row frame from the
# Series and concatenate instead. The Series is object-typed (it mixes the
# string "Total" with floats); infer_objects() restores numeric dtypes on the
# one-row frame so the measurement columns of df_total stay numeric.
total_row = total.to_frame().T.infer_objects()
df_total = pd.concat([df_groupby, total_row], ignore_index=True)
df_total

In [None]:
# Column dtypes of the combined per-lake + "Total" table.
df_total.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Distinct lake names found in df.
y = df["lake"].unique()

# Learn the name -> integer mapping and encode the names in a single step.
label_encoder = LabelEncoder()
encode_y = label_encoder.fit_transform(y)

encode_y

In [None]:
# Two-column lookup table pairing each lake name with its integer code.
dic_df = dict(lake=y, lake_encode=encode_y)
df2 = pd.DataFrame(data=dic_df)
df2

In [None]:
# Column dtypes of the lake-name / lake-code lookup table.
df2.dtypes

# Connect to AWS Database

In [None]:
from urllib.parse import quote

from sqlalchemy import create_engine
from config.postgres import username, password

# Percent-encode the credentials before embedding them in the URL. A raw
# password containing reserved characters such as "@", ":" or "/" would
# otherwise be parsed as URL structure and the connection string would be wrong.
# safe="" makes quote() escape "/" as well; plain alphanumeric credentials are
# left unchanged.
rds_connection_string = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@awsgreatlakes.cdb9inonioij.us-east-2.rds.amazonaws.com:5432/awsgreatlakes"
)

engine = create_engine(rds_connection_string)

In [None]:
# List the tables currently present in the connected database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported replacement and returns the same list.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

In [None]:
# Sanity check before loading: select everything currently in `encoded_lakes`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from encoded_lakes', con=engine)

In [None]:
# Write df2 to the `encoded_lakes` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
df2.to_sql(name='encoded_lakes', con=engine, if_exists='append', index=False)

In [None]:
# Sanity check before loading: select everything currently in `lake_means`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from lake_means', con=engine)

In [None]:
# Write df_total to the `lake_means` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
df_total.to_sql(name='lake_means', con=engine, if_exists='append', index=False)

In [None]:
# Load total.csv from the clean_data folder (it is written to the database
# further down) and preview its first rows.
path_all = os.path.join(clean_data, "total.csv")
df_all = pd.read_csv(filepath_or_buffer=path_all)
df_all.head(n=5)

In [None]:
# Convert the date_collect column to pandas datetimes, then show the dtypes to
# confirm the conversion.
# infer_datetime_format is deprecated since pandas 2.0 (format inference is the
# default there) and only produces a warning, so the argument is dropped.
df_all['date_collect'] = pd.to_datetime(df_all['date_collect'])
df_all.dtypes

In [None]:
# List the tables currently present in the connected database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported replacement and returns the same list.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

In [None]:
# Sanity check before loading: select everything currently in `master_data`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from master_data', con=engine)

In [None]:
# Write df_all to the `master_data` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
df_all.to_sql(name='master_data', con=engine, if_exists='append', index=False)

In [None]:
# Load total_metadata.csv from the clean_data folder (it is written to the
# database further down) and preview its first rows.
path_meta = os.path.join(clean_data, "total_metadata.csv")
df_meta = pd.read_csv(filepath_or_buffer=path_meta)
df_meta.head(n=5)

In [None]:
# Convert the date_collect column to pandas datetimes, then show the dtypes to
# confirm the conversion.
# infer_datetime_format is deprecated since pandas 2.0 (format inference is the
# default there) and only produces a warning, so the argument is dropped.
df_meta['date_collect'] = pd.to_datetime(df_meta['date_collect'])
df_meta.dtypes

In [None]:
# List the tables currently present in the connected database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported replacement and returns the same list.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

In [None]:
# Sanity check before loading: select everything currently in `metadata`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from metadata', con=engine)

In [None]:
# Write df_meta to the `metadata` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
df_meta.to_sql(name='metadata', con=engine, if_exists='append', index=False)

In [None]:
# Load total_ML_data.csv from the clean_data folder (it is written to the
# database further down) and preview its first rows.
path_data = os.path.join(clean_data, "total_ML_data.csv")
df_data = pd.read_csv(filepath_or_buffer=path_data)
df_data.head(n=5)

In [None]:
# Data type pandas assigned to each column of df_data.
df_data.dtypes

In [None]:
# List the tables currently present in the connected database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported replacement and returns the same list.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

In [None]:
# Sanity check before loading: select everything currently in `data`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from data', con=engine)

In [None]:
# Write df_data to the `data` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
df_data.to_sql(name='data', con=engine, if_exists='append', index=False)

# Put training and testing data into the AWS PostgreSQL database

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import os

# Lift pandas' display truncation on both axes so frames render in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Resolve the current working directory and the "clean_data" folder beneath it,
# then echo both so the reader can confirm where files are read from.
home = os.path.abspath("")
clean_data = os.path.join(home, "clean_data")
print(home, clean_data, sep="\n")

C:\Users\danie\OneDrive\Documents\Great_Lakes_ML
C:\Users\danie\OneDrive\Documents\Great_Lakes_ML\clean_data


In [2]:
# Load total_ML_data.csv from the clean_data folder and preview its first rows.
path = os.path.join(clean_data, "total_ML_data.csv")
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,lake,conductivity,hardness,turbidity,chlorophyll,ammonia,nitrate_ite,aluminum,barium,calcium,carbon,chloride,chromium,copper,magnesium,manganese,mercury,molybdenum,phosphorus,potassium,silicon,sodium,strontium,sulphate,vanadium,zinc
0,erie,249.0,,,9200.0,54000.0,1270000.0,,,,,12500000.0,,,,,,,117000.0,,900000.0,,,,,
1,erie,337.0,,,14400.0,14000.0,3440000.0,,,,,24000000.0,,,,,,,342000.0,,1570000.0,,,,,
2,erie,325.0,,,3300.0,20000.0,3580000.0,,,,,16300000.0,,,,,,,90000.0,,1390000.0,,,,,
3,erie,238.0,,,2600.0,16000.0,920000.0,,,,,11200000.0,,,,,,,15000.0,,730000.0,,,,,
4,erie,234.0,,,2500.0,12000.0,610000.0,,,,,9900000.0,,,,,,,22000.0,,380000.0,,,,,


In [3]:
# Column names of df: the "lake" label followed by the measurement columns.
df.columns

Index(['lake', 'conductivity', 'hardness', 'turbidity', 'chlorophyll',
       'ammonia', 'nitrate_ite', 'aluminum', 'barium', 'calcium', 'carbon',
       'chloride', 'chromium', 'copper', 'magnesium', 'manganese', 'mercury',
       'molybdenum', 'phosphorus', 'potassium', 'silicon', 'sodium',
       'strontium', 'sulphate', 'vanadium', 'zinc'],
      dtype='object')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Labels: the lake name of every row, as an array.
y = df["lake"].copy()
y_array = y.values

# Swap the string labels for integer codes (fit and transform in one call).
label_encoder = LabelEncoder()
encode_y = label_encoder.fit_transform(y_array)

# Features: every column of df except the label, as a 2-D array.
X = df.drop(columns=["lake"])
X_array = X.values

encode_y

array([0, 0, 0, ..., 3, 3, 3])

In [5]:
# Inspect the raw feature matrix; missing values are still NaN at this point
# and are only filled in by the imputer after the train/test split.
X_array

array([[2.49e+02,      nan,      nan, ...,      nan,      nan,      nan],
       [3.37e+02,      nan,      nan, ...,      nan,      nan,      nan],
       [3.25e+02,      nan,      nan, ...,      nan,      nan,      nan],
       ...,
       [1.04e+02, 4.80e+07, 5.50e-01, ..., 3.60e+06, 3.00e+02, 4.00e+02],
       [1.05e+02, 4.80e+07, 5.30e-01, ..., 3.70e+06, 3.00e+02, 8.00e+02],
       [1.03e+02, 4.80e+07, 1.06e+00, ..., 3.40e+06, 3.00e+02, 8.00e+02]])

In [6]:
from sklearn.model_selection import train_test_split

# 75/25 split, stratified on the encoded lake label so each split keeps the
# label proportions; random_state=42 is the seed used for all models.
split_options = dict(
    train_size=0.75,
    test_size=0.25,
    stratify=encode_y,
    random_state=42,
)
X_train_temp, X_test_temp, y_train, y_test = train_test_split(
    X_array, encode_y, **split_options
)

In [7]:
from sklearn.impute import SimpleImputer

# SimpleImputer with default settings. Its fill values are learned from the
# training split only and then reused to fill the test split.
imp = SimpleImputer()
X_train = imp.fit_transform(X_train_temp)
X_test = imp.transform(X_test_temp)
X_train

array([[2.62969316e+02, 9.76609876e+07, 2.81968618e+00, ...,
        1.80612646e+07, 3.43256757e+02, 1.67957764e+03],
       [2.62969316e+02, 9.76609876e+07, 2.81968618e+00, ...,
        1.80612646e+07, 3.43256757e+02, 1.67957764e+03],
       [2.62969316e+02, 9.76609876e+07, 2.81968618e+00, ...,
        1.80612646e+07, 3.43256757e+02, 1.67957764e+03],
       ...,
       [6.97000000e+02, 1.75000000e+08, 2.35000000e+00, ...,
        5.31000000e+07, 1.03000000e+03, 5.80000000e+03],
       [2.33000000e+02, 9.76609876e+07, 2.81968618e+00, ...,
        1.80612646e+07, 3.43256757e+02, 1.67957764e+03],
       [2.62969316e+02, 9.76609876e+07, 2.81968618e+00, ...,
        1.80612646e+07, 3.43256757e+02, 1.67957764e+03]])

In [8]:
# Names for the columns of the imputed feature matrices, listed in the same
# order as the feature columns of df (every column except "lake").
FEATURE_COLUMNS = [
    'conductivity', 'hardness', 'turbidity', 'chlorophyll', 'ammonia',
    'nitrate_ite', 'aluminum', 'barium', 'calcium', 'carbon', 'chloride',
    'chromium', 'copper', 'magnesium', 'manganese', 'mercury', 'molybdenum',
    'phosphorus', 'potassium', 'silicon', 'sodium', 'strontium', 'sulphate',
    'vanadium', 'zinc',
]

# Build both frames straight from the 2-D arrays instead of spelling out 25
# column slices by hand for each one. pandas raises a ValueError when an
# array's width does not match FEATURE_COLUMNS, so a changed feature set fails
# loudly instead of silently mislabelling or dropping columns.
train_df = pd.DataFrame(X_train, columns=FEATURE_COLUMNS)
test_df = pd.DataFrame(X_test, columns=FEATURE_COLUMNS)

In [9]:
# Preview the imputed training features (no label column yet).
train_df.head()

Unnamed: 0,conductivity,hardness,turbidity,chlorophyll,ammonia,nitrate_ite,aluminum,barium,calcium,carbon,chloride,chromium,copper,magnesium,manganese,mercury,molybdenum,phosphorus,potassium,silicon,sodium,strontium,sulphate,vanadium,zinc
0,262.969316,97660990.0,2.819686,2759.964436,24063.427843,509731.239396,24015.412371,16681.494801,27404280.0,22418180.0,15702100.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,75200.0,1223461.0,562468.7,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
1,262.969316,97660990.0,2.819686,2700.0,28000.0,93000.0,24015.412371,16681.494801,27404280.0,31800000.0,13300000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,16500.0,1223461.0,1100000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
2,262.969316,97660990.0,2.819686,400.0,40000.0,340000.0,24015.412371,16681.494801,27404280.0,25100000.0,23400000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,7600.0,1223461.0,160000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
3,262.969316,97660990.0,2.819686,300.0,22000.0,400000.0,24015.412371,16681.494801,27404280.0,20700000.0,8200000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,5000.0,1223461.0,620000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
4,292.0,97660990.0,2.819686,300.0,2000.0,360000.0,24015.412371,16681.494801,27404280.0,22418180.0,16400000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,9500.0,1223461.0,380000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764


In [10]:
# Put the encoded lake label in front of the training feature columns.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "lake" not in train_df.columns:
    train_df.insert(0, "lake", y_train.tolist())
train_df.head()

Unnamed: 0,lake,conductivity,hardness,turbidity,chlorophyll,ammonia,nitrate_ite,aluminum,barium,calcium,carbon,chloride,chromium,copper,magnesium,manganese,mercury,molybdenum,phosphorus,potassium,silicon,sodium,strontium,sulphate,vanadium,zinc
0,0,262.969316,97660990.0,2.819686,2759.964436,24063.427843,509731.239396,24015.412371,16681.494801,27404280.0,22418180.0,15702100.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,75200.0,1223461.0,562468.7,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
1,2,262.969316,97660990.0,2.819686,2700.0,28000.0,93000.0,24015.412371,16681.494801,27404280.0,31800000.0,13300000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,16500.0,1223461.0,1100000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
2,2,262.969316,97660990.0,2.819686,400.0,40000.0,340000.0,24015.412371,16681.494801,27404280.0,25100000.0,23400000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,7600.0,1223461.0,160000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
3,1,262.969316,97660990.0,2.819686,300.0,22000.0,400000.0,24015.412371,16681.494801,27404280.0,20700000.0,8200000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,5000.0,1223461.0,620000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
4,0,292.0,97660990.0,2.819686,300.0,2000.0,360000.0,24015.412371,16681.494801,27404280.0,22418180.0,16400000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,9500.0,1223461.0,380000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764


In [11]:
# Put the encoded lake label in front of the test feature columns.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "lake" not in test_df.columns:
    test_df.insert(0, "lake", y_test.tolist())
test_df.head()

Unnamed: 0,lake,conductivity,hardness,turbidity,chlorophyll,ammonia,nitrate_ite,aluminum,barium,calcium,carbon,chloride,chromium,copper,magnesium,manganese,mercury,molybdenum,phosphorus,potassium,silicon,sodium,strontium,sulphate,vanadium,zinc
0,1,248.0,97660990.0,2.819686,1600.0,18000.0,1313000.0,24015.412371,16681.494801,27404280.0,23500000.0,8000000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,21500.0,1223461.0,140000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
1,0,262.969316,97660990.0,2.819686,4500.0,26000.0,912000.0,24015.412371,16681.494801,27404280.0,23200000.0,11700000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,66900.0,1223461.0,860000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
2,3,262.969316,97660990.0,2.819686,2759.964436,24063.427843,509731.2,100.0,16681.494801,27404280.0,22418180.0,15702100.0,490.602837,100.0,7176951.0,5781.001341,19.957377,754.934823,24102.070796,1223461.0,562468.666828,8604703.0,118046.752622,18061260.0,343.256757,1600.0
3,0,297.0,97660990.0,2.819686,6065.0,2000.0,520000.0,24015.412371,16681.494801,27404280.0,22418180.0,16900000.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,31000.0,1223461.0,240000.0,8604703.0,118046.752622,18061260.0,343.256757,1679.57764
4,0,262.969316,97660990.0,2.819686,2759.964436,16000.0,245000.0,24015.412371,16681.494801,27404280.0,22418180.0,15702100.0,490.602837,1582.730721,7176951.0,5781.001341,19.957377,754.934823,14000.0,1223461.0,562468.666828,8604703.0,118046.752622,18061260.0,343.256757,1679.57764


In [12]:
from urllib.parse import quote

from sqlalchemy import create_engine
from config.postgres import username, password

# Percent-encode the credentials before embedding them in the URL. A raw
# password containing reserved characters such as "@", ":" or "/" would
# otherwise be parsed as URL structure and the connection string would be wrong.
# safe="" makes quote() escape "/" as well; plain alphanumeric credentials are
# left unchanged.
rds_connection_string = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@awsgreatlakes.cdb9inonioij.us-east-2.rds.amazonaws.com:5432/awsgreatlakes"
)

engine = create_engine(rds_connection_string)

In [13]:
# List the tables currently present in the connected database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported replacement and returns the same list.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

['lake_means',
 'master_data',
 'metadata',
 'data',
 'encoded_lakes',
 'train_lakes',
 'test_lakes']

In [14]:
# Sanity check before loading: select everything currently in `test_lakes`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from test_lakes', con=engine)

Unnamed: 0,lake,conductivity,hardness,turbidity,chlorophyll,ammonia,nitrate_ite,aluminum,barium,calcium,carbon,chloride,chromium,copper,magnesium,manganese,mercury,molybdenum,phosphorus,potassium,silicon,sodium,strontium,sulphate,vanadium,zinc


In [15]:
# Write test_df to the `test_lakes` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
test_df.to_sql(name='test_lakes', con=engine, if_exists='append', index=False)

In [16]:
# Sanity check before loading: select everything currently in `train_lakes`
# and display it, so the table's existence and column headers can be confirmed.
# NOTE(review): this is an unbounded SELECT *; once the table is populated the
# entire table is fetched and rendered.
pd.read_sql_query('select * from train_lakes', con=engine)

Unnamed: 0,lake,conductivity,hardness,turbidity,chlorophyll,ammonia,nitrate_ite,aluminum,barium,calcium,carbon,chloride,chromium,copper,magnesium,manganese,mercury,molybdenum,phosphorus,potassium,silicon,sodium,strontium,sulphate,vanadium,zinc


In [17]:
# Write train_df to the `train_lakes` table, without the DataFrame index.
# if_exists='append' adds rows to the existing table instead of replacing it.
# NOTE(review): re-running this cell sends the same rows again, so it will
# duplicate data unless the table enforces a uniqueness constraint.
train_df.to_sql(name='train_lakes', con=engine, if_exists='append', index=False)