## Import Statements

In [4]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

## Read in Data

In [39]:
# Directory holding the CSV exports, relative to this notebook.
data_path = '../Data/'

# Files in load order: four yearly CLA exports followed by the Mendota buoy file.
# The 2014 export (CM2014_edit.csv) is currently not loaded.
filenames = [
    'CM2015_edit.csv',
    'CM2016_edit.csv',
    'CM2017_edit.csv',
    'CM2018_edit.csv',
    'mdcp.csv'    # Mendota buoy
]

# Read every file once, in list order, and bind each table to its own name.
cla_2015_raw, cla_2016_raw, cla_2017_raw, cla_2018_raw, mdcp_raw = (
    pd.read_csv(data_path + name, low_memory=False) for name in filenames
)

## Clean Data

### CLA Data

In [73]:
keep15 = [     # raw columns to keep for years 2015-2017
    'correct_timestamp',
    'collectionSiteId',
    'lake',
    'algalBloom',
    'algalBloomSheen',
    'turbidity',
    'lat',
    'long'
]

keep18 = [    # raw columns to keep for 2018
    'sample_collection_time',
    'collectionSiteId',
    'lake',
    'algalBloom',
    'algalBloomSheen',
    'turbidity',
    'latitiude',    # spelled this way in the 2018 file; renamed below
    'longitude'
]

rename15 = {   # rename features for 2015-2017
    'collectionSiteId': 'site',
    'lat': 'latitude',
    'long': 'longitude',
    'correct_timestamp': 'date'
}

rename18 = {   # rename features for 2018
    'collectionSiteId': 'site',
    'sample_collection_time': 'date',
    'latitiude': 'latitude'
}

numeric = [    # features converted to numbers
    'algalBloom',
    'algalBloomSheen',
    'turbidity',
    'latitude',
    'longitude'
]

text_cols = ['site', 'lake']    # features stored as strings

# Column names after renaming (before 'date' becomes the index).
# Not needed by the cleaning itself; kept so the name stays defined for other cells.
features = np.array([rename15.get(col, col) for col in keep15], dtype=object)


def clean_cla(raw, keep, rename, lake='Mendota'):
    """Return a cleaned, date-indexed copy of one year of raw CLA data.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table as read from one of the yearly CSV files. It is not modified.
    keep : list of str
        Raw column names to retain.
    rename : dict
        Mapping from raw column names to the common names
        ('date', 'site', 'latitude', 'longitude').
    lake : str, default 'Mendota'
        Only rows whose 'lake' value contains this pattern are kept.

    Returns
    -------
    pandas.DataFrame
        Rows with no missing or unparseable value, restricted to `lake`,
        indexed by 'date' and sorted chronologically.
    """
    # rename() returns a new frame, so the assignments below never write
    # into a slice of `raw`.
    cleaned = raw[keep].rename(columns=rename)

    # Values that cannot be parsed become NaN / NaT and are dropped below.
    for col in numeric:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    cleaned['date'] = pd.to_datetime(cleaned['date'], errors='coerce')

    # Drop incomplete rows BEFORE casting to str: astype(str) turns a missing
    # value into the literal string 'nan', which dropna can no longer detect.
    cleaned = cleaned.dropna(axis='rows', how='any')
    cleaned = cleaned.astype({col: str for col in text_cols})

    # remove any data point not on the requested lake
    cleaned = cleaned[cleaned['lake'].str.contains(lake)]

    # set date as index and sort chronologically
    return cleaned.set_index('date').sort_index()


cla_2015 = clean_cla(cla_2015_raw, keep15, rename15)
cla_2016 = clean_cla(cla_2016_raw, keep15, rename15)
cla_2017 = clean_cla(cla_2017_raw, keep15, rename15)
cla_2018 = clean_cla(cla_2018_raw, keep18, rename18)

### Mendota Buoy Data

In [59]:

# Show the column names of the raw buoy table loaded from mdcp.csv.
mdcp_raw.columns.values

array(['year4', 'station_id', 'sampledate', 'month', 'daynum',
       'sampletime', 'air_temp', 'flag_air_temp', 'rel_hum',
       'flag_rel_hum', 'wind_speed', 'flag_wind_speed', 'wind_dir',
       'flag_wind_dir', 'chlor', 'flag_chlor', 'phycocyanin',
       'flag_phycocyanin', 'do_raw', 'do_sat', 'do_wtemp', 'flag_do_raw',
       'flag_do_sat', 'flag_do_wtemp', 'pco2_ppm', 'flag_pco2_ppm', 'par',
       'flag_par', 'par_below', 'flag_par_below'], dtype=object)

#### Idea: resample the data so that each day gets a yes/no answer to "Was there an algal bloom today?" Maybe do this for Lake Mendota only.

In [75]:
# Daily mean of the numeric 2018 measurements; days without any report appear as NaN rows.
# The string columns ('site', 'lake') cannot be averaged, so they are excluded explicitly
# instead of relying on pandas to drop them (pandas >= 2.0 raises a TypeError instead).
# TODO: after resampling, set algalBloomSheen to 1 wherever it is > 0 (for all years),
# append the daily averages from mdcp, and simply predict whether or not an algal bloom
# will be happening.
cla_2018.select_dtypes(include='number').resample('D').mean()

Unnamed: 0_level_0,algalBloom,algalBloomSheen,turbidity,latitude,longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-23,1.000000,0.000000,50.000000,43.131667,-89.433139
2018-04-24,,,,,
2018-04-25,3.000000,2.000000,65.000000,43.087544,-89.419706
2018-04-26,1.000000,0.000000,70.000000,43.087305,-89.472000
2018-04-27,,,,,
2018-04-28,1.000000,0.000000,120.000000,43.107139,-89.475167
2018-04-29,1.000000,0.000000,115.000000,43.080972,-89.462111
2018-04-30,,,,,
2018-05-01,1.000000,0.000000,83.500000,43.102828,-89.454146
2018-05-02,1.000000,0.000000,71.500000,43.101886,-89.450473
