In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import ShuffleSplit
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib import pylab
import random
from sklearn.model_selection import KFold
import plotly.express as px
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestRegressor
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)
from  sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.svm import SVR
import dateutil
from datetime import datetime
from dateutil.relativedelta import relativedelta
from mlxtend.evaluate import PredefinedHoldoutSplit
import kneed
import kaleido

In [None]:
# Execute the companion notebook so the helpers it defines become available here.
# NOTE(review): presumably this is where count() and Data_count() (called in
# later cells) come from -- verify against functions.ipynb.
%run functions.ipynb

In [None]:
# Station whose files are loaded, the station whose flow file is merged in,
# and the date window used to slice the flow series.
station, flow_station = "erlabrunn", "erlabrunn"
start, end = "2016-11-15", "2020-12-30"

In [None]:
# Regressor to evaluate. Options: "lr" (linear regression), "rf" (random forest)
test_model = "rf"
# Target column to predict. Options: "OPO4P", "NO3N", "NH4N", "TRP"
predict = "OPO4P"

# One-cell frame holding the value 1.
results = pd.DataFrame(data=[1])

In [None]:
# Instantiate the estimator selected by `test_model`.
if test_model == "rf":
    algo = RandomForestRegressor(n_jobs=-1)
elif test_model == "lr":
    algo = linear_model.LinearRegression()
else:
    # Fail loudly instead of leaving `algo` undefined (or stale from an
    # earlier run of this cell with a different setting).
    raise ValueError(
        "Unknown test_model %r; expected 'rf' or 'lr'" % (test_model,)
    )

In [None]:
# Load the station file, parse `Datum` as datetimes and use it as the index,
# then round every numeric column to two decimals.
df = (
    pd.read_csv('all_data_%s.csv' % station)
    .assign(Datum=lambda frame: pd.to_datetime(frame['Datum']))
    .set_index("Datum")
    .round(2)
)

In [None]:
# Load the flow file, index it by `Datum` and keep only the start..end window.
flow = (
    pd.read_csv("flow_%s.csv" % flow_station)
    .assign(Datum=lambda frame: pd.to_datetime(frame['Datum']))
    .set_index("Datum")
    .loc[start:end]
)
# Join flow column-wise onto the station data, then keep only the rows that
# have at least three non-NaN values.
df = pd.concat([df, flow], axis=1).dropna(thresh=3)

In [None]:
# Persist the merged frame before any target-specific trimming.
df.to_csv('%s_total.csv' % station)

# For kahl, the OPO4P and NH4N targets only use data from 2019-09-02 onwards.
if station == "kahl" and predict in ("OPO4P", "NH4N"):
    df = df.loc["2019-09-02":]

In [None]:
# Show summary statistics of the merged frame as the cell output.
df.describe()

In [None]:
# Snapshot taken with the `count` helper before the frame is reduced.
Pre_count = count(df, "Pre Count")

# Keep the five predictor columns plus the target, and drop every row in
# which the target itself is missing.
df = df[["O2", "Temp", "Conduct", "pH", "flow", predict]].dropna(subset=[predict])

# Same snapshot after the reduction.
Post_count = count(df, "Post Count")

columns = df.columns

In [None]:
# Combine the pre- and post-selection snapshots via the Data_count helper.
# NOTE(review): Data_count is not defined in the visible cells; presumably it
# returns a pandas object comparing the two counts -- verify.
Final_data = Data_count(Pre_count, Post_count)
print(station)
# Displayed as the cell output only; the dropna() result is not stored.
Final_data.dropna()

### Cleaning

In [None]:
# Erlabrunn-only cleaning: blank out hand-picked readings and filter the
# selected target column.
if station == "erlabrunn":
    # Individual `Conduct` readings set to NaN.
    df.loc["2018-07-02 09:45:00", "Conduct"] = np.nan
    df.loc["2018-08-20 09:30:00", "Conduct"] = np.nan
    df.loc["2018-08-20 09:45:00", "Conduct"] = np.nan
    df.loc["2018-12-17 11:30:00", "Conduct"] = np.nan

    if predict == "OPO4P":
        # Keep only OPO4P values below 0.3: the filtered column is joined
        # back on the index, so rejected rows get NaN in OPO4P rather than
        # being dropped. Data before 2019 is then discarded.
        para = "OPO4P"
        temp = df[[para]]
        df = df.drop([para], axis=1)
        temp = temp[(temp[para] < 0.3)]
        df = pd.concat([df, temp], axis=1)
        df = df.loc["2019-01-01 00:00:00":]

    if predict == "NO3N":
        df.loc["2020-11-24 08:30:00", "NO3N"] = np.nan

    if predict == "NH4N":
        # Blank out NH4N values in [-0.8, 0.001]. Assign the result back
        # explicitly: `df[col].mask(..., inplace=True)` acts on a temporary
        # Series and is a no-op under pandas Copy-on-Write.
        df['NH4N'] = df['NH4N'].mask(df['NH4N'].between(-0.8, 0.001))

In [None]:
# Kahl-only cleaning: blank out hand-picked readings/periods and value ranges.
# Range masks are assigned back explicitly because
# `df[col].mask(..., inplace=True)` acts on a temporary Series and is a no-op
# under pandas Copy-on-Write.
if station == "kahl":
    # Conduct cleaning
    df.loc["2019-09-06 07:30:00", "Conduct"] = np.nan
    df.loc["2019-09-10 13:30:00", "Conduct"] = np.nan
    df.loc["2019-10-25 07:45:00", "Conduct"] = np.nan
    df.loc["2021-07-23 03:15:00", "Conduct"] = np.nan
    df.loc["2021-08-27 08:00:00", "Conduct"] = np.nan
    df.loc["2017-11-16": "2017-11-22", "Conduct"] = np.nan
    df.loc["2019-08-19 10:15:00", "Conduct"] = np.nan
    df.loc["2019-08-29 10:00:00", "Conduct"] = np.nan
    df.loc["2019-08-30 08:00:00", "Conduct"] = np.nan
    df.loc["2019-06-04 12:30:00", "Conduct"] = np.nan

    # Conduct values in [0, 350] are blanked out.
    df['Conduct'] = df['Conduct'].mask(df['Conduct'].between(0, 350))

    # Temp cleaning
    df.loc["2017-11-17": "2017-11-21", "Temp"] = np.nan
    df.loc["2018-04-02", "Temp"] = np.nan
    df.loc["2019-05-21 09:30:00": "2019-05-23 23:45:00", "Temp"] = np.nan

    # O2 cleaning
    df.loc["2017-11-16 09:15:00": "2017-11-16 23:45:00", "O2"] = np.nan
    df.loc["2019-05-21 09:30:00": "2019-05-23 23:45:00", "O2"] = np.nan

    # NO3N cleaning: values in [-1, 2] plus one hand-picked reading.
    if predict == "NO3N":
        df['NO3N'] = df['NO3N'].mask(df['NO3N'].between(-1, 2))
        df.loc["2021-08-27 08:30:00", "NO3N"] = np.nan

    # NH4N cleaning: one hand-picked reading, then values in [0.5, 40] and
    # in [-0.8, 0.001].
    if predict == "NH4N":
        df.loc["2019-05-27 10:15:00", "NH4N"] = np.nan
        df['NH4N'] = df['NH4N'].mask(df['NH4N'].between(0.5, 40))
        df['NH4N'] = df['NH4N'].mask(df['NH4N'].between(-0.8, 0.001))

    # OPO4P cleaning: values in [0.25, 5] and in [-0.8, 0.001], plus one
    # hand-picked reading.
    if predict == "OPO4P":
        df['OPO4P'] = df['OPO4P'].mask(df['OPO4P'].between(0.25, 5))
        df['OPO4P'] = df['OPO4P'].mask(df['OPO4P'].between(-0.8, 0.001))
        df.loc["2019-08-07 16:15:00", "OPO4P"] = np.nan
    

##### Set the removal fraction, flag anomalous rows, and add the flag to the dataframe

In [None]:
# Fraction of rows the Isolation Forest is told to expect as anomalies.
removal = 0.1
from sklearn.ensemble import IsolationForest

clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=float(removal),
    max_features=1.0,
    bootstrap=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)
# NOTE(review): the cleaning cells set values to NaN, so df may still contain
# NaN here -- confirm the installed IsolationForest accepts that.
# Fit on all current columns and label every row (-1 marks an anomaly).
pred = clf.fit(df).predict(df)
df['anomaly'] = pred

# Rows flagged as anomalous and their index labels. The rows are only flagged
# here; nothing is removed from df in this cell.
outliers = df[df['anomaly'].eq(-1)]
outlier_index = outliers.index.tolist()

# Number of rows per label; points classified -1 are anomalous.
print(df['anomaly'].value_counts())

### Interpolation

In [None]:
# Keep a handle on the frame as it was before gaps are filled.
bef_interpol = df

# Fill gaps of at most 30 consecutive missing values, drop any row that still
# has a NaN, and round to two decimals.
df = df.interpolate(limit=30).dropna().round(2)

df.to_csv(r'%s_cleaned_interpolated.csv' % station, index=True, header=True)

### Time as a feature

In [None]:
# Encode month and ISO week of `Datum` as sine/cosine pairs so the features
# wrap around at the end of the cycle.
# NOTE(review): the week period is the largest week number present in the
# data, not a fixed constant -- confirm this is intended.
df = (
    df.reset_index()
    .assign(
        Month=lambda d: d['Datum'].dt.month,
        week_number=lambda d: d["Datum"].dt.isocalendar().week,
    )
    .assign(
        month_sin=lambda d: np.sin(2 * np.pi * d['Month'] / 12),
        month_cos=lambda d: np.cos(2 * np.pi * d['Month'] / 12),
        week_sin=lambda d: np.sin(2 * np.pi * d["week_number"] / d["week_number"].max()),
        week_cos=lambda d: np.cos(2 * np.pi * d["week_number"] / d["week_number"].max()),
    )
    # The raw month / week helper columns are not kept as features.
    .drop(columns=["Month", "week_number"])
    .set_index("Datum")
)

### Transformation

In [None]:
# Feature transformation for the enborne station with the linear model.
# NOTE(review): `cleaning_model` is not defined in the visible cells --
# confirm it comes from functions.ipynb (or should be `test_model`).
# The new column names do not all match the transform applied: log_Turb is
# exp(turb), log_Chlorophyll is a square root, log_O2 is a cube.
if station == "enborne" and cleaning_model == 'lr':
    df = df.assign(
        log_Turb=np.exp(df['turb']),
        log_Chlorophyll=np.power(df['Chlorophyll'], 0.5),
        log_O2=np.power(df['O2'], 3),
        log_flow=np.log(df['flow']),
        Cube_Conduct=np.power((df['Conduct']), 3),
    ).drop(columns=["turb", "Chlorophyll", "O2", "flow", "Conduct"])
    df.to_csv(r'%s_transformed.csv' % station, index=True, header=True)

In [None]:
# Log-transform flow and Conduct for kahl / erlabrunn with the linear model.
# `before_transform` is an independent copy, so it really holds the
# pre-transform columns; a plain alias would pick up the columns added below.
# NOTE(review): `cleaning_model` is not defined in the visible cells --
# confirm it comes from functions.ipynb (or should be `test_model`).
before_transform = df.copy()
if station in ("kahl", "erlabrunn") and cleaning_model == "lr":
    # Build the transformed frame without mutating the current one:
    # add the log columns, then drop the raw ones they replace.
    df = df.assign(
        log_flow=np.log(df['flow']),
        log_Conduct=np.log(df['Conduct']),
    ).drop(columns=["flow", "Conduct"])
    df.to_csv(r'%s_transformed.csv' % station, index=True, header=True)
after_transform = df

In [None]:
# Display the current frame as the cell output.
df

### Correlation Analysis

In [None]:
columns = df.columns

# Correlation of every column with the target, kept as a one-column frame.
corr = df.corr().loc[:, [predict]]

# Rank by absolute correlation, strongest first. sort_values places NaN
# correlations last, whereas positional indexing with Series.argsort can
# pick the wrong row because argsort may return -1 for NaN entries.
corr = corr.sort_values(by=predict, key=lambda s: -s.abs(), na_position="last")

# Remove the target's own row from the ranking.
if predict in ("NO3N", "OPO4P", "NH4N"):
    corr = corr.drop([predict], axis=0)
elif predict == "TRP":
    # NO3N is also excluded for TRP; errors="ignore" because the frame may
    # not contain an NO3N column at this point.
    corr = corr.drop(["TRP", "NO3N"], axis=0, errors="ignore")

print(corr)
# Column names ordered from most to least correlated with the target.
corr_order = list(corr.index)