In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [59]:
pd.set_option("max_rows", 25)

In [60]:
# Apply the default theme
sns.set_theme()
sns.set_style("whitegrid")

In [61]:
data_path = '/home/achara/data/achara/'
df = pd.read_csv(data_path + 'pc_data_2017_to_2019_2.csv')

In [62]:
variables = [
        'PC_Water_EXO',
 # 'DO-Sat_Water_EXO',
    'SensorDepth_Water_EXO',  #
    'SolarRad_Air_LiCor',
    'DO_Water_EXO',
    # 'SpCond_Water_EXO', #
    'Temperature_Water_EXO',  #
    'pH_Water_EXO',  #
    # 'PRECTOTCORR',
    'rel_fl',
    'Chla_Water_EXO',
    'NO3',
    'NH4',  #
    'ALLSKY_SFC_LW_DWN'
]

In [63]:
features = [
    # 'DO-Sat_Water_EXO',
    'SensorDepth_Water_EXO',  #
    'SolarRad_Air_LiCor',
    'DO_Water_EXO',
    # 'SpCond_Water_EXO', #
    'Temperature_Water_EXO',  #
    'pH_Water_EXO',  #
    # 'PRECTOTCORR',
    'rel_fl',
    'Chla_Water_EXO',
    'NO3',
    'NH4',  #
    'ALLSKY_SFC_LW_DWN']

In [64]:
# features = ['DO-Sat_Water_EXO',
#             'PP',
#             'SolarRad_Air_LiCor',
#             'DO_Water_EXO',
#             'PRECTOTCORR',
#             'rel_fl',
#             'Chla_Water_EXO',
#             'NO3',
#             'ALLSKY_SFC_LW_DWN',
#             'Chla_Water_EXO']

In [65]:
y = df.PC_Water_EXO

In [66]:
df = df[variables]

In [67]:
df.head()

Unnamed: 0,PC_Water_EXO,SensorDepth_Water_EXO,SolarRad_Air_LiCor,DO_Water_EXO,Temperature_Water_EXO,pH_Water_EXO,rel_fl,Chla_Water_EXO,NO3,NH4,ALLSKY_SFC_LW_DWN
0,0.1556,1.01,127.38941,9.17,20.38,8.010344,0.88,0.358058,0.191,0.005,310.373726
1,0.152,1.03,127.38941,9.18,20.386667,8.027139,0.862,0.544445,0.191,0.005,310.373726
2,0.1304,1.05,127.38941,9.16,20.22,8.019958,0.844,0.843708,0.191,0.005,310.373726
3,0.1592,1.07,127.38941,9.16,20.41,8.038803,0.826,0.939807,0.191,0.005,310.373726
4,0.134,1.09,127.38941,9.15,20.42,8.024188,0.835,0.797623,0.191,0.005,310.373726


## Splitting the data into train and test

In [68]:
n_rows = len(df)
train_size_percent = 0.8
train_size = round(train_size_percent * n_rows)
test_size = n_rows - train_size

In [69]:
train_df, test_df = train_test_split(df, train_size=train_size, test_size=test_size)

In [70]:
train_x = train_df[features]
train_y = train_df.PC_Water_EXO

test_x = test_df[features]
test_y = test_df.PC_Water_EXO

In [71]:
ms = MinMaxScaler()

In [72]:
train_x_scaled = ms.fit_transform(train_x)
test_x_scaled = ms.fit_transform(test_x)

## Regression model

In [73]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV as RSCV

In [74]:
rf = RandomForestRegressor()

In [75]:
param_grid = {'n_estimators': np.arange(50, 200, 15),
              'max_features': np.arange(0.1, 1, 0.1),
              'max_depth': [3, 5, 7, 9],
              'max_samples': [0.3, 0.5, 0.8]}

In [76]:
model = RSCV(RandomForestRegressor(), param_grid, n_iter=15).fit(train_x, train_y)

In [77]:
model.best_params_

{'n_estimators': 50, 'max_samples': 0.8, 'max_features': 0.8, 'max_depth': 9}

In [78]:
model.best_estimator_

RandomForestRegressor(max_depth=9, max_features=0.8, max_samples=0.8,
                      n_estimators=50)

## Score of the model

In [79]:
model.best_score_

0.9254007208652097

In [80]:
y_pred = model.best_estimator_.predict(test_x)

In [81]:
score = 1 - mean_absolute_error(test_y, y_pred)

In [82]:
score * 100

97.15644728876386

## Build a new model on scaled data

In [83]:
model2 = RSCV(RandomForestRegressor(), param_grid, n_iter=15).fit(train_x_scaled, train_y)

In [84]:
model2.best_score_

0.9263921852857848

In [85]:
model2.best_estimator_

RandomForestRegressor(max_depth=9, max_features=0.8, max_samples=0.5,
                      n_estimators=185)

In [86]:
model2.best_params_

{'n_estimators': 185, 'max_samples': 0.5, 'max_features': 0.8, 'max_depth': 9}

In [87]:
y_pred2 = model2.best_estimator_.predict(test_x_scaled)

In [88]:
1 - mean_absolute_error(test_y, y_pred2)

0.9337273603862619