In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [52]:
pd.set_option("max_rows", 25)

In [53]:
# Apply the default theme
sns.set_theme()
sns.set_style("whitegrid")

In [54]:
data_path = '/home/achara/data/achara/'
df = pd.read_csv(data_path + 'pc_data_2017_to_2019_2.csv')

In [55]:
features = ['DO-Sat_Water_EXO',
            'PP',
            'SolarRad_Air_LiCor',
            'DO_Water_EXO',
            'PRECTOTCORR',
            'rel_fl',
            'Chla_Water_EXO',
            'NO3',
            'rel_fl',
            'ALLSKY_SFC_LW_DWN',
            'Chla_Water_EXO']

y = df.PC_Water_EXO

In [56]:
df = df[['PC_Water_EXO', 'DO-Sat_Water_EXO',
         'PP',
         'SolarRad_Air_LiCor',
         'DO_Water_EXO',
         'PRECTOTCORR',
         'rel_fl',
         'Chla_Water_EXO',
         'NO3',
         'rel_fl',
         'ALLSKY_SFC_LW_DWN',
         'Chla_Water_EXO']]

In [57]:
df.head()

Unnamed: 0,PC_Water_EXO,DO-Sat_Water_EXO,PP,SolarRad_Air_LiCor,DO_Water_EXO,PRECTOTCORR,rel_fl,Chla_Water_EXO,NO3,rel_fl.1,ALLSKY_SFC_LW_DWN,Chla_Water_EXO.1
0,0.1556,101.61,16.681741,127.38941,9.17,4.319851,0.88,0.358058,0.191,0.88,310.373726,0.358058
1,0.152,101.776667,16.681741,127.38941,9.18,4.319851,0.862,0.544445,0.191,0.862,310.373726,0.544445
2,0.1304,101.23,16.681741,127.38941,9.16,4.319851,0.844,0.843708,0.191,0.844,310.373726,0.843708
3,0.1592,101.59,16.681741,127.38941,9.16,4.319851,0.826,0.939807,0.191,0.826,310.373726,0.939807
4,0.134,101.49,16.681741,127.38941,9.15,4.319851,0.835,0.797623,0.191,0.835,310.373726,0.797623


## Splitting the data into train and test

In [58]:
n_rows = len(df)
train_size_percent = 0.8
train_size = round(train_size_percent * n_rows)
test_size = n_rows - train_size

In [59]:
train_df, test_df = train_test_split(df, train_size=train_size, test_size=test_size)

In [60]:
train_x = train_df[features]
train_y = train_df.PC_Water_EXO

test_x = test_df[features]
test_y = test_df.PC_Water_EXO

In [61]:
ms = MinMaxScaler()

In [62]:
train_x_scaled = ms.fit_transform(train_x)
test_x_scaled = ms.fit_transform(test_x)

## Regression model

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV as RSCV

In [64]:
rf = RandomForestRegressor()

In [65]:
param_grid = {'n_estimators': np.arange(50, 200, 15),
              'max_features': np.arange(0.1, 1, 0.1),
              'max_depth': [3, 5, 7, 9],
              'max_samples': [0.3, 0.5, 0.8]}

In [66]:
model = RSCV(RandomForestRegressor(), param_grid, n_iter=15).fit(train_x, train_y)

In [67]:
model.best_params_

{'n_estimators': 170,
 'max_samples': 0.3,
 'max_features': 0.7000000000000001,
 'max_depth': 9}

In [68]:
model.best_estimator_

RandomForestRegressor(max_depth=9, max_features=0.7000000000000001,
                      max_samples=0.3, n_estimators=170)

## Score of the model

In [69]:
model.best_score_

0.8603318646967428

In [70]:
y_pred = model.best_estimator_.predict(test_x)

In [71]:
score = 1 - mean_absolute_error(test_y, y_pred)

In [72]:
score * 100

96.15962960929379

## Build a new model on scaled data

In [73]:
model2 = RSCV(RandomForestRegressor(), param_grid, n_iter=15).fit(train_x_scaled, train_y)

In [74]:
model2.best_score_

0.8587836384725522

In [75]:
model2.best_estimator_

RandomForestRegressor(max_depth=9, max_features=0.6, max_samples=0.5,
                      n_estimators=80)

In [76]:
model2.best_params_

{'n_estimators': 80, 'max_samples': 0.5, 'max_features': 0.6, 'max_depth': 9}

In [77]:
y_pred2 = model2.best_estimator_.predict(test_x_scaled)

In [78]:
1 - mean_absolute_error(test_y, y_pred2)

0.9341962227831088