In [174]:

import pandas as pd
from sklearn.model_selection import train_test_split

In [175]:
pd.set_option("max_rows", 25)

In [176]:
data_path = '/home/achara/data/achara/'
df = pd.read_csv(data_path + 'pc_data_2017_to_2019_2.csv')

In [177]:
df.head()

Unnamed: 0,DateTime,SensorDepth_Water_EXO,PC_Water_EXO,Chla_Water_EXO,DO_Water_EXO,DO-Sat_Water_EXO,SpCond_Water_EXO,Temperature_Water_EXO,pH_Water_EXO,rel_fl,...,TP,TSP,SRP,TN,NO3,NH4,PP,PRECTOTCORR,ALLSKY_SFC_SW_DWN,ALLSKY_SFC_LW_DWN
0,10/1/2017,1.01,0.1556,0.358058,9.17,101.61,139.81,20.38,8.010344,0.88,...,25.691678,9.009938,5.594708,0.498284,0.191,0.005,16.681741,4.319851,2.424178,310.373726
1,10/1/2017,1.03,0.152,0.544445,9.18,101.776667,139.786667,20.386667,8.027139,0.862,...,25.691678,9.009938,5.594708,0.498284,0.191,0.005,16.681741,4.319851,2.424178,310.373726
2,10/1/2017,1.05,0.1304,0.843708,9.16,101.23,139.66,20.22,8.019958,0.844,...,25.691678,9.009938,5.594708,0.498284,0.191,0.005,16.681741,4.319851,2.424178,310.373726
3,10/1/2017,1.07,0.1592,0.939807,9.16,101.59,139.78,20.41,8.038803,0.826,...,25.691678,9.009938,5.594708,0.498284,0.191,0.005,16.681741,4.319851,2.424178,310.373726
4,10/1/2017,1.09,0.134,0.797623,9.15,101.49,139.75,20.42,8.024188,0.835,...,25.691678,9.009938,5.594708,0.498284,0.191,0.005,16.681741,4.319851,2.424178,310.373726


## Feature scaling and Linear model building

In [178]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [179]:
df.columns

Index(['DateTime', 'SensorDepth_Water_EXO', 'PC_Water_EXO', 'Chla_Water_EXO',
       'DO_Water_EXO', 'DO-Sat_Water_EXO', 'SpCond_Water_EXO',
       'Temperature_Water_EXO', 'pH_Water_EXO', 'rel_fl', 'month', 'day',
       'year', 'SolarRad_Air_LiCor', 'TP', 'TSP', 'SRP', 'TN', 'NO3', 'NH4',
       'PP', 'PRECTOTCORR', 'ALLSKY_SFC_SW_DWN', 'ALLSKY_SFC_LW_DWN'],
      dtype='object')

In [180]:
features = [
    # 'DO-Sat_Water_EXO',
    'SensorDepth_Water_EXO',  #
    'SolarRad_Air_LiCor',
    'DO_Water_EXO',
    # 'SpCond_Water_EXO', #
    'Temperature_Water_EXO',  #
    'pH_Water_EXO',  #
    # 'PRECTOTCORR',
    'rel_fl',
    'Chla_Water_EXO',
    'NO3',
    'NH4',  #
    'ALLSKY_SFC_LW_DWN']

### Feature scaling

In [181]:
n_rows = len(df)
train_size_percent = 0.8
train_size = round(train_size_percent * n_rows)
test_size = n_rows - train_size

In [182]:
train_df, test_df = train_test_split(df, train_size=train_size, test_size=test_size)

In [183]:
train_x = train_df[features]
train_y = train_df.PC_Water_EXO

test_x = test_df[features]
test_y = test_df.PC_Water_EXO

In [184]:
train_x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(train_x), columns=features)
test_x_scaled = pd.DataFrame(MinMaxScaler().fit_transform(test_x), columns=features)

### Model building

In [185]:
def adjusted_r2_score(data_frame, r2_score):
    n = data_frame.shape[0]
    k = data_frame.shape[1]
    return 1 - ((1 - r2_score) * (n - 1) / (n - k - 1))

In [186]:
lm = LinearRegression()

In [187]:
lm.fit(train_x_scaled, train_y)

LinearRegression()

In [188]:
lm.score(train_x_scaled, train_y)

0.5909151563050371

In [189]:
y_pred = lm.predict(test_x)

In [190]:
r2 = lm.score(test_x_scaled, test_y)
print(r2)

0.42947316027671745


In [191]:
n = test_x.shape[0]
k = test_x.shape[1]

adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

In [192]:
print("Adjusted R2: ", adjusted_r2_score(test_x, r2))

Adjusted R2:  0.4293870923851347


In [193]:
import statsmodels.api as sm

In [194]:
x2 = sm.add_constant(test_x)

In [195]:
est = sm.OLS(test_y, x2)

In [196]:
est_fit = est.fit()

In [197]:
est_fit.summary()

0,1,2,3
Dep. Variable:,PC_Water_EXO,R-squared:,0.594
Model:,OLS,Adj. R-squared:,0.594
Method:,Least Squares,F-statistic:,9696.0
Date:,"Sat, 22 Apr 2023",Prob (F-statistic):,0.0
Time:,14:04:13,Log-Likelihood:,60613.0
No. Observations:,66299,AIC:,-121200.0
Df Residuals:,66288,BIC:,-121100.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.1267,0.019,-6.639,0.000,-0.164,-0.089
SensorDepth_Water_EXO,0.0021,5.87e-05,36.250,0.000,0.002,0.002
SolarRad_Air_LiCor,-0.0003,9.85e-06,-28.368,0.000,-0.000,-0.000
DO_Water_EXO,0.0185,0.001,34.939,0.000,0.017,0.020
Temperature_Water_EXO,0.0040,0.000,20.949,0.000,0.004,0.004
pH_Water_EXO,-0.0279,0.003,-10.303,0.000,-0.033,-0.023
rel_fl,0.1778,0.001,239.979,0.000,0.176,0.179
Chla_Water_EXO,0.0212,0.000,59.087,0.000,0.020,0.022
NO3,0.1470,0.004,34.136,0.000,0.139,0.155

0,1,2,3
Omnibus:,26.641,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26.664
Skew:,0.048,Prob(JB):,1.62e-06
Kurtosis:,2.977,Cond. No.,87800.0


In [198]:
est_fit.f_pvalue

0.0