# Multivariate Polynomial Regression Interpolation
## Degrees 1 and 2
### Written and compiled by Casey McKean

### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures
import sklearn.metrics

### Load and Prepare Data

In [6]:
# Location of the raw water-quality export, relative to this notebook.
# Forward slashes are used so the path resolves on Windows, macOS and Linux
# alike (a backslash-separated path only works on Windows).
filepath = "../LTRM data/water_data_qfneg.csv"
water_data = pd.read_csv(filepath, low_memory=False)

# Define our continuous variables
continuous = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']

# Row labels whose TN readings are treated as outliers and blanked out below.
# NOTE(review): these are hard-coded index labels for this specific CSV export;
# they must be re-derived if the file is regenerated or re-ordered.
tn_outlier_rows = [46795, 46545, 46727]

print("Dropping columns we don't need")
# Keep only the continuous variables. Reassigning (rather than dropping in
# place) leaves a fresh frame, so re-running later steps never sees a
# half-modified object.
water_data = water_data.drop(columns=water_data.columns.difference(continuous))

# Set to NA
print("Setting TN outliers to NA")
water_data.loc[tn_outlier_rows, "TN"] = np.nan

water_data.head()

Dropping columns we don't need
Setting TN outliers to NA


Unnamed: 0,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI
0,,,23.0,6.6,28.0,550.0,,42.3,2.2,9.44875,40.0
1,4.876,0.229,23.0,6.6,28.0,554.0,,37.6,8.2,8.2423,42.0
2,,,22.9,6.3,24.0,564.0,,34.1,4.3,8.72488,43.0
3,4.257,0.212,22.9,6.4,28.0,563.0,,33.4,9.1,8.48359,38.0
4,,,23.0,6.6,33.0,556.0,,48.0,6.7,9.52918,45.0


### Build models and report errors

In [8]:
# Polynomial degrees to fit, and the target variables that get their own model.
degrees = [1,2]
models = ['TN','TP','VEL']

# Pick a random seed to ensure reproducibility of the train/test split
seed = 13


for var in models:
    for deg in degrees:
        print("\n-----------------------------------")
        print("Building model for ",var," degree ",deg)

        # Predictors for this variable are every other continuous variable
        predictors = [name for name in continuous if name != var]

        # NOTE(review): everything from here down to the train/test split does
        # not depend on `deg`, so it is recomputed identically for each degree.
        # It is kept inside the inner loop so the printed log is unchanged.
        print("Removing rows with missing ",var," values")
        cur_data = water_data[water_data[var].notna()].copy()
        print("Imputing missing values with median for predictor variables")
        # NOTE(review): the medians (and the scaler below) are computed on all
        # rows *before* the train/test split, so test-set statistics leak into
        # the preprocessing. Left as-is to keep the reported errors comparable.
        cur_data = cur_data.fillna(cur_data.median())

        X = cur_data[predictors].to_numpy()
        y = cur_data[var].to_numpy()

        # Good idea to standardize predictor attributes for numerical stability
        scaler = RobustScaler().fit(X)
        X_standard = scaler.transform(X)

        # Optional: save the scaler for this model (adjust path accordingly)
        #path = "Regression Models\\"+var+"extra\\"
        #pickle.dump(scaler,open(path+"scaler.p", "wb" ))

        # Split data into training and test sets (80% / 20%)
        X_train, X_test, y_train, y_test = train_test_split(
            X_standard, y, train_size=0.8, random_state=seed
        )

        # Optional: save train and test sets
        #pickle.dump(X_train,open(path+"X_train.p","wb"))
        #pickle.dump(X_test,open(path+"X_test.p","wb"))
        #pickle.dump(y_train,open(path+"y_train.p","wb"))
        #pickle.dump(y_test,open(path+"y_test.p","wb"))
        print("Train set size: ",len(X_train))
        print("Test set size: ",len(y_test))

        # Create the polynomial features of the current degree
        poly = PolynomialFeatures(deg)

        # Fit the regression without an intercept because the polynomial
        # expansion already adds a constant "1" feature
        lm = LinearRegression(fit_intercept=False)
        lm.fit(poly.fit_transform(X_train), y_train)

        # Optional: save the fitted model and its feature expansion
        #pickle.dump(lm,open(path+"best_model_deg"+str(deg)+".p","wb"))
        #pickle.dump(poly,open(path+"best_poly.p","wb"))

        # Calculate and report errors on the test set. Predictions are computed
        # once and shared by both metrics.
        y_pred = lm.predict(poly.transform(X_test))
        MAE = sklearn.metrics.mean_absolute_error(y_test, y_pred)
        # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated and later removed from scikit-learn,
        # whereas this form works on every version.
        RMSE = np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred))
        print(f"The MAE is {MAE:8f}")
        print(f"The RMSE is {RMSE:8f}")


-----------------------------------
Building model for  TN  degree  1
Removing rows with missing  TN  values
Imputing missing values with median for predictor variables
Train set size:  25748
Test set size:  6437
The MAE is 0.852447
The RMSE is 1.176410

-----------------------------------
Building model for  TN  degree  2
Removing rows with missing  TN  values
Imputing missing values with median for predictor variables
Train set size:  25748
Test set size:  6437
The MAE is 0.816302
The RMSE is 1.126301

-----------------------------------
Building model for  TP  degree  1
Removing rows with missing  TP  values
Imputing missing values with median for predictor variables
Train set size:  25160
Test set size:  6290
The MAE is 0.067799
The RMSE is 0.124557

-----------------------------------
Building model for  TP  degree  2
Removing rows with missing  TP  values
Imputing missing values with median for predictor variables
Train set size:  25160
Test set size:  6290
The MAE is 0.056976
T