## Running regression again with median imputation

In [45]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures
import sklearn.metrics
import pandas as pd
import numpy as np
import os
import pickle

In [99]:
# Load the cleaned data
water_path = r"..\LTRM data\water_data_qfneg.csv"
water_data = pd.read_csv(water_path, low_memory = False)

continuous = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
seasons = {3:"SPRING",4:"SPRING",5:"SPRING",6:"SUMMER",7:"SUMMER",8:"SUMMER",9:"FALL",10:"FALL",11:"FALL",12:"WINTER",1:"WINTER",2:"WINTER"}

print("Now adding a year column")
water_data["YEAR"] = pd.DatetimeIndex(water_data["DATE"]).year
print("Now adding a month column")
water_data["MONTH"] = pd.DatetimeIndex(water_data["DATE"]).month
print("Now adding a season column")
water_data["SEASON"] = water_data["MONTH"]
water_data = water_data.replace({"SEASON":seasons})
print("Dropping TN outliers")
water_data.drop([46795,46545,46727],axis=0,inplace=True)
print("\n Water data")
print(water_data.columns)
print(water_data.shape)

Now adding a year column
Now adding a month column
Now adding a season column
Dropping TN outliers

 Water data
Index(['SHEETBAR', 'DATE', 'LATITUDE', 'LONGITUDE', 'FLDNUM', 'STRATUM',
       'LOCATCD', 'TN', 'TP', 'TEMP', 'DO', 'TURB', 'COND', 'VEL', 'SS', 'WDP',
       'CHLcal', 'SECCHI', 'YEAR', 'MONTH', 'SEASON'],
      dtype='object')
(82478, 21)


Looking at extreme outliers for model building, then removing them

In [100]:
water_data["TN"].sort_values(ascending=False)

29136    46.989
55650    32.965
19178    22.939
55611    22.677
59504    22.576
          ...  
82475       NaN
82477       NaN
82478       NaN
82479       NaN
82480       NaN
Name: TN, Length: 82478, dtype: float64

Example for imputing medians into each column, not used after ambers suggesting of imputing separately

In [101]:
print("Testing multivariate polynomial interpolation, using every other variable as a predictor besides target variable")
print("Imputing medians")
imputed_water_data = water_data.copy()

for col in continuous:
    median = imputed_water_data[col].median()
    print("Median for ",col," is ",round(median,4))
    imputed_water_data[col].fillna(value=median,inplace=True)

# Checking to see if we lost any values
qualdata = imputed_water_data.dropna(axis=0, how='any', thresh=None, subset=continuous, inplace=False)
print(qualdata.shape)
print("Filtering out colums that we dont need")
qualdata.drop(qualdata.columns.difference(continuous), axis=1, inplace=True)
print(qualdata.shape)

Testing multivariate polynomial interpolation, using every other variable as a predictor besides target variable
Imputing medians
Median for  TN  is  2.53
Median for  TP  is  0.163
Median for  TEMP  is  14.7
Median for  DO  is  9.7
Median for  TURB  is  21.0
Median for  COND  is  462.0
Median for  VEL  is  0.1
Median for  SS  is  25.7
Median for  WDP  is  2.24
Median for  CHLcal  is  16.6519
Median for  SECCHI  is  41.0
(82478, 21)
Filtering out colums that we dont need
(82478, 11)


Water dataset has no missing variables in any variable

In [43]:
os.mkdir("Regression Models\\Imputed_data")

In [102]:
var = "TN"
predictors = continuous.copy()
predictors.remove(var)

# Define predictor and target data
X = np.array(qualdata[predictors])
y = np.array(qualdata[var])

# Get median of first attribute
np.nanmedian([item[2] for item in X])

9.7

In [103]:
np.nanmedian([item[2] for item in X])

9.7

In [104]:
var = "TN"
predictors = continuous.copy()
predictors.remove(var)

# Define predictor and target data
X = np.array(qualdata[predictors])
y = np.array(qualdata[var])

# Impute medians for X_train
for index,predictor in enumerate(predictors):
    median = np.nanmedian([item[index] for item in X])
    print("Median for ",predictor," is ",round(median,4))

Median for  TP  is  0.163
Median for  TEMP  is  14.7
Median for  DO  is  9.7
Median for  TURB  is  21.0
Median for  COND  is  462.0
Median for  VEL  is  0.1
Median for  SS  is  25.7
Median for  WDP  is  2.24
Median for  CHLcal  is  16.6519
Median for  SECCHI  is  41.0


In [62]:
X.shape[1]

10

In [64]:
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [118]:
models = ["TN","TP","VEL"]
degs = [1,2]

for var in models:
    #os.mkdir("Regression Models\\Imputed_data\\"+var)
    
    # Filter data by non na var columns
    qualdata = water_data.dropna(axis=0, how='any', thresh=None, subset=[var], inplace=False)
    
    for deg in degs:
        print("\n-----------------------------------")
        print("Building model for ",var)
        
        # Make path for outputs of this model
        #os.mkdir("Regression Models\\Imputed_data\\"+var+"\\degree"+str(deg))
        #path = "Regression Models\\Imputed_data\\"+var+"\\degree"+str(deg)+"\\"
        
        predictors = continuous.copy()
        predictors.remove(var)

        # Define predictor and target data
        X = np.array(qualdata[predictors])
        y = np.array(qualdata[var])

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
        
        print("Imputing medians for training set")
        # Impute medians for X_train
        for index,predictor in enumerate(predictors):
            median = np.nanmedian([item[index] for item in X_train])
            #print("Median for ",predictor," is ",round(median,4))
            # Loop through the index of each indice looking for nan, replace with median
            for item in X_train:
                if np.isnan(item[index]):
                    # Impute median
                    item[index] = median
                    
            curvar = [item[index] for item in X_train]
            assert not any(np.isnan(curvar)), "Imputation didn't work"
        
        print("Imputing medians for test set")
        # Impute medians for X_test
        for index,predictor in enumerate(predictors):
            median = np.nanmedian([item[index] for item in X_test])
            #print("Median for ",predictor," is ",round(median,4))
            # Loop through the index of each indice looking for nan, replace with median
            for item in X_test:
                if np.isnan(item[index]):
                    # Impute median
                    item[index] = median
                    
            curvar = [item[index] for item in X_test]
            assert not any(np.isnan(curvar)), "Imputation didn't work"
        

        # Good idea to standardize predictor attributes on only the training set, but use it on both
        scaler = RobustScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        pickle.dump(scaler,open(path+"scaler.p", "wb" ))

        # Save train and test sets
        pickle.dump(X_train,open(path+"X_train.p","wb"))
        pickle.dump(X_test,open(path+"X_test.p","wb"))
        pickle.dump(y_train,open(path+"y_train.p","wb"))
        pickle.dump(y_test,open(path+"y_test.p","wb"))


        # Finally, we build the model, fit it to the full training data, and
        # estimate its out-of-sample performance by applying it to the test set
        best_poly = PolynomialFeatures(deg)
        best_lm = LinearRegression(fit_intercept=False)
        best_lm.fit(best_poly.fit_transform(X_train_scaled), y_train)

        # Save the best model and best polynomialfeatures
        pickle.dump(best_lm,open(path+"best_model_deg"+str(deg)+".p","wb"))
        pickle.dump(best_poly,open(path+"best_poly.p","wb"))


        # Estimate performance on test set:
        MSE = np.mean((y_test - best_lm.predict(best_poly.transform(X_test_scaled))) ** 2)
        RMSE = np.sqrt(MSE)
        MAE = np.mean(abs(y_test - best_lm.predict(best_poly.transform(X_test_scaled))))
        print(f'Degree {deg} polynomial has RMSE = {RMSE:.5f}')
        print(f'Degree {deg} polynomial has MAE = {MAE:.5f}')


-----------------------------------
Building model for  TN
Imputing medians for training set
Imputing medians for test set
Degree 1 polynomial has RMSE = 1.17639
Degree 1 polynomial has MAE = 0.85433

-----------------------------------
Building model for  TN
Imputing medians for training set
Imputing medians for test set
Degree 2 polynomial has RMSE = 1.12375
Degree 2 polynomial has MAE = 0.81562

-----------------------------------
Building model for  TP
Imputing medians for training set
Imputing medians for test set
Degree 1 polynomial has RMSE = 0.12826
Degree 1 polynomial has MAE = 0.06691

-----------------------------------
Building model for  TP
Imputing medians for training set
Imputing medians for test set
Degree 2 polynomial has RMSE = 0.12189
Degree 2 polynomial has MAE = 0.05733

-----------------------------------
Building model for  VEL
Imputing medians for training set
Imputing medians for test set
Degree 1 polynomial has RMSE = 0.29541
Degree 1 polynomial has MAE = 0.