Running regression again with median imputation

In [45]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures
import sklearn.metrics
import pandas as pd
import numpy as np
import os
import pickle

In [15]:
# Load the cleaned data.
# The path is assembled with os.path.join so the notebook is not tied to the
# Windows path separator.
water_path = os.path.join("..", "LTRM data", "water_data_qfneg.csv")
water_data = pd.read_csv(water_path, low_memory = False)

# Continuous measurement columns; later cells pick targets and predictors from this list
continuous = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
# Month number -> season label
seasons = {3:"SPRING",4:"SPRING",5:"SPRING",6:"SUMMER",7:"SUMMER",8:"SUMMER",9:"FALL",10:"FALL",11:"FALL",12:"WINTER",1:"WINTER",2:"WINTER"}

# Parse the DATE column once and reuse it for both derived columns
sample_dates = pd.DatetimeIndex(water_data["DATE"])

print("Now adding a year column")
water_data["YEAR"] = sample_dates.year
print("Now adding a month column")
water_data["MONTH"] = sample_dates.month
print("Now adding a season column")
# map() translates only the MONTH values instead of running replace() over the whole frame
water_data["SEASON"] = water_data["MONTH"].map(seasons)

print("\n Water data")
print(water_data.columns)
print(water_data.shape)

Now adding a year column
Now adding a month column
Now adding a season column

 Water data
Index(['SHEETBAR', 'DATE', 'LATITUDE', 'LONGITUDE', 'FLDNUM', 'STRATUM',
       'LOCATCD', 'TN', 'TP', 'TEMP', 'DO', 'TURB', 'COND', 'VEL', 'SS', 'WDP',
       'CHLcal', 'SECCHI', 'YEAR', 'MONTH', 'SEASON'],
      dtype='object')
(82481, 21)


In [63]:
# Target variables; a row is removed when any one of them is missing
target = ["TP","TN","VEL"]
print("Dropping rows with missing values in target variables")
# dropna already defaults to how="any", axis=0 and returns a new frame.
# Passing how= together with thresh= (even thresh=None) raises TypeError on
# pandas >= 2.0, so only the subset is given.
qualdata = water_data.dropna(subset=target)
print(qualdata.shape)

Dropping rows with missing values in target variables
(21354, 21)


Example of imputing the median into each column. This approach is not used after Amber's suggestion of imputing separately.

In [51]:
print("Testing multivariate polynomial interpolation, using every other variable as a predictor besides target variable")
print("Imputing medians")
# Work on a copy so the raw water_data frame stays untouched
imputed_water_data = water_data.copy()

for col in continuous:
    median = imputed_water_data[col].median()
    print("Median for ",col," is ",round(median,4))
    # Assign the filled column back: fillna(inplace=True) on df[col] is chained
    # assignment, which warns on pandas 2.x and does not update the frame under
    # copy-on-write.
    imputed_water_data[col] = imputed_water_data[col].fillna(median)

# Checking to see if we lost any values
# (how= and thresh= cannot both be passed on pandas >= 2.0; the defaults already mean "any")
qualdata = imputed_water_data.dropna(subset=continuous)
print(qualdata.shape)
print("Filtering out colums that we dont need")
# Keep only the continuous columns; build a new frame rather than mutating in place
qualdata = qualdata.drop(columns=qualdata.columns.difference(continuous))
print(qualdata.shape)

Testing multivariate polynomial interpolation, using every other variable as a predictor besides target variable
Imputing Means into missing variables
Median for  TN  is  2.531
Median for  TP  is  0.163
Median for  TEMP  is  14.7
Median for  DO  is  9.7
Median for  TURB  is  21.0
Median for  COND  is  462.0
Median for  VEL  is  0.1
Median for  SS  is  25.7
Median for  WDP  is  2.24
Median for  CHLcal  is  16.65
Median for  SECCHI  is  41.0
(82481, 21)
Filtering out colums that we dont need
(82481, 11)


The water dataset now has no missing values in any variable

In [43]:
# Root output folder for the models built on imputed data.
# makedirs(exist_ok=True) also creates the "Regression Models" parent and makes
# this cell safe to re-run; os.path.join keeps the path platform-independent.
os.makedirs(os.path.join("Regression Models", "Imputed_data"), exist_ok=True)

In [52]:
# Choose the target explicitly instead of relying on a `var` left over from a
# loop in another cell (which fails on a fresh kernel). "VEL" is the value the
# recorded outputs of the following cells correspond to: column 2 of X has
# median 14.7, the TEMP median.
var = "VEL"

# Every continuous variable except the target is a predictor
predictors = continuous.copy()
predictors.remove(var)

X = np.array(qualdata[predictors])
y = np.array(qualdata[var])

In [59]:
# Median of the third predictor column of X (index 2), taken with a column
# slice rather than a Python-level loop over the rows
np.median(X[:, 2])

14.7

In [62]:
# Number of columns (predictors) in X
np.shape(X)[1]

10

In [47]:
models = ["TN","TP","VEL"]
degs = [1,2]

# Fixed seed so the train/test split, and therefore the reported errors, are
# reproducible. Both degrees of a target also get the identical split, which
# makes their errors directly comparable.
RANDOM_STATE = 42
# Root folder for every artifact written by this cell
OUTPUT_ROOT = os.path.join("Regression Models", "Imputed_data")


def _save_pickle(obj, file_path):
    """Pickle ``obj`` to ``file_path``, closing the file handle afterwards."""
    with open(file_path, "wb") as handle:
        pickle.dump(obj, handle)


for var in models:
    # Every continuous variable except the target is a predictor.
    # None of this depends on the polynomial degree, so it is done once per target.
    predictors = continuous.copy()
    predictors.remove(var)

    # float dtype so that np.isnan can be applied below
    X = np.array(qualdata[predictors], dtype=float)
    y = np.array(qualdata[var])

    for deg in degs:
        print("\n-----------------------------------")
        print("Building model for ",var)

        # One folder per target and degree. makedirs(exist_ok=True) creates any
        # missing parent folder and lets the cell be re-run.
        path = os.path.join(OUTPUT_ROOT, var, "degree"+str(deg))
        os.makedirs(path, exist_ok=True)

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.8, random_state=RANDOM_STATE
        )

        # Impute medians for X_train: the medians are computed from the training
        # rows only and then used to fill the gaps in both sets, so nothing from
        # the test set leaks into the imputation. This is a no-op when qualdata
        # has no missing predictor values.
        train_medians = np.nanmedian(X_train, axis=0)
        X_train = np.where(np.isnan(X_train), train_medians, X_train)
        X_test = np.where(np.isnan(X_test), train_medians, X_test)

        # Good idea to standardize predictor attributes - assumes each variable has a decently normal distribution
        # The scaler is fitted on the training set only and applied to both sets.
        scaler = RobustScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Save the scaler for this model
        _save_pickle(scaler, os.path.join(path, "scaler.p"))

        # Save train and test sets (imputed but unscaled; apply scaler.p before predicting)
        _save_pickle(X_train, os.path.join(path, "X_train.p"))
        _save_pickle(X_test, os.path.join(path, "X_test.p"))
        _save_pickle(y_train, os.path.join(path, "y_train.p"))
        _save_pickle(y_test, os.path.join(path, "y_test.p"))

        # Finally, we build the model, fit it to the full training data, and
        # estimate its out-of-sample performance by applying it to the test set.
        # The model is trained on the scaled features so that it agrees with the
        # saved scaler (previously the scaled array was computed but never used).
        # fit_intercept=False because PolynomialFeatures already adds the bias column.
        best_poly = PolynomialFeatures(deg)
        best_lm = LinearRegression(fit_intercept=False)
        best_lm.fit(best_poly.fit_transform(X_train_scaled), y_train)

        _save_pickle(best_lm, os.path.join(path, "best_model_deg"+str(deg)+".p"))
        _save_pickle(best_poly, os.path.join(path, "best_poly.p"))

        # Estimate performance on test set (predict once, reuse the residuals):
        residuals = y_test - best_lm.predict(best_poly.transform(X_test_scaled))
        MSE = np.mean(residuals ** 2)
        RMSE = np.sqrt(MSE)
        MAE = np.mean(np.abs(residuals))
        print(f'Degree {deg} polynomial has RMSE = {RMSE:.5f}')
        print(f'Degree {deg} polynomial has MAE = {MAE:.5f}')


-----------------------------------
Building model for  TN
Degree 1 polynomial has RMSE = 1.66223
Degree 1 polynomial has MAE = 0.51931

-----------------------------------
Building model for  TN
Degree 2 polynomial has RMSE = 2.00489
Degree 2 polynomial has MAE = 0.52310

-----------------------------------
Building model for  TP
Degree 1 polynomial has RMSE = 0.09866
Degree 1 polynomial has MAE = 0.05231

-----------------------------------
Building model for  TP
Degree 2 polynomial has RMSE = 0.10723
Degree 2 polynomial has MAE = 0.05288

-----------------------------------
Building model for  VEL
Degree 1 polynomial has RMSE = 0.30540
Degree 1 polynomial has MAE = 0.18037

-----------------------------------
Building model for  VEL
Degree 2 polynomial has RMSE = 0.28660
Degree 2 polynomial has MAE = 0.16802
