Running regression again with mean imputation

In [8]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures
import sklearn.metrics
import pandas as pd
import numpy as np

In [4]:
# Load the cleaned data
water_path = r"..\LTRM data\water_data_qfneg.csv"
water_data = pd.read_csv(water_path, low_memory = False)

continuous = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
seasons = {3:"SPRING",4:"SPRING",5:"SPRING",6:"SUMMER",7:"SUMMER",8:"SUMMER",9:"FALL",10:"FALL",11:"FALL",12:"WINTER",1:"WINTER",2:"WINTER"}

print("Now adding a year column")
water_data["YEAR"] = pd.DatetimeIndex(water_data["DATE"]).year
print("Now adding a month column")
water_data["MONTH"] = pd.DatetimeIndex(water_data["DATE"]).month
print("Now adding a season column")
water_data["SEASON"] = water_data["MONTH"]
water_data = water_data.replace({"SEASON":seasons})

print("\n Water data")
print(water_data.columns)
print(water_data.shape)


Now adding a year column
Now adding a month column
Now adding a season column

 Water data
Index(['SHEETBAR', 'DATE', 'LATITUDE', 'LONGITUDE', 'FLDNUM', 'STRATUM',
       'LOCATCD', 'TN', 'TP', 'TEMP', 'DO', 'TURB', 'COND', 'VEL', 'SS', 'WDP',
       'CHLcal', 'SECCHI', 'YEAR', 'MONTH', 'SEASON'],
      dtype='object')
(82481, 21)


In [9]:
print("Testing multivariate polynomial interpolation, using every other variable as a predictor besides target variable")
print("Imputing Means into missing variables")
imputed_water_data = water_data.copy()

for col in continuous:
    imputed_water_data[col].fillna(value=imputed_water_data[col].mean,inplace=True)

# Checking to see if we lost any values
qualdata = imputed_water_data.dropna(axis=0, how='any', thresh=None, subset=continuous, inplace=False).copy()
print(qualdata.shape)
print("Filtering out colums that we dont need")
qualdata.drop(qualdata.columns.difference(continuous), 1, inplace=True)
print(qualdata.shape)

Testing multivariate polynomial interpolation, using every other variable as a predictor besides target variable
Imputing Means into missing variables
(82481, 21)
Filtering out colums that we dont need
(82481, 11)


Water dataset has no missing variables in any variable

In [11]:
models = ["TN","TP","VEL"]
degs = [1,2]

for var in models:
    for deg in degs:
        print("\n-----------------------------------")
        print("Building model for ",var)
        predictors = continuous.copy()
        predictors.remove(var)

        X = np.array(qualdata[predictors])
        y = np.array(qualdata[var])

        # Good idea to standardize predictor attributes - assumes each variable has a decently normal distribution
        scaler = RobustScaler().fit(X)
        X_standard = scaler.transform(X)

        # Save the scaler for this model
        os.mkdir("Regression Models\\Imputed_data\\"+var+"\\degree"+deg)
        path = "Regression Models\\Imputed_data\\"+var+"\\degree"+deg+"\\"
        pickle.dump(scaler,open(path+"scaler.p", "wb" ))

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_standard, y, train_size=0.8)

        # Save train and test sets
        pickle.dump(X_train,open(path+"X_train.p","wb"))
        pickle.dump(X_test,open(path+"X_test.p","wb"))
        pickle.dump(y_train,open(path+"y_train.p","wb"))
        pickle.dump(y_test,open(path+"y_test.p","wb"))


        # Finally, we build the model, fit it to the full training data, and
        # estimate its out-of-sample performance by applying it to the test set
        best_poly = PolynomialFeatures(deg)
        best_lm = LinearRegression(fit_intercept=False)
        best_lm.fit(best_poly.fit_transform(X_train), y_train)

        pickle.dump(best_lm,open(path+"best_model_deg"+str(deg)+".p","wb"))
        pickle.dump(best_poly,open(path+"best_poly.p","wb"))


        # Estimate performance on test set:
        MSE = np.mean((y_test - best_lm.predict(best_poly.transform(X_test))) ** 2)
        RMSE = np.sqrt(MSE)
        MAE = np.mean(abs(y_test - best_lm.predict(best_poly.transform(X_test))))
        print(f'Degree {deg} polynomial has RMSE = {RMSE:.5f}')
        print(f'Degree {deg} polynomial has MAE = {MAE:.5f}')


-----------------------------------
Building model for  TN


TypeError: float() argument must be a string or a number, not 'method'