In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)

def power_transform(df, method):
    """Apply a standardized power transform to every column of ``df``.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data handed to ``PowerTransformer.fit_transform``.
    method : {'yeo', 'box'}
        ``'yeo'`` selects the Yeo-Johnson transform, ``'box'`` selects Box-Cox.
        Box-Cox only accepts strictly positive data (scikit-learn raises
        otherwise).

    Returns
    -------
    The output of ``PowerTransformer.fit_transform`` (standardized to zero
    mean / unit variance because ``standardize=True``).

    Raises
    ------
    ValueError
        If ``method`` is neither ``'yeo'`` nor ``'box'``. Previously an error
        message *string* was returned as if it were data, which only surfaced
        later as a confusing failure in the caller.
    """
    sklearn_method_names = {'yeo': 'yeo-johnson', 'box': 'box-cox'}
    # Validate before touching scikit-learn so a typo fails fast and clearly.
    if method not in sklearn_method_names:
        raise ValueError(f"method must be 'yeo' or 'box', got {method!r}")
    power = PowerTransformer(method=sklearn_method_names[method], standardize=True)
    return power.fit_transform(df)

In [None]:
# Load the dataset from 'soil-moisture.csv'; the path is relative, so the file
# must sit in the notebook's working directory.
df = pd.read_csv('soil-moisture.csv')

In [None]:
# Discard the 'Day' column. Note this rebinds `df`, so re-running the cell on
# the already-trimmed frame raises KeyError.
df = df.drop(columns=['Day'])


In [None]:
# Display the distinct labels present in the 'Month' column (the season helpers
# defined further down compare against labels such as 'Jul' or 'Dec').
df['Month'].unique()

In [None]:
def spring(rows):
    """Return 1 if the month label is 'Jul', 'Aug' or 'Sep', otherwise 0."""
    return int(rows in ('Jul', 'Aug', 'Sep'))

def fall(rows):
    """Return 1 if the month label is 'Oct' or 'Nov', otherwise 0."""
    return int(rows in ('Oct', 'Nov'))

def winter(rows):
    """Return 1 if the month label is 'Dec', 'Jan' or 'Feb', otherwise 0."""
    return int(rows in ('Dec', 'Jan', 'Feb'))

def summer(rows):
    """Return 1 if the month label is 'Mar', otherwise 0."""
    return 1 if rows == 'Mar' else 0

# Add one 0/1 indicator column per season, derived from the 'Month' label.
# NOTE(review): 'Apr', 'May' and 'Jun' match none of the four helpers, so such
# rows would get 0 in every flag -- confirm those months are absent from the data.
season_helpers = {'spring': spring, 'winter': winter, 'fall': fall, 'summer': summer}
for flag_column, month_to_flag in season_helpers.items():
    df[flag_column] = df['Month'].apply(month_to_flag)

In [None]:
# 'Month' has been encoded into the season flag columns; remove the raw label.
df = df.drop(columns=['Month'])

In [None]:
# Separate the 'avg_sm' target column from the remaining feature columns.
target_column = 'avg_sm'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Separate the continuous features (rescaled below) from the 0/1 season flags,
# which are passed through unchanged.
season_columns = ['spring', 'winter', 'fall', 'summer']
X_continuous = X.drop(columns=season_columns)
X_discrete = X[season_columns]


def _as_frame(values):
    """Wrap a transformer's array output in a DataFrame shaped like X_continuous.

    Both the column labels and the index of X_continuous are reused, so the
    later column-wise concat with X_discrete lines up row-for-row even when X
    does not carry a default 0..n-1 index.
    """
    return pd.DataFrame(values, columns=X_continuous.columns, index=X_continuous.index)


# NOTE(review): every transformer below is fitted on ALL rows, before the
# train/test split performed in a later cell, so test rows influence the fitted
# parameters. Consider fitting on the training portion only.

# Yeo-Johnson power transform (standardized inside power_transform).
X_continuous_yeo = _as_frame(power_transform(X_continuous, 'yeo'))

scaler = MinMaxScaler()
X_continuous_minmax_scaled = _as_frame(scaler.fit_transform(X_continuous))

scaler2 = StandardScaler()
# Bug fix: this frame used to be rebuilt from X_continuous_minmax_scaled, so the
# "standard" variant was silently a copy of the min-max one.
X_continuous_standard_scaled = _as_frame(scaler2.fit_transform(X_continuous))

scaler3 = RobustScaler()
X_continuous_robust_scaled = _as_frame(scaler3.fit_transform(X_continuous))

scaler4 = MaxAbsScaler()
X_continuous_maxabs_scaled = _as_frame(scaler4.fit_transform(X_continuous))

scaler5 = QuantileTransformer()
X_continuous_quantile_scaled = _as_frame(scaler5.fit_transform(X_continuous))

# Normalizer rescales each ROW (sample), unlike the column-wise scalers above.
scaler6 = Normalizer()
X_continuous_normalized = _as_frame(scaler6.fit_transform(X_continuous))

# NOTE(review): the PCA output columns are principal components, yet they keep
# the original feature names here (as before) -- the labels no longer describe
# the column contents.
scaler7 = PCA()
X_continuous_pca = _as_frame(scaler7.fit_transform(X_continuous))

In [None]:
def _attach_season_flags(continuous):
    """Return `continuous` with the X_discrete season columns appended on the right."""
    return pd.concat([continuous, X_discrete], axis=1)


# One full feature matrix per preprocessing variant.
X_full_yeo = _attach_season_flags(X_continuous_yeo)
X_full_minmax_scaled = _attach_season_flags(X_continuous_minmax_scaled)
X_full_standard_scaled = _attach_season_flags(X_continuous_standard_scaled)
X_full_robust_scaled = _attach_season_flags(X_continuous_robust_scaled)
X_full_maxabs_scaled = _attach_season_flags(X_continuous_maxabs_scaled)
X_full_quantile_scaled = _attach_season_flags(X_continuous_quantile_scaled)
X_full_normalized = _attach_season_flags(X_continuous_normalized)
X_full_pca = _attach_season_flags(X_continuous_pca)


In [None]:
# A single shared seed makes the splits reproducible AND puts the same rows in
# the test set for every preprocessing variant. Without it each variant was
# shuffled independently, so score differences mixed the effect of the scaler
# with the luck of the split.
SPLIT_SEED = 42


def _split(features):
    """Hold out 20% of the rows of `features` / `y` using the shared seed."""
    return train_test_split(features, y, test_size=0.2, random_state=SPLIT_SEED)


X_train_yeo, X_test_yeo, y_train_yeo, y_test_yeo = _split(X_full_yeo)
X_train_minmax, X_test_minmax, y_train_minmax, y_test_minmax = _split(X_full_minmax_scaled)
X_train_standard, X_test_standard, y_train_standard, y_test_standard = _split(X_full_standard_scaled)
X_train_robust, X_test_robust, y_train_robust, y_test_robust = _split(X_full_robust_scaled)
X_train_maxabs, X_test_maxabs, y_train_maxabs, y_test_maxabs = _split(X_full_maxabs_scaled)
X_train_quantile, X_test_quantile, y_train_quantile, y_test_quantile = _split(X_full_quantile_scaled)
X_train_normalized, X_test_normalized, y_train_normalized, y_test_normalized = _split(X_full_normalized)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = _split(X_full_pca)

In [None]:
# Fit the same random-forest configuration on each preprocessing variant and
# print MSE, MAE and R^2 on that variant's held-out split.
# random_state pins the forest's feature sub-sampling so reruns are comparable.
rf = RandomForestRegressor(bootstrap=False, max_depth=80, max_features='sqrt',
                           min_samples_leaf=2, min_samples_split=2,
                           n_estimators=200, random_state=42)
lister = [rf]
# Headings printed above each variant's scores; kept in the same order as `datasets`.
labels = [
    "YEO: ",
    "MINMAX: ",
    "STANDARD: ",
    "ROBUST: ",
    "MAXABS: ",
    "QUANTILE: ",
    "NORMALIZED",
    "PCA: ",
]
datasets = [
    [X_train_yeo, X_test_yeo, y_train_yeo, y_test_yeo],
    [X_train_minmax, X_test_minmax, y_train_minmax, y_test_minmax],
    [X_train_standard, X_test_standard, y_train_standard, y_test_standard],
    [X_train_robust, X_test_robust, y_train_robust, y_test_robust],
    [X_train_maxabs, X_test_maxabs, y_train_maxabs, y_test_maxabs],
    [X_train_quantile, X_test_quantile, y_train_quantile, y_test_quantile],
    [X_train_normalized, X_test_normalized, y_train_normalized, y_test_normalized],
    [X_train_pca, X_test_pca, y_train_pca, y_test_pca],
]
for model in lister:
    # Bug fix: the heading used to be chosen from an index `i` that was never
    # defined (NameError on a fresh kernel); pair each label with its dataset.
    # The loop still binds X_train / X_test / y_train / y_test at top level,
    # as before, so they remain available after the loop finishes.
    for label, (X_train, X_test, y_train, y_test) in zip(labels, datasets):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(label)
        print(mean_squared_error(y_test, y_pred))
        print(mean_absolute_error(y_test, y_pred))
        print(r2_score(y_test, y_pred))
        print()

In [None]:
# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 1.0 (= all features) replaces the former 'auto', which meant the same thing
# for RandomForestRegressor but is deprecated/removed in current scikit-learn.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Hyperparameter grid: 10 * 2 * 12 * 3 * 3 * 2 = 4320 combinations
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [None]:
# Exhaustive hyperparameter search: GridSearchCV evaluates EVERY combination in
# `random_grid` (4320 candidates) with 3-fold cross validation on all available
# cores. Despite the `random_*` names, nothing is sampled at random here.
# NOTE(review): if a 100-combination randomized search was intended, that would
# be RandomizedSearchCV(..., n_iter=100) instead.
# Base model to tune; seeded so repeated searches score candidates identically.
rf = RandomForestRegressor(random_state=42)
rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 3, verbose=2, n_jobs = -1)
# X_train / y_train are whatever the preceding model-comparison loop bound last.
# NOTE(review): confirm which preprocessing variant should be tuned and pass it
# explicitly (e.g. X_train_pca / y_train_pca) rather than relying on loop leftovers.
rf_random.fit(X_train, y_train)
rf_random.best_params_


Fitting 3 folds for each of 4320 candidates, totalling 12960 fits


