In [39]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error

from sklearn import preprocessing 

from sklearn.model_selection import GridSearchCV

In [3]:
test = pd.read_csv('test.zip')
train = pd.read_csv('train.zip')

In [4]:
train.head()

Unnamed: 0,PIDN,m7497.96,m7496.04,m7494.11,m7492.18,m7490.25,m7488.32,m7486.39,m7484.46,m7482.54,...,REF7,RELI,TMAP,TMFI,Depth,Ca,P,pH,SOC,Sand
0,XNhoFZW5,0.302553,0.301137,0.299748,0.300354,0.302679,0.303799,0.301702,0.298936,0.298126,...,-0.646673,1.687734,0.190708,0.056843,Topsoil,-0.295749,-0.041336,-1.129366,0.353258,1.269748
1,9XNspFTd,0.270192,0.268555,0.266964,0.267938,0.271013,0.272346,0.26987,0.266976,0.266544,...,-0.646673,1.687734,0.190708,0.056843,Subsoil,-0.387442,-0.231552,-1.531538,-0.264023,1.692209
2,WDId41qG,0.317433,0.316265,0.314948,0.315224,0.316942,0.317764,0.316067,0.313874,0.313301,...,-0.814516,1.80666,0.190708,0.056843,Topsoil,-0.248601,-0.224635,-0.259551,0.064152,2.091835
3,JrrJf1mN,0.261116,0.259767,0.258384,0.259001,0.26131,0.262417,0.260534,0.258039,0.257246,...,-0.814516,1.80666,0.190708,0.056843,Subsoil,-0.332195,-0.318014,-0.577548,-0.318719,2.118477
4,ZoIitegA,0.260038,0.258425,0.256544,0.25703,0.259602,0.260786,0.258717,0.256352,0.255902,...,-0.780242,0.430513,0.190708,0.056843,Topsoil,-0.43835,-0.01021,-0.699135,-0.310905,2.164148


In [5]:
train.shape

(1157, 3600)

In [6]:
train.isna().sum().max()

0

In [7]:
train.drop(columns = "PIDN", axis = 1, inplace = True)

In [8]:
train.columns

Index(['m7497.96', 'm7496.04', 'm7494.11', 'm7492.18', 'm7490.25', 'm7488.32',
       'm7486.39', 'm7484.46', 'm7482.54', 'm7480.61',
       ...
       'REF7', 'RELI', 'TMAP', 'TMFI', 'Depth', 'Ca', 'P', 'pH', 'SOC',
       'Sand'],
      dtype='object', length=3599)

In [9]:
label_encoder = preprocessing.LabelEncoder() 
  
train['Depth']= label_encoder.fit_transform(train['Depth']) 

In [10]:
train['Depth'] = train['Depth'].replace({'Topsoil' : 0 , 'Subsoil' : 1})

In [11]:
selection = SelectKBest(score_func=f_regression,k = 3200)

# Prediction for Ca

In [12]:
y_ca = train['Ca']
X_ca = train.drop(['Ca','P', 'pH', 'SOC', 'Sand'], axis = 1)

In [13]:
X_can = selection.fit_transform(X_ca, y_ca)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_can, y_ca, test_size=0.2, random_state=42)
#model = RandomForestRegressor(n_estimators = 200, max_depth = 8, random_state = 1)
model = KernelRidge(alpha=0.1, kernel='polynomial', degree=7, coef0=2.5)
model.fit(X_train,y_train)
prediction = model.predict(X_test)

mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) for P: {rmse:.2f}")

Root Mean Squared Error (RMSE) for P: 0.33


# Prediction for P

In [113]:
y_p = train['P']
X_p = train.drop(['Ca','P', 'pH', 'SOC', 'Sand'], axis = 1)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p, test_size=0.2, random_state=42)
#model = RandomForestRegressor(n_estimators = 200, max_depth = 8, random_state = 1)
model = KernelRidge(alpha = 0.1, gamma = 0.1, kernel='rbf', coef0 = 2.5, degree = 7)
model.fit(X_train,y_train)
prediction = model.predict(X_test)

mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) for P: {rmse:.2f}")

Root Mean Squared Error (RMSE) for P: 0.57


# Prediction for pH

In [19]:
y_pH = train['pH']
X_pH = train.drop(['Ca','P', 'pH', 'SOC', 'Sand'], axis = 1)

In [33]:
X_pHn = selection.fit_transform(X_pH, y_pH)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_pHn, y_pH, test_size=0.2, random_state=42)
#model = RandomForestRegressor(n_estimators = 200, max_depth = 8, random_state = 1)
model = KernelRidge(alpha=0.1, kernel='polynomial', degree=7, coef0=2.5)
model.fit(X_train,y_train)
prediction = model.predict(X_test)

mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) for pH: {rmse:.2f}")

Root Mean Squared Error (RMSE) for pH: 0.40


# Prediction for SOC

In [24]:
y_soc = train['SOC']
X_soc = train.drop(['Ca','P', 'pH', 'SOC', 'Sand'], axis = 1)

In [25]:
selection_soc = SelectKBest(score_func = f_regression,k = 1500)

In [26]:
X_socn = selection_soc.fit_transform(X_soc, y_soc)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_socn, y_soc, test_size=0.20, random_state=42)
#model = RandomForestRegressor(n_estimators = 200, max_depth = 8, random_state = 1)
model = KernelRidge(alpha=0.1, kernel='polynomial', degree=7, coef0=2.5)
model.fit(X_train,y_train)
prediction = model.predict(X_test)

mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) for SOC: {rmse:.2f}")

Root Mean Squared Error (RMSE) for SOC: 0.28


# Prediction for Sand

In [28]:
y_sand = train['Sand']
X_sand = train.drop(['Ca','P', 'pH', 'SOC', 'Sand'], axis = 1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_sand, y_sand, test_size=0.2, random_state=42)
#model = RandomForestRegressor(n_estimators = 200, max_depth = 8, random_state = 1)
model = KernelRidge(alpha=0.1, kernel='polynomial', degree=7, coef0=2.5)
model.fit(X_train,y_train)
prediction = model.predict(X_test)

mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) for Sand: {rmse:.2f}")

Root Mean Squared Error (RMSE) for Sand: 0.26


# Grid Search Cross-Validation for RandomForestRegressort

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 8, 10],
    # Add more parameters to tune if needed
}

# Function to train and evaluate model
def train_and_evaluate(X, y, target_name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = RandomForestRegressor(random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    prediction = best_model.predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error (RMSE) for {target_name}: {rmse:.2f}")
    
# Apply function for each target variable
targets = {
    'Ca': (train['Ca'], X_ca),
    'P': (train['P'], X_p),
    'pH': (train['pH'], X_pH),
    'SOC': (train['SOC'], X_soc),
    'Sand': (train['Sand'], X_sand)
}

for target_name, (y, X) in targets.items():
    train_and_evaluate(X, y, target_name)


# Grid Search Cross-Validation for Kernel Ridge

In [51]:
# Define the parameter grid to search for Kernel Ridge
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [0.1, 1.0],
    'degree': [range(1,10)],  # For polynomial kernel
    'coef0': [0.0, 1.0],  # For polynomial and sigmoid kernels
    # Additional parameters as needed
}


# Function to train and evaluate Kernel Ridge model
def train_and_evaluate_kernel_ridge(X, y, target_name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = KernelRidge()
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    # Get best parameters
    best_params = grid_search.best_params_
    print(f"Best Parameters for {target_name}: {best_params}")
    
    prediction = best_model.predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error (RMSE) for {target_name}: {rmse:.2f}")
    
# Apply function for each target variable
targets = {
    'Ca': (train['Ca'], X_ca),
    'P': (train['P'], X_p),
    'pH': (train['pH'], X_pH),
    'SOC': (train['SOC'], X_soc),
    'Sand': (train['Sand'], X_sand)
}

for target_name, (y, X) in targets.items():
    train_and_evaluate_kernel_ridge(X, y, target_name)


Best Parameters for Ca: {'alpha': 10.0, 'coef0': 0.0, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
Root Mean Squared Error (RMSE) for Ca: 0.39
Best Parameters for P: {'alpha': 0.1, 'coef0': 0.0, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
Root Mean Squared Error (RMSE) for P: 0.57
Best Parameters for pH: {'alpha': 1.0, 'coef0': 1.0, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
Root Mean Squared Error (RMSE) for pH: 0.42
Best Parameters for SOC: {'alpha': 10.0, 'coef0': 1.0, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
Root Mean Squared Error (RMSE) for SOC: 0.27
Best Parameters for Sand: {'alpha': 10.0, 'coef0': 1.0, 'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}
Root Mean Squared Error (RMSE) for Sand: 0.28
