In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
import math

# Variable setting and value seeding
np.random.seed(42)

n_samples = 1500

def gdm3_converter(ion):
    ion_mol = ion / 1.0008
    ion_conc = np.log(10) * ion_mol
    return ion_conc

# Rainfall (mm)
rainfall = np.random.uniform(1, 101, n_samples)
# Hydrogen (g dm-3)
h_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
H = gdm3_converter(h_gdm3)
# Zinc (g dm-3)
zn_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
Zn = gdm3_converter(zn_gdm3)
# Manganese (g dm-3)
mn_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
Mn = gdm3_converter(mn_gdm3)
# Iron (g dm-3)
fe_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
Fe = gdm3_converter(fe_gdm3)
# Copper (g dm-3)
cu_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
Cu = gdm3_converter(cu_gdm3)
# Phosphorus (g dm-3)
p_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
P = gdm3_converter(p_gdm3)
# Sodium (g dm-3)
na_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
Na = gdm3_converter(na_gdm3)
# Nitrogen (g dm-3)
n_gdm3 = np.random.uniform(1e-14, 1e-1, n_samples)
N = gdm3_converter(n_gdm3)

In [4]:
# Noise generation
noise = np.random.choice([0, 1], size=n_samples, p=[0.9, 0.1])

In [5]:
# Define acidity as 'Low pH' or 'High pH' for acidic H+ concentraton
acidity = np.where((H > 1e-8) & (H < 1e-2) | noise, 'High soil acidity', 'Low soil acidity')

In [6]:
# Rainfall increase leading to increase in H+ dissociation/concentration
for i in range(len(rainfall)-1):
    rainfall_change = (rainfall[i] - rainfall[i+1]) * 100
    if rainfall_change >= 10:
        H[i+1] += 10e-9

In [7]:
pHIndicator = np.where(acidity == 'High soil acidity', 1, 0)

data = pd.DataFrame({
    'Rainfall': rainfall,
    'H': H,
    'Zn': Zn,
    'Mn': Mn,
    'Fe': Fe,
    'Cu': Cu,
    'P': P,
    'Na': Na,
    'N': N,
    'pH Indicator': pHIndicator
})

print(data.head())

    Rainfall         H        Zn        Mn        Fe        Cu         P  \
0  38.454012  0.119427  0.154772  0.137883  0.149147  0.124902  0.210191   
1  96.071431  0.110248  0.183296  0.118191  0.039662  0.191628  0.120872   
2  74.199394  0.005900  0.057626  0.066304  0.200716  0.028780  0.166783   
3  60.865848  0.078512  0.143768  0.001487  0.141062  0.029860  0.100324   
4  16.601864  0.087473  0.131544  0.114172  0.036169  0.128110  0.144955   

         Na         N  pH Indicator  
0  0.008571  0.150539             0  
1  0.006062  0.018413             0  
2  0.170054  0.055754             1  
3  0.073166  0.178004             0  
4  0.183208  0.121637             0  


In [8]:
# Train model

target_col = 'pH Indicator'

X = data.loc[:, data.columns != target_col]
y = data.loc[:, target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Training set size: {X_train.shape[0]} samples')
print(f'Testing set size: {X_test.shape[0]} samples')

X_train.head()

y_train.head()

features = list(X_train.columns)

print(y_train.value_counts())
print(y_test.value_counts())

model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, verbose = 100, eval_metric='F1')

model.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))

y_pred = model.predict(X_test)

Training set size: 1200 samples
Testing set size: 300 samples
pH Indicator
0    1041
1     159
Name: count, dtype: int64
pH Indicator
0    254
1     46
Name: count, dtype: int64


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4930233	test: 0.3793103	best: 0.3793103 (0)	total: 181ms	remaining: 3m
100:	learn: 0.7578125	test: 0.4838710	best: 0.4838710 (1)	total: 659ms	remaining: 5.87s
200:	learn: 0.9840256	test: 0.4918033	best: 0.4918033 (140)	total: 1.14s	remaining: 4.51s
300:	learn: 1.0000000	test: 0.4918033	best: 0.4918033 (140)	total: 1.55s	remaining: 3.6s
400:	learn: 1.0000000	test: 0.4918033	best: 0.4918033 (140)	total: 1.92s	remaining: 2.87s
500:	learn: 1.0000000	test: 0.4838710	best: 0.4918033 (140)	total: 2.3s	remaining: 2.29s
600:	learn: 1.0000000	test: 0.4838710	best: 0.4918033 (140)	total: 2.66s	remaining: 1.76s
700:	learn: 1.0000000	test: 0.4761905	best: 0.4918033 (140)	total: 3.11s	remaining: 1.33s
800:	learn: 1.0000000	test: 0.4838710	best: 0.4918033 (140)	total: 3.5s	remaining: 870ms
900:	learn: 1.0000000	test: 0.4761905	best: 0.4918033 (140)	total: 3.89s	remaining: 427ms
999:	learn: 1.0000000	test: 0.4761905	best: 0.4918033 (140)	total: 4.25s	remaining: 0us

bestTest = 0.4918032787

In [9]:
# Evaluate model
print(f'Predictions: {y_pred[:100]}')

print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'F1: {f1_score(y_test, y_pred):.4f}')
print(f'Precision: {precision_score(y_test, y_pred):.4f}')
print(f'Recall: {recall_score(y_test, y_pred):.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Predictions: [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
Accuracy: 0.8967
F1: 0.4918
Precision: 1.0000
Recall: 0.3261

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       254
           1       1.00      0.33      0.49        46

    accuracy                           0.90       300
   macro avg       0.95      0.66      0.72       300
weighted avg       0.91      0.90      0.87       300



In [14]:
model.save_model("soil_acidity_model_v3.bin")
print("Model saved as soil_acidity_model_v3.bin")

Model saved as soil_acidity_model_v3.bin


In [15]:
# Load model to file path
def load_model(file_path):
    loaded_model = CatBoostClassifier()
    loaded_model.load_model(file_path)  # Load model using CatBoost's method
    print(f"Model loaded from {file_path}")
    return loaded_model

In [16]:
# Test loading the model
loaded_model = load_model("soil_acidity_model_v3.bin")

Model loaded from soil_acidity_model_v3.bin
