In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1.Load the libraries

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from PIL import Image
import os


2.Load the Data files and data processing

In [3]:
train_dir = '/content/drive/MyDrive/soil_classification-2025/train'
test_dir = '/content/drive/MyDrive/soil_classification-2025/test'
train_df = pd.read_csv('/content/drive/MyDrive/soil_classification-2025/train_labels.csv')
test_df = pd.read_csv('/content/drive/MyDrive/soil_classification-2025/test_ids.csv')

In [4]:
# This is done to convert the images into the loadable image format to work with the image
test_df.loc[test_df.iloc[:, 0] == 'img_f22972ea.webp', test_df.columns[0]] = 'img_f22972ea.jpg'
test_df.loc[test_df.iloc[:, 0] == 'img_91cbc6e5.gif', test_df.columns[0]] = 'img_91cbc6e5.png'

In [5]:
# the image is loaded and resized
def load_and_resize_image(filepath, target_size=(64, 64)):
    try:
        img = Image.open(filepath).convert('RGB')
        img = img.resize(target_size)
        return np.array(img)
    except Exception as e:
        print(f"Error loading image {filepath}: {e}")
        return None

In [6]:
# here the images are converted into array to work with traditional model we have to work with array
train_images = []
train_labels = []
for index, row in train_df.iterrows():
    img_path = os.path.join(train_dir, row[train_df.columns[0]])
    img = load_and_resize_image(img_path)
    if img is not None:
        train_images.append(img.flatten())
        train_labels.append(row[train_df.columns[1]])

train_images = np.array(train_images)
train_labels = np.array(train_labels)

test_images = []
test_image_ids = []

if test_df.shape[1] == 1:
    test_df['label'] = 'unknown'

for index, row in test_df.iterrows():
    img_path = os.path.join(test_dir, row[test_df.columns[0]])
    img = load_and_resize_image(img_path)
    if img is not None:
        test_images.append(img.flatten())
        test_image_ids.append(row[test_df.columns[0]])

test_images = np.array(test_images)
# train and valdation split is done here
X_train, X_val, y_train, y_val = train_test_split(train_images, train_labels, test_size=0.2, random_state=42)

2.Model deployment

In [22]:
# here i have used 2 models which gave the best results.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

test_predictions = rf_model.predict(test_images)

val_predictions = rf_model.predict(X_val)

In [24]:
import json
from google.colab import files
from sklearn.metrics import precision_score, recall_score, f1_score

f1 = f1_score(y_val, val_predictions, average='weighted')

precision = precision_score(y_val, val_predictions, average='weighted')

recall = recall_score(y_val, val_predictions, average='weighted')

metrics_data = {
    "RF_validation_weighted_f1_score": f1,
    "RF_validation_weighted_precision": precision,
    "RF_validation_weighted_recall": recall
}

filename = 'metrics_RF.json'

with open(filename, 'w') as f:
    json.dump(metrics_data, f, indent=4)

files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)

y_val_encoded = label_encoder.transform(y_val)

dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
dval = xgb.DMatrix(X_val, label=y_val_encoded)
dtest = xgb.DMatrix(test_images)

params = {
        'objective': 'multi:softmax',
        'num_class': len(label_encoder.classes_),
        'eval_metric': 'merror',
        'eta': 0.1,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 42
    }


model3 = xgb.train(params, dtrain, num_boost_round=30, evals=[(dval, 'eval')])

[0]	eval-merror:0.17959
[1]	eval-merror:0.11837
[2]	eval-merror:0.10612
[3]	eval-merror:0.09796
[4]	eval-merror:0.10612
[5]	eval-merror:0.10204
[6]	eval-merror:0.08571
[7]	eval-merror:0.08980
[8]	eval-merror:0.08571
[9]	eval-merror:0.07755
[10]	eval-merror:0.07755
[11]	eval-merror:0.07755
[12]	eval-merror:0.06122
[13]	eval-merror:0.06531
[14]	eval-merror:0.06122
[15]	eval-merror:0.06122
[16]	eval-merror:0.06122
[17]	eval-merror:0.06531
[18]	eval-merror:0.06939
[19]	eval-merror:0.06122
[20]	eval-merror:0.06122
[21]	eval-merror:0.05714
[22]	eval-merror:0.06122
[23]	eval-merror:0.06122
[24]	eval-merror:0.06122
[25]	eval-merror:0.06122
[26]	eval-merror:0.05714
[27]	eval-merror:0.05714
[28]	eval-merror:0.05714
[29]	eval-merror:0.05306


In [11]:
test_predictions_encoded = model3.predict(dtest)
test_predictions_original = label_encoder.inverse_transform(test_predictions_encoded.astype(int))

In [12]:
# The predicted outputs
test_predictions_original

array(['Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Clay soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Black Soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvial soil', 'Alluvial soil', 'Alluvial soil',
       'Alluvial soil', 'Alluvia

In [13]:
# evaluation
val_predictions_encoded = model3.predict(dval)

val_predictions_original = label_encoder.inverse_transform(val_predictions_encoded.astype(int))

from sklearn.metrics import f1_score
f1 = f1_score(y_val, val_predictions_original, average='weighted')
print(f"XGBoost Validation F1 Score (weighted): {f1}")

XGBoost Validation F1 Score (weighted): 0.9463850089269877


In [25]:
import json
from google.colab import files
from sklearn.metrics import precision_score, recall_score, f1_score

f1 = f1_score(y_val, val_predictions_original, average='weighted')

precision = precision_score(y_val, val_predictions_original, average='weighted')

recall = recall_score(y_val, val_predictions_original, average='weighted')

metrics_data = {
    "xgboost_validation_weighted_f1_score": f1,
    "xgboost_validation_weighted_precision": precision,
    "xgboost_validation_weighted_recall": recall
}

filename = 'metrics_XG.json'

with open(filename, 'w') as f:
    json.dump(metrics_data, f, indent=4)

files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Save the model

In [27]:
submission_df = pd.DataFrame({'image_id': test_image_ids, 'soil_type': test_predictions})

submission_df.to_csv('/content/rf_predictions.csv', index=False)