In [1]:
import os

# Directory that receives the log file and submission.csv written by this
# notebook. Keep the trailing slash: later cells build paths by plain string
# concatenation (e.g. OUTPUT_DIR + 'submission.csv').
OUTPUT_DIR = './'
# Kaggle dataset holding the per-fold model checkpoints loaded for inference.
MODEL_DIR = '/kaggle/input/soil-02-effnetb7/'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Image folders of the soil-classification competition data.
TRAIN_PATH = '/kaggle/input/soil-classification/soil_classification-2025/train'
TEST_PATH = '/kaggle/input/soil-classification/soil_classification-2025/test'

In [2]:
!pip install timm

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->timm)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch->timm)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch->timm)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch->ti

In [3]:
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # --- run mode ---
    debug = False
    train = False
    inference = True

    # --- model / input ---
    model_name = 'tf_efficientnet_b7_ns'  # architecture name handed to timm.create_model
    size = 512                            # side length passed to Resize / RandomResizedCrop
    target_size = 4                       # width of the classifier's output layer
    target_col = 'label_encoded'

    # --- data loading ---
    batch_size = 16
    num_workers = 4

    # --- reproducibility / cross-validation ---
    seed = 42
    n_fold = 5
    trn_fold = [0, 1]                     # folds whose checkpoints are loaded for inference

In [4]:
import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter

import scipy as sp
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

from albumentations import *
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

import timm

import warnings

# Suppress every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Compute device: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  check_for_updates()


In [5]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the weighted-average F1 score of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        The value of ``sklearn.metrics.f1_score(y_true, y_pred, average='weighted')``.
    """
    # f1_score is not among the notebook's top-level imports (only accuracy_score
    # is), so bring it into scope here; otherwise every call raises NameError.
    from sklearn.metrics import f1_score

    return f1_score(y_true, y_pred, average='weighted')


@contextmanager
def timer(name):
    """Log a start line, run the ``with`` body, then log the elapsed seconds.

    Args:
        name: Label placed in square brackets at the front of both log lines.

    Note:
        The closing line is emitted only when the body finishes without raising.
    """
    started_at = time.time()
    LOGGER.info(f'[{name}] start')
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s.')


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Create (or refresh) the notebook logger that writes to the console and a file.

    Args:
        log_file: Path of the log file; defaults to ``train.log`` inside ``OUTPUT_DIR``.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``, set to
        INFO, with one stream handler and one file handler that emit the bare message.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger() hands back the same object on every call, so re-running this cell
    # would otherwise stack another pair of handlers and emit each message twice.
    # Drop only the handlers installed by an earlier call; leave foreign ones alone.
    for stale in [h for h in logger.handlers if getattr(h, '_from_init_logger', False)]:
        logger.removeHandler(stale)
        stale.close()

    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        handler._from_init_logger = True  # marker used by the clean-up loop above
        logger.addHandler(handler)
    return logger

# Module-level logger used by timer().
LOGGER = init_logger()


def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.

    Args:
        seed: Integer seed given to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

# Seed all random number generators once with the notebook-wide seed from CFG.
seed_torch(seed=CFG.seed)

In [6]:
# Enumerate the test images (os.listdir returns names in arbitrary order) and
# print the count followed by the first four names as a quick look.
test_files = os.listdir(TEST_PATH)
print(len(test_files), test_files[:4], sep='\n')

341
['img_0f035b97.jpg', 'img_f13af256.jpg', 'img_15b41dbc.jpg', 'img_cfb4fc7a.jpg']


In [7]:
# One-column frame of test image file names; predictions are attached to it later.
test = pd.DataFrame({"image_id": test_files})
test.head()

Unnamed: 0,image_id
0,img_0f035b97.jpg
1,img_f13af256.jpg
2,img_15b41dbc.jpg
3,img_cfb4fc7a.jpg
4,img_683111fb.jpg


In [8]:
# Dataset
# ====================================================
class TestDataset(Dataset):
    """Dataset yielding the (unlabelled) test images named in a DataFrame.

    Args:
        df: DataFrame with an ``image_id`` column of file names located directly
            under ``TEST_PATH``.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a mapping with the result under the ``'image'`` key.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx`` in RGB channel order and apply the transform, if any.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        file_name = self.file_names[idx]
        file_path = f'{TEST_PATH}/{file_name}'
        image = cv2.imread(file_path)
        # cv2.imread signals a missing or undecodable file by returning None rather
        # than raising; fail here with the path instead of inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {file_path}')
        # OpenCV decodes to BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            image = self.transform(image=image)['image']
        return image

In [9]:
# ====================================================
# Transforms
# ====================================================
def get_transforms(*, data):
    """Build the Albumentations pipeline for one data split.

    Args:
        data: ``'train'`` for the augmenting pipeline (random resized crop,
            transpose, flips, shift/scale/rotate) or ``'valid'`` for a plain
            resize. Both end with per-channel normalization and tensor conversion.

    Returns:
        A ``Compose`` object for the requested split.

    Raises:
        ValueError: If ``data`` is neither ``'train'`` nor ``'valid'``.
    """
    # Reject unknown splits up front; falling through would return None and only
    # fail later, far from the typo that caused it.
    if data not in ('train', 'valid'):
        raise ValueError(f"Unknown data split {data!r}; expected 'train' or 'valid'")

    if data == 'train':
        return Compose([
            # NOTE(review): newer Albumentations releases expect size=(h, w) here
            # instead of two positional ints — confirm against the installed version.
            RandomResizedCrop(CFG.size, CFG.size),
            Transpose(p=0.5),
            HorizontalFlip(p=0.5),
            VerticalFlip(p=0.5),
            ShiftScaleRotate(p=0.5),
            *_normalize_and_to_tensor(),
        ])

    return Compose([
        Resize(CFG.size, CFG.size),
        *_normalize_and_to_tensor(),
    ])


def _normalize_and_to_tensor():
    """Return fresh Normalize + ToTensorV2 steps shared by every pipeline."""
    return [
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2(),
    ]

In [10]:
# ====================================================
# MODEL
# ====================================================
class CustomResNext(nn.Module):
    """Thin wrapper around a timm model created with ``CFG.target_size`` output classes."""

    def __init__(self, model_name='resnext50_32x4d', pretrained=False):
        super().__init__()
        self.model = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=CFG.target_size,
        )

    def forward(self, x):
        """Run ``x`` through the wrapped model and return its output unchanged."""
        return self.model(x)

In [11]:
    
class CustomEfficientNet(nn.Module):
    """timm model whose ``classifier`` layer is replaced by a fresh linear head.

    Args:
        model_name: Architecture name handed to ``timm.create_model``; defaults
            to ``CFG.model_name``.
        pretrained: Forwarded to ``timm.create_model``.
    """

    def __init__(self, model_name=CFG.model_name, pretrained=False):
        super().__init__()
        # Honor the model_name argument (rather than always reading CFG.model_name)
        # so callers can actually choose the architecture.
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Swap the stock classifier for one with CFG.target_size outputs.
        n_features = self.model.classifier.in_features
        self.model.classifier = nn.Linear(n_features, CFG.target_size)

    def forward(self, x):
        """Return the wrapped model's output for ``x``."""
        return self.model(x)

In [12]:
# ====================================================
# Helper functions
# ====================================================
def inference(model, states, test_loader, device):
    """Predict class probabilities, averaging over several checkpoints.

    For every batch, each checkpoint in ``states`` is loaded into ``model`` in
    turn, the model is run in eval mode without gradients, and the softmax
    (over dimension 1) outputs are averaged across checkpoints.

    Args:
        model: Network that receives the weights; moved to ``device``.
        states: Sequence of checkpoints, each a mapping whose ``'model'`` entry
            is a state dict accepted by ``model.load_state_dict``.
        test_loader: Sized iterable of image batches (tensors).
        device: Device on which the forward passes run.

    Returns:
        NumPy array of the per-batch averages concatenated along the first axis.
    """
    model.to(device)

    def _fold_probabilities(checkpoint, batch):
        # Load one checkpoint's weights and score the batch with it.
        model.load_state_dict(checkpoint['model'])
        model.eval()
        with torch.no_grad():
            logits = model(batch)
        return logits.softmax(1).to('cpu').numpy()

    batch_probs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = batch.to(device)
        per_fold = [_fold_probabilities(checkpoint, batch) for checkpoint in states]
        batch_probs.append(np.mean(per_fold, axis=0))
    return np.concatenate(batch_probs)


In [13]:
# Build the network, then load one checkpoint per fold listed in CFG.trn_fold.
model = CustomEfficientNet(CFG.model_name, pretrained=False)
# map_location=device keeps loading working when `device` has fallen back to the
# CPU: a checkpoint whose tensors were saved on a GPU cannot otherwise be restored
# in a session without CUDA.
# NOTE(review): weights_only=False unpickles arbitrary objects — only load
# checkpoints that come from a trusted source.
states = [
    torch.load(
        MODEL_DIR+f'{CFG.model_name}_fold{fold}_best.pth',
        map_location=device,
        weights_only=False,
    )
    for fold in CFG.trn_fold
]
# The test images go through the deterministic 'valid' pipeline, in file order.
test_dataset = TestDataset(test, transform=get_transforms(data='valid'))
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                         num_workers=CFG.num_workers, pin_memory=True)
predictions = inference(model, states, test_loader, device)
# submission: predicted class index = arg-max over the averaged probabilities
test['soil_type'] = predictions.argmax(1)
test.head()

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,image_id,soil_type
0,img_0f035b97.jpg,2
1,img_f13af256.jpg,2
2,img_15b41dbc.jpg,0
3,img_cfb4fc7a.jpg,1
4,img_683111fb.jpg,1


In [14]:
# Sanity check: which class indices actually occur among the predictions.
test["soil_type"].unique()

array([2, 0, 1, 3])

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [16]:
# Load the training labels (image_id / soil_type columns, see the preview below);
# they are only used here to recover the class-name <-> integer-code mapping.
train = pd.read_csv('/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv')

display(train.head())

Unnamed: 0,image_id,soil_type
0,img_ed005410.jpg,Alluvial soil
1,img_0c5ecd2a.jpg,Alluvial soil
2,img_ed713bb5.jpg,Alluvial soil
3,img_12c58874.jpg,Alluvial soil
4,img_eff357af.jpg,Alluvial soil


In [17]:
# Map each soil-type name to an integer code; `le` keeps the fitted mapping.
le = LabelEncoder()
train["label_encoded"] = le.fit(train["soil_type"]).transform(train["soil_type"])

In [18]:
# Keep the human-readable class name under "label", next to its integer code.
train["label"] = train["soil_type"]

In [19]:
# The (integer code, class name) column pair that forms the lookup table.
cols = ["label_encoded","label"]

In [20]:
# Count each distinct (code, name) pair; the result's index lists every pair once.
df = train.value_counts(subset=cols)

In [21]:
# Wrap the value_counts() result in a DataFrame (the counts become its column).
df = pd.DataFrame(df)

In [22]:
# Move the (label_encoded, label) index levels back into ordinary columns; this
# frame is the code -> name lookup table used by the merge further down.
df_master = df.reset_index()

In [23]:
# Display the lookup table: one row per class, without the count column.
df_master[cols]

Unnamed: 0,label_encoded,label
0,0,Alluvial soil
1,3,Red soil
2,1,Black Soil
3,2,Clay soil


In [24]:
# Display the test frame, which now carries the predicted class index per image.
test

Unnamed: 0,image_id,soil_type
0,img_0f035b97.jpg,2
1,img_f13af256.jpg,2
2,img_15b41dbc.jpg,0
3,img_cfb4fc7a.jpg,1
4,img_683111fb.jpg,1
...,...,...
336,img_ab698e86.jpg,2
337,img_d3c1f03b.jpg,1
338,img_0225b7de.jpg,0
339,img_2a1fe606.jpg,0


In [25]:
import pandas as pd
# Attach the class name to every prediction. A left join keeps one output row per
# test image even if a predicted code were missing from df_master (an inner join
# would silently drop that image from the submission); validate= raises if
# df_master ever held duplicate codes, which would fan rows out.
sub = pd.merge(test, df_master, left_on='soil_type', right_on='label_encoded',
               how='left', validate='many_to_one')
assert sub['label'].notna().all(), "some predicted class codes have no label name"

In [26]:
# Start an empty frame for the submission; its columns are filled in the next cell.
df_sub = pd.DataFrame()

In [27]:
# Submission layout: the image id plus the class *name*, published as "soil_type".
df_sub = sub[["image_id", "label"]].rename(columns={"label": "soil_type"})

In [28]:
# Preview the first submission rows before writing the file.
df_sub.head()

Unnamed: 0,image_id,soil_type
0,img_0f035b97.jpg,Clay soil
1,img_f13af256.jpg,Clay soil
2,img_15b41dbc.jpg,Alluvial soil
3,img_cfb4fc7a.jpg,Black Soil
4,img_683111fb.jpg,Black Soil


In [29]:
# Write the submission into OUTPUT_DIR; index=False leaves the row index out.
df_sub.to_csv(f'{OUTPUT_DIR}submission.csv', index=False)