In [None]:
# Offline install of transformers: --no-index stops pip from contacting PyPI and
# -f (find-links) points it at a local directory of package files under /kaggle/input.
# Presumably that directory was populated beforehand with the download command below.
# !pip download -d packages transformers
!pip install -q -U transformers --no-index -f /kaggle/input/metadino-v3-convnext/pytorch/default/3/packages
# Online alternative that also pulls extra packages; kept commented out for reference.
# !pip install -q -U transformers xgboost opencv-python 'numpy<2.0' 'pandas>=2.2'

In [None]:
# import huggingface_hub
# huggingface_hub.login()

In [None]:
import os
import glob
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.auto import tqdm

import torch
import warnings

# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Report the torch build and which device is available (first CUDA GPU, else CPU).
print(f"PyTorch: {torch.__version__}")
device_label = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'
print(f"Device: {device_label}")

In [None]:
# Dataset root plus the training CSV and the train/test image directories beneath it.
PATH_DATA = '/kaggle/input/csiro-biomass'
PATH_TRAIN_CSV, PATH_TRAIN_IMG, PATH_TEST_IMG = (
    os.path.join(PATH_DATA, leaf) for leaf in ('train.csv', 'train', 'test')
)

# Read the training table, then show its shape and first rows.
df = pd.read_csv(PATH_TRAIN_CSV)
print(f"Dataset size: {df.shape}")
display(df.head())

In [None]:
# Every column of df whose name is not in the exclusion tuple counts as a target column.
# NOTE(review): other cells read 'sample_id' and 'image_path' from df, and neither is
# excluded here — confirm the exclusion names match the actual CSV schema.
excluded_cols = ('image_id', 'Image')
TARGET_COLS = list(filter(lambda name: name not in excluded_cols, df.columns))

print(f"Target columns: {TARGET_COLS}")
print(f"Number of targets: {len(TARGET_COLS)}")

In [None]:
# Histogram each column of TARGET_COLS, skipping identifier / categorical columns.
excluded_from_hist = ('sample_id', 'image_path', 'State', 'target_name')
cols_to_plot = [name for name in TARGET_COLS if name not in excluded_from_hist]

for col in cols_to_plot:
    # One fresh figure per column so the plots do not draw on top of each other.
    hist_fig, hist_ax = plt.subplots(figsize=(8, 3))
    hist_ax.hist(df[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel(col, fontsize=12)
    hist_ax.set_ylabel('Count', fontsize=12)
    hist_ax.set_title(f'{col} Distribution', fontsize=14, fontweight='bold')
    hist_ax.grid(alpha=0.3)
    plt.xticks(rotation=45, ha="right")  # slanted tick labels stay readable
    hist_fig.tight_layout()  # avoid clipped or overlapping labels
    plt.show()


In [None]:
# Pie charts of the categorical columns, one panel per column.
cols_to_plot = ['State', 'target_name']
n_rows, n_cols = 1, len(cols_to_plot)

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# 1x1 grid comes back as a bare Axes object and the .flatten() call below would fail
# as soon as cols_to_plot is reduced to a single column.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, cols_to_plot):
    counts = df[col].value_counts()
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    ax.set_title(f'Distribution of {col}', fontsize=14, fontweight='bold')
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay one histogram (with a KDE curve) of `target` per target_name group
# on a single shared axes.
grouped_train_data = df.groupby('target_name')

overlay_fig, overlay_ax = plt.subplots(figsize=(12, 8))
for target_name, group_data in grouped_train_data:
    sns.histplot(data=group_data, x='target', kde=True, label=target_name, ax=overlay_ax)

overlay_ax.set_title('Distribution of Target for Each Target Name Class')
overlay_ax.set_xlabel('Target')
overlay_ax.set_ylabel('Frequency')
overlay_ax.legend(title='Target Name')
overlay_ax.grid(True)
plt.show()


In [None]:
def show_images(df_sample, n=12, path_img=PATH_DATA):
    """Display images sampled at evenly spaced positions of the target-sorted frame.

    The frame is sorted by its ``target`` column and ``n`` rows are picked with
    ``np.linspace`` over the sorted positions, so the gallery runs from the lowest
    to the highest target value.

    Args:
        df_sample: DataFrame providing ``target``, ``image_path`` and ``sample_id``
            columns.
        n: Number of images to show. Clamped to ``len(df_sample)`` so that no row
            is drawn more than once.
        path_img: Directory that each ``image_path`` value is joined onto.

    Returns:
        None. The figure is rendered with ``plt.show()``. When there is nothing to
        draw (empty frame or ``n <= 0``) a message is printed instead.
    """
    # Asking for more panels than rows would make linspace repeat indices.
    n = min(n, len(df_sample))
    if n <= 0:
        # plt.subplots rejects a zero-row grid and iloc fails on an empty frame.
        print("show_images: nothing to display.")
        return

    # Sort by target, then pick n evenly spaced row positions.
    df_sorted = df_sample.sort_values(by='target').reset_index(drop=True)
    indices_to_show = np.linspace(0, len(df_sorted) - 1, n, dtype=int)
    df_to_show = df_sorted.iloc[indices_to_show]

    # Grid layout: fixed column count, as many rows as needed (ceil division).
    n_cols = 3
    n_rows = (n + n_cols - 1) // n_cols

    # squeeze=False keeps `axes` a 2-D array for every grid size.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Drop the panels of the last row that have no image assigned.
    for unused_ax in axes[n:]:
        fig.delaxes(unused_ax)

    for ax, (_, row) in zip(axes, df_to_show.iterrows()):
        # image_path is relative to path_img.
        img_path = os.path.join(path_img, row['image_path'])

        if os.path.exists(img_path):
            # The context manager releases the file handle as soon as the pixels
            # have been copied by convert().
            with Image.open(img_path) as img_file:
                img = img_file.convert('RGB')
            ax.imshow(img)
            title = f"ID: {row['sample_id']}\nTarget: {row['target']:.2f}"
            ax.set_title(title, fontsize=10)
        else:
            # Make a missing file visible instead of leaving an unexplained blank.
            ax.set_title(f"Missing file:\n{row['image_path']}", fontsize=10)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Example usage: show 12 images picked at evenly spaced positions of the
# target-sorted training table, using the function's default path_img.
show_images(df, n=12)

In [None]:
from transformers import pipeline

# Build the image-feature-extraction pipeline from the model directory under /kaggle/input.
# Commented-out alternative model identifier (presumably the hub id of the same checkpoint):
#   "facebook/dinov3-convnext-tiny-pretrain-lvd1689m"
pipeline_device = 0 if torch.cuda.is_available() else -1  # GPU 0 if available, otherwise -1
feature_extractor = pipeline(
    task="image-feature-extraction",
    model="/kaggle/input/metadino-v3-convnext/pytorch/default/3/dinov3-convnext-tiny-pretrain-lvd1689m",
    device=pipeline_device,
)

def extract_image_features_pipeline(image_path, feature_extractor, path_data):
    """Extract pooled features from the left and right halves of one image.

    The image is cut vertically at ``width // 2``. Both halves go through
    ``feature_extractor`` in one call with ``pool=True``, and the outputs are
    flattened into a single vector (left half first).

    Args:
        image_path: Image location relative to ``path_data``.
        feature_extractor: Callable accepting ``inputs=[PIL images]`` and
            ``pool=True``; in this notebook a transformers
            image-feature-extraction pipeline.
        path_data: Root directory that ``image_path`` is joined onto.

    Returns:
        list: Flat list of float feature values for the whole image.
    """
    full_image_path = os.path.join(path_data, image_path)

    # Image.open keeps the file open; this function runs once per table row, so
    # close the handle deterministically instead of waiting for garbage collection.
    with Image.open(full_image_path) as img:
        # Split into a left and a right half (assumes wide images, e.g. 2000x1000 ->
        # two 1000x1000 tiles — TODO confirm against the dataset).
        width, height = img.size
        split_x = width // 2
        halves = [
            img.crop((0, 0, split_x, height)),
            img.crop((split_x, 0, width, height)),
        ]

        # Run the extractor while the file is still open so pixel data is readable.
        extracted_features = feature_extractor(inputs=halves, pool=True)

    # Flatten the nested per-half outputs into one plain Python list.
    return np.asarray(extracted_features).ravel().tolist()

print("Image feature extraction function defined using pipeline.")


In [None]:
# Extract features for every row of df, keyed by sample_id.
# If several rows point at the same image_path (df carries a target_name per row,
# which suggests a long-format table — TODO confirm), the vector computed for the
# first occurrence is reused, so each image file is read and run through the model
# only once. With all-unique paths the cache is simply never hit.
image_features = {}
features_by_path = {}
for sample_id, image_path in tqdm(zip(df['sample_id'], df['image_path']), total=len(df)):
    if image_path not in features_by_path:
        features_by_path[image_path] = extract_image_features_pipeline(
            image_path, feature_extractor, PATH_DATA)
    features = features_by_path[image_path]
    # sample_id is the key that later cells merge on.
    image_features[sample_id] = features

print("Image feature extraction complete.")

In [None]:
# Turn the {sample_id: feature list} mapping into a frame: one row per sample_id,
# one img_feature_<i> column per vector element.
image_features_df = pd.DataFrame.from_dict(image_features, orient='index').rename_axis('sample_id')
image_features_df.columns = [f'img_feature_{i}' for i in range(image_features_df.shape[1])]

# Indicator (0/1 integer) columns for each distinct target_name value.
df_one_hot = pd.get_dummies(df['target_name'], prefix='target_name').astype(int)

# Attach the image features by sample_id, then the indicator columns by row index.
# NOTE(review): the second merge aligns on the row index produced by the first
# merge — confirm that it still matches df's own index.
df_combined = (
    df.merge(image_features_df, on='sample_id', how='left')
      .merge(df_one_hot, left_index=True, right_index=True)
)

print("Image features and one-hot encoded target names combined with the original DataFrame.")
display(df_combined.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pool every extracted feature value (all samples, all dimensions) into one Series.
image_features_cols = [name for name in df_combined.columns if name.startswith("img_feature_")]
all_features_flat = image_features_df[image_features_cols].to_numpy().flatten()
all_features_series = pd.Series(all_features_flat)

# One histogram (with KDE curve) over the pooled values.
feat_fig, feat_ax = plt.subplots(figsize=(10, 6))
sns.histplot(all_features_series, kde=True, bins=100, ax=feat_ax)  # bins can be tuned
feat_ax.set_title('Distribution of All Extracted Image Feature Values')
feat_ax.set_xlabel('Feature Value')
feat_ax.set_ylabel('Frequency')
feat_ax.grid(True, alpha=0.3)
plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Tunable settings for the dimensionality reduction.
PCA_N_COMPONENTS = 75   # number of principal components kept
PCA_RANDOM_STATE = 42   # fixed seed so repeated runs give the same projection

# Separate image features and other features
image_feature_cols = [col for col in df_combined.columns if col.startswith('img_feature_')]
other_features_cols = [col for col in df_combined.columns if col.startswith('target_name_')]

X_image_features = df_combined[image_feature_cols]
X_other_features = df_combined[other_features_cols]

# Standardize the image features before applying PCA
scaler = StandardScaler()
X_image_scaled = scaler.fit_transform(X_image_features)

# Apply PCA to reduce dimensionality.
# random_state matters because PCA's automatic solver choice can select a randomized
# SVD for large inputs; without a seed the components differ from run to run.
pca = PCA(n_components=PCA_N_COMPONENTS, random_state=PCA_RANDOM_STATE)
X_image_pca = pca.fit_transform(X_image_scaled)

# Convert the PCA reduced features to a DataFrame.
# Column names are derived from the actual output width rather than from the
# requested n_components setting.
X_image_pca_df = pd.DataFrame(
    X_image_pca, index=df_combined.index,
    columns=[f'pca_img_feature_{i}' for i in range(X_image_pca.shape[1])])

# Combine the PCA reduced image features with the other features
X_combined_pca = pd.concat([X_other_features, X_image_pca_df], axis=1)

print(f"Original number of image features: {X_image_features.shape[1]}")
print(f"Reduced number of image features after PCA: {X_image_pca_df.shape[1]}")
print(f"Total number of features for regression after PCA: {X_combined_pca.shape[1]}")

# Display the first few rows of the combined feature DataFrame after PCA
display(X_combined_pca.head())