In [1]:
# https://www.kaggle.com/competitions/playground-joai-competition-2025
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from PIL import Image

In [2]:
# Competition inputs, in order: labelled training rows, unlabelled test rows,
# and the submission template that the predictions are written into.
train, test, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-joai-competition-2025/train.csv",
        "/kaggle/input/playground-joai-competition-2025/test.csv",
        "/kaggle/input/playground-joai-competition-2025/sample_submission.csv",
    )
)

In [3]:
# Mapping between the string labels in the 'Gas' column and the integer
# class ids the model is trained on.
class2idx = {
    'NoGas': 0,
    'Perfume': 1,
    'Smoke': 2,
    'Mixture': 3
}

# Inverse mapping, used to turn predicted class ids back into label strings.
idx2class = {v: k for k, v in class2idx.items()}

# The raw 'Caption' column is later passed to CatBoost as a text feature, so
# train and test must be cased the same way. Lower-casing only the training
# captions would give the model differently normalised text at inference time.
train['Caption'] = train['Caption'].str.lower()
test['Caption'] = test['Caption'].str.lower()

# Encode the labels only while they are still strings. Without this guard,
# re-running the cell would map the already-encoded integers through
# class2idx and turn every label into NaN.
if not train['Gas'].isin(list(idx2class)).all():
    gas_encoded = train['Gas'].map(class2idx)
    if gas_encoded.isna().any():
        # Fail loudly instead of silently training on NaN targets.
        unknown_labels = sorted(
            train.loc[gas_encoded.isna(), 'Gas'].astype(str).unique()
        )
        raise ValueError(f"Unexpected 'Gas' labels in train: {unknown_labels}")
    train['Gas'] = gas_encoded.astype(int)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the TF-IDF vocabulary size; also the number of f_* columns
# that downstream cells expect to find.
MAX_FEATURES = 256
tfidf_cols = [f"f_{i}" for i in range(MAX_FEATURES)]

vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)

# NOTE(review): the vectorizer is fitted on every training row, including the
# rows that later become the validation split, so the validation score can be
# slightly optimistic.
vectorizer.fit(train['Caption'])

# Build the dense TF-IDF frame on train's own index so the column-wise concat
# below cannot misalign rows if train's index is not a default RangeIndex.
train_tfidf = pd.DataFrame(
    vectorizer.transform(train['Caption']).toarray(),
    index=train.index,
)
# max_features is only an upper bound: name the columns that actually exist,
# then pad with all-zero columns up to MAX_FEATURES so the f_* schema is fixed.
train_tfidf.columns = [f"f_{i}" for i in range(train_tfidf.shape[1])]
train_tfidf = train_tfidf.reindex(columns=tfidf_cols, fill_value=0.0)

# Drop f_* columns left over from a previous execution of this cell, so that
# re-running it does not append a second, duplicate set of features.
train = pd.concat(
    [train.drop(columns=tfidf_cols, errors='ignore'), train_tfidf],
    axis=1,
)

In [5]:
from sklearn.model_selection import train_test_split

# Columns handed to CatBoost as free text (the rest are treated as numeric).
text_features = ['Caption']

# Model inputs: the two sensor readings, the raw caption, then the TF-IDF columns.
features = ['MQ8', 'MQ5'] + text_features + [f"f_{i}" for i in range(MAX_FEATURES)]

X = train[features]
y = train['Gas']

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [6]:
from catboost import Pool

# Wrap each split in a CatBoost Pool, declaring which columns hold free text.
train_pool, valid_pool = (
    Pool(split_X, split_y, text_features=text_features)
    for split_X, split_y in ((X_train, y_train), (X_valid, y_valid))
)

In [7]:
from catboost import CatBoostClassifier

# Multi-class CatBoost model; metric_period=10 makes the TotalF1 log line
# appear every tenth iteration rather than on each one.
model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='TotalF1',
    iterations=100,
    max_depth=8,
    metric_period=10,
)

# The validation pool is supplied as the evaluation set so that the metric is
# reported on held-out rows alongside the training rows.
model.fit(train_pool, eval_set=valid_pool)

Learning rate set to 0.268919
0:	learn: 0.8348771	test: 0.8197187	best: 0.8197187 (0)	total: 1.99s	remaining: 3m 16s
10:	learn: 0.9393562	test: 0.9224030	best: 0.9224030 (10)	total: 21s	remaining: 2m 50s
20:	learn: 0.9508767	test: 0.9293082	best: 0.9293082 (20)	total: 40s	remaining: 2m 30s
30:	learn: 0.9588986	test: 0.9395141	best: 0.9395141 (30)	total: 59s	remaining: 2m 11s
40:	learn: 0.9648615	test: 0.9479041	best: 0.9479041 (40)	total: 1m 18s	remaining: 1m 52s
50:	learn: 0.9710600	test: 0.9495600	best: 0.9495600 (50)	total: 1m 36s	remaining: 1m 33s
60:	learn: 0.9753003	test: 0.9547964	best: 0.9547964 (60)	total: 1m 55s	remaining: 1m 14s
70:	learn: 0.9774212	test: 0.9512578	best: 0.9547964 (60)	total: 2m 15s	remaining: 55.1s
80:	learn: 0.9808963	test: 0.9547964	best: 0.9547964 (60)	total: 2m 34s	remaining: 36.1s
90:	learn: 0.9822484	test: 0.9513282	best: 0.9547964 (60)	total: 2m 52s	remaining: 17.1s
99:	learn: 0.9845691	test: 0.9548576	best: 0.9548576 (99)	total: 3m 10s	remaining: 0us

<catboost.core.CatBoostClassifier at 0x79332364c5d0>

In [8]:
from sklearn.metrics import f1_score

# Score the held-out split with macro-averaged F1 (every class weighs equally).
preds = model.predict(X_valid)
f1 = f1_score(y_true=y_valid, y_pred=preds, average='macro')
print(f"F1 Score: {f1:.5f}")

F1 Score: 0.95493


In [11]:
# Names of the TF-IDF feature columns the model was trained on.
tfidf_cols = [f"f_{i}" for i in range(MAX_FEATURES)]

# Transform the test captions with the vectorizer fitted on train. The frame
# is built on test's own index so the column-wise concat below cannot
# misalign rows.
test_tfidf = pd.DataFrame(
    vectorizer.transform(test['Caption']).toarray(),
    index=test.index,
)
# max_features is only an upper bound on the vocabulary: name the columns that
# actually exist, then pad with all-zero columns up to MAX_FEATURES.
test_tfidf.columns = [f"f_{i}" for i in range(test_tfidf.shape[1])]
test_tfidf = test_tfidf.reindex(columns=tfidf_cols, fill_value=0.0)

# Drop f_* columns left over from a previous execution of this cell, so that
# re-running it does not leave duplicate feature columns in `test`.
test = pd.concat(
    [test.drop(columns=tfidf_cols, errors='ignore'), test_tfidf],
    axis=1,
)

In [13]:
# Select the same feature columns, in the same order, as used for training.
X_test = test[features]

predictions = model.predict(X_test)

# Flatten the prediction array to one class id per row and translate each id
# back to its label string.
# NOTE(review): assumes sample_submission rows are in the same order as test rows - confirm.
subm['Gas'] = [idx2class.get(class_id) for class_id in predictions.flatten().tolist()]

subm.to_csv("submission.csv", index=False)

subm

Unnamed: 0,index,Gas
0,0,Smoke
1,1,Perfume
2,2,NoGas
3,3,NoGas
4,4,Mixture
...,...,...
635,635,Smoke
636,636,Perfume
637,637,Perfume
638,638,Smoke


In [14]:
# Sanity check: distribution of predicted classes, most frequent first.
subm['Gas'].value_counts(sort=True, ascending=False)

Gas
Perfume    171
Smoke      161
Mixture    160
NoGas      148
Name: count, dtype: int64