Competion Descruption (TBA)

In [1]:
# libraries import
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams
from nltk.tokenize import word_tokenize, ToktokTokenizer
import nltk

import spacy

from sklearn.metrics import f1_score

from string import punctuation
from nltk.corpus import stopwords
import re

KeyboardInterrupt: 

In [2]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [3]:
# data load
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

Let's have a look at the data:

In [None]:
train_df.sample(5)

In [None]:
test_df.sample(5)

In [None]:
sample_submission.sample(5)

In [None]:
train_df.shape

In [None]:
test_df.shape

Let's split train_df into train and validation sets:

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df['text'], train_df['target'])

## Basemodel: CountVectorizer

In [None]:
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(X_train)

In [None]:
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)

In [None]:
y_pred = clf.predict(vec.transform(X_valid))

In [None]:
f1_score(y_valid, y_pred)

## TfidfVectorizer

In [None]:
vec = TfidfVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(X_train)

In [None]:
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)

In [None]:
y_pred = clf.predict(vec.transform(X_valid))

In [None]:
f1_score(y_valid, y_pred)

## Tokenization and Lemmatization

In [None]:
tt_tok = ToktokTokenizer()

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
noise = stopwords.words('english') + list(punctuation)

In [None]:
def my_preproc(text):
    text = re.sub(f"[{punctuation}]", "", text)
    doc = nlp(text)
    return [
        token.lemma_ 
        for token in doc 
        if not token.is_punct 
        and token.lemma_ not in noise 
        and token.lemma_.strip() != ""
    ]

In [None]:
# my_preproc('The dogs are running fast.')

In [None]:
vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer=my_preproc)
bow = vec.fit_transform(X_train)

In [None]:
clf = LogisticRegression(random_state=42)
clf.fit(bow, y_train)

In [None]:
y_pred = clf.predict(vec.transform(X_valid))

In [None]:
f1_score(y_valid, y_pred)

In [None]:
X_test = test_df['text']

In [None]:
y_pred = clf.predict(vec.transform(X_test))

In [None]:
output = pd.concat([test_df['id'], pd.Series(y_pred)], axis=1)
output.columns = ['id', 'target']
output.head()

In [None]:
# output.to_csv("submission.csv", index=False)

## Embeddings: Word2Vec

### Training a Skip-Gram Model

In [None]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # keep only letters
    tokens = word_tokenize(text)
    return tokens

X_train_tokens = [preprocess(text) for text in X_train]
X_valid_tokens = [preprocess(text) for text in X_valid]

In [None]:
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,     # you can try 200 or 300
    window=5,            # context window
    min_count=2,         # ignore rare words
    workers=4,
    sg=1                 # 1 for skip-gram, 0 for CBOW
)

In [None]:
w2v_model.save("word2vec_tweets.model")

In [None]:
def tweet_to_vec(tokens, model, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word].reshape((1, size))
            count += 1
    if count != 0:
        vec /= count
    return vec

SIZE = 100
X_train_vecs = np.concatenate([tweet_to_vec(tokens, w2v_model, SIZE) for tokens in X_train_tokens])
X_valid_vecs = np.concatenate([tweet_to_vec(tokens, w2v_model, SIZE) for tokens in X_valid_tokens])

In [None]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vecs, y_train)

y_pred = clf.predict(X_valid_vecs)
print("F1-score:", f1_score(y_valid, y_pred))

### GloVe

In [None]:
embeddings_index = {}
with open("/kaggle/input/glove-6b-300d/glove.6B.300d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

print("Loaded %d word vectors." % len(embeddings_index))

In [None]:
def tweet_to_vec(tokens, embeddings, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        if word in embeddings:
            vec += embeddings[word].reshape((1, size))
            count += 1
    if count != 0:
        vec /= count
    return vec


size = 300  # because we're using GloVe 100d

X_train_vecs = np.concatenate([tweet_to_vec(tokens, embeddings_index, size) for tokens in X_train_tokens])
X_valid_vecs = np.concatenate([tweet_to_vec(tokens, embeddings_index, size) for tokens in X_valid_tokens])

In [None]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vecs, y_train)
y_pred = clf.predict(X_valid_vecs)
print("F1-score:", f1_score(y_valid, y_pred))

## Deep Learning Models: CNN

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# 1. Настроим токенайзер
max_words = 20000   # словарь: топ-20k самых частых слов
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# 2. Превратим тексты в последовательности чисел
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)

# 3. Паддинг до одинаковой длины
max_len = 50  # можно подобрать по распределению длин
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_len, padding='post')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [None]:
embedding_dim = 100  # размерность эмбеддингов

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # бинарная классификация
])

In [None]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_valid_pad, y_valid)
)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

In [None]:
# Предсказания
y_pred_prob = model.predict(X_valid_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

# F1-score
f1 = f1_score(y_valid, y_pred)
print("F1-score:", f1)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

probs = model.predict(X_valid_pad).ravel()
ths = np.linspace(0.1, 0.9, 81)
best = max(((t, f1_score(y_valid, (probs>=t).astype(int))) for t in ths), key=lambda x: x[1])
best_threshold, best_f1 = best
print(best_threshold, best_f1)

In [None]:
#2

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
import string 
nltk.download('wordnet')

In [None]:
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    # лемматизация
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    return text

In [None]:
train_df['text_clean'] = train_df['text'].apply(clean_text)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['text_clean'],
    train_df['target'],
    test_size=0.2,
    random_state=42
)

In [None]:
# ========================================
# 1️⃣ Импорты
# ========================================
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# ========================================
# 2️⃣ Предобработка текста (создаём копию!)
# ========================================
def clean_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)            # удаляем ссылки
    text = re.sub(r'@\w+', '', text)                      # убираем упоминания
    text = re.sub(r'#\w+', '', text)                      # убираем хэштеги
    text = text.translate(str.maketrans('', '', string.punctuation))  # пунктуация
    text = re.sub(r'\d+', '', text)                       # цифры
    text = text.lower().strip()                          # нижний регистр
    text = re.sub(r'\s+', ' ', text)                     # лишние пробелы
    return text

train_df['text_clean'] = train_df['text'].apply(clean_text)

# ========================================
# 3️⃣ train/valid split
# ========================================
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['text_clean'],
    train_df['target'],
    test_size=0.2,
    random_state=42
)

# ========================================
# 4️⃣ Токенизация и паддинг
# ========================================
max_words = 20000
max_len = 50

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_len, padding='post')

# ========================================
# 5️⃣ Архитектура TextCNN
# ========================================
embedding_dim = 100

inp = Input(shape=(max_len,))
emb = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len)(inp)

# три свёртки с разными окнами
conv_blocks = []
for kernel_size in [3, 4, 5]:
    conv = Conv1D(filters=128, kernel_size=kernel_size, activation='relu', padding='valid')(emb)
    pool = GlobalMaxPooling1D()(conv)
    conv_blocks.append(pool)

x = Concatenate()(conv_blocks)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inp, out)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

# ========================================
# 6️⃣ Колбэки (ранняя остановка + LR scheduler)
# ========================================
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-5)
]

# ========================================
# 7️⃣ Обучение модели
# ========================================
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_valid_pad, y_valid),
    epochs=10,
    batch_size=64,
    callbacks=callbacks,
    verbose=1
)

# ========================================
# 8️⃣ Предсказания и подбор оптимального порога
# ========================================
y_prob = model.predict(X_valid_pad).ravel()

best_f1, best_thr = 0, 0
for thr in np.linspace(0.1, 0.9, 81):
    f1 = f1_score(y_valid, (y_prob >= thr).astype(int))
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"✅ Лучший порог: {best_thr:.2f}")
print(f"✅ Лучший F1-score: {best_f1:.4f}")

# ========================================
# 9️⃣ Финальная метрика
# ========================================
y_pred = (y_prob >= best_thr).astype(int)
print("F1-score (final):", f1_score(y_valid, y_pred))


## Fine Tuning Bert

In [5]:
# =========================================================
# 0) Установка библиотек (в Kaggle ноутбуке раскомментируй)
# =========================================================
!pip -q install transformers==4.44.2 datasets==3.0.1 accelerate==0.34.2

import re, os, random, math, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report

import torch
from torch.utils.data import Dataset

from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

[0m[31mERROR: Could not find a version that satisfies the requirement transformers==4.44.2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for transformers==4.44.2[0m[31m
[0m

2025-10-07 20:22:21.357679: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759868541.391606     741 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759868541.401945     741 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
SEED = 42
def set_seed(seed=SEED):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
set_seed()

# =========================================================
# 2) Мягкая очистка твитов (не трогаем оригинал)
#    Важно: ссылки/упоминания -> специальные токены (не удаляем!)
# =========================================================
URL_TOKEN  = "__URL__"
USER_TOKEN = "__USER__"

def clean_tweet(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # Превращаем ссылки/упоминания в токены
    text = re.sub(r'http\S+|www\S+', URL_TOKEN, text)
    text = re.sub(r'@\w+', USER_TOKEN, text)
    # Оставляем хэштеги как слова без '#'
    text = re.sub(r'#', '', text)
    # Нормализуем пробелы, нижний регистр
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df["text_clean"] = train_df["text"].apply(clean_tweet)
test_df["text_clean"]  = test_df["text"].apply(clean_tweet)

# =========================================================
# 3) Сплит (если свой уже есть — можно использовать его)
# =========================================================
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df["text_clean"], train_df["target"],
    test_size=0.15, random_state=SEED, stratify=train_df["target"]
)

# =========================================================
# 4) Токенизация под твиты (BERTweet)
# =========================================================
MODEL_PATH = "/kaggle/input/bertweet/bertweet-local"   # альтернатива: "roberta-base" / "microsoft/deberta-v3-base"

# У BERTweet есть нормализатор эмодзи/твит-специфики в токенизаторе
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)

MAX_LEN = 96  # 80–112 обычно оптимально для твитов

def tokenize_texts(texts):
    return tokenizer(
        list(texts),
        padding=False,              # паддинг сделает collator
        truncation=True,
        max_length=MAX_LEN,
        return_attention_mask=True
    )

train_enc = tokenize_texts(X_train)
valid_enc = tokenize_texts(X_valid)
test_enc  = tokenize_texts(test_df["text_clean"])

# =========================================================
# 5) Torch Dataset
# =========================================================
class TxtDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k,v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels.iloc[idx]))
        return item

ds_train = TxtDataset(train_enc, y_train.reset_index(drop=True))
ds_valid = TxtDataset(valid_enc, y_valid.reset_index(drop=True))
ds_test  = TxtDataset(test_enc,  None)

# =========================================================
# 6) Модель
# =========================================================
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=num_labels
)

# =========================================================
# 7) Метрики: F1 по валидации (дальше подберём порог)
# =========================================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()
    preds = (probs >= 0.5).astype(int)  # временно 0.5
    return {"f1": f1_score(labels, preds)}

# =========================================================
# 8) Тренировка
# =========================================================
BATCH_SIZE = 32
EPOCHS = 4
LR = 2e-5

# Data collator сам паддит до батча
collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir="./checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to=[]  # отключим wandb/tensorboard по умолчанию
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)

trainer.train()

# =========================================================
# 9) Подбор Порога по F1 на валидации
# =========================================================
valid_logits = trainer.predict(ds_valid).predictions
valid_probs  = torch.softmax(torch.tensor(valid_logits), dim=1)[:, 1].numpy()

best_thr, best_f1 = 0.5, 0.0
for thr in np.linspace(0.1, 0.9, 161):
    f1 = f1_score(y_valid.values, (valid_probs >= thr).astype(int))
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"Best F1={best_f1:.4f} @ threshold={best_thr:.3f}")

print(classification_report(
    y_valid.values, (valid_probs >= best_thr).astype(int), digits=4)
)

# =========================================================
# 10) Обучение на полном train (опционально для сабмита)
#     Склеим train+valid, дообучим 1 эпоху c маленьким LR
# =========================================================
FULL_FINETUNE = True
if FULL_FINETUNE:
    X_full = pd.concat([X_train, X_valid], ignore_index=True)
    y_full = pd.concat([y_train, y_valid], ignore_index=True)

    full_enc = tokenize_texts(X_full)
    ds_full  = TxtDataset(full_enc, y_full)

    # уменьшим LR и эпоху — модель уже натренирована
    args_full = TrainingArguments(
        output_dir="./checkpoints_full",
        per_device_train_batch_size=BATCH_SIZE,
        num_train_epochs=1,
        learning_rate=1e-5,
        warmup_ratio=0.0,
        weight_decay=0.01,
        logging_steps=50,
        fp16=torch.cuda.is_available(),
        evaluation_strategy="no",
        save_strategy="no",
        report_to=[]
    )

    trainer_full = Trainer(
        model=trainer.model,  # берём лучшую модель
        args=args_full,
        train_dataset=ds_full,
        tokenizer=tokenizer,
        data_collator=collator
    )
    trainer_full.train()

# =========================================================
# 11) Предсказания на test и формирование submission.csv
# =========================================================
test_logits = trainer.model(**collator(ds_test[:])).logits.detach().cpu().numpy() \
    if len(ds_test) < BATCH_SIZE else trainer.predict(ds_test).predictions
test_probs  = torch.softmax(torch.tensor(test_logits), dim=1)[:, 1].numpy()
test_preds  = (test_probs >= best_thr).astype(int)

submission = pd.DataFrame({
    "id": test_df["id"],
    "target": test_preds
})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
!pip install -U transformers