In [1]:
!pip install pandas scikit-learn transformers datasets torch tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch

In [3]:
df_nb = pd.read_csv("sentences.csv")

In [4]:
len(df_nb)

1348

In [5]:
df_nb.head()

Unnamed: 0,sentence,highlight,category,subcategory,polarity
0,The people in the United Kingdom will cease to...,NONE,NONE,NONE,NONE
1,The goverment claim that it will not effect hu...,NONE,NONE,NONE,NONE
2,The recent experiment on mice which supposidly...,not seen as good evidence,APPRECIATION,REACTION,NEG
3,"If the papers keep plugging away, saying ""beef...",beef is bad for you,APPRECIATION,REACTION,NEG
4,It is in my view therefore that it is press bl...,blowing the situation out of all proportions,JUDGEMENT,PROPRIETY,NEG


In [6]:
# Убираем половину случайных предложений без оценки из выборки
df_none = df_nb[df_nb['subcategory'] == 'NONE']
df_other = df_nb[df_nb['subcategory'] != 'NONE']

df_none_reduced = df_none.sample(frac=0.5, random_state=42)
df = pd.concat([df_none_reduced, df_other]).sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,sentence,highlight,category,subcategory,polarity
0,Fox hunting is a 'bloodsport' and in my opinio...,bloodsport,JUDGEMENT,PROPRIETY,NEG
1,£ billions are spent by the government alone t...,NONE,NONE,NONE,NONE
2,As well as the pointlessness of fox hunting it...,too brutal,JUDGEMENT,PROPRIETY,NEG
3,"Although the magnitude of the rôle is, perhaps...",can inspire much excitement,JUDGEMENT,CAPACITY,POS
4,"Therefore, the human brain whilst dependent up...",is able to establish more lines of thought and...,JUDGEMENT,CAPACITY,POS


In [8]:
# Если есть пропуски -- заполняем как NONE
df['sentence'] = df['sentence'].fillna("")
df['category'] = df['category'].fillna("NONE")
df['subcategory'] = df['subcategory'].fillna("NONE")

In [9]:
le_cat = LabelEncoder()
X_cat = df['sentence']
y_cat = le_cat.fit_transform(df['category'])

In [10]:
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_cat, y_cat, test_size=0.2, stratify=y_cat, random_state=42
)

In [11]:
# Модель для baseline-1: определение категорий
category_model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
category_model.fit(X_train_cat, y_train_cat)

In [12]:
# Модель для baseline-1: определение подкатегорий Judgement
df_jud = df[df['category'] == 'JUDGEMENT'] # Обучаем только на предложениях J
le_jud = LabelEncoder()
X_jud = df_jud['sentence']
y_jud = le_jud.fit_transform(df_jud['subcategory'])

print(df_jud['subcategory'].value_counts())


X_train_jud, X_test_jud, y_train_jud, y_test_jud = train_test_split(
    X_jud, y_jud, test_size=0.2, stratify=y_jud, random_state=42
)
judgement_model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
judgement_model.fit(X_train_jud, y_train_jud)

subcategory
CAPACITY     140
PROPRIETY    135
TENACITY      34
VERACITY      14
NORMALITY     12
Name: count, dtype: int64


In [13]:
# Модель для baseline-1: определение подкатегорий Appreciation
df_app = df[df['category'] == 'APPRECIATION']
le_app = LabelEncoder()
X_app = df_app['sentence']
y_app = le_app.fit_transform(df_app['subcategory'])

print(df_app['subcategory'].value_counts())

X_train_app, X_test_app, y_train_app, y_test_app = train_test_split(
    X_app, y_app, test_size=0.2, stratify=y_app, random_state=42
)

appreciation_model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
appreciation_model.fit(X_train_app, y_train_app)

subcategory
REACTION       192
WORTH          170
COMPOSITION     21
Name: count, dtype: int64


In [14]:
# Функция алгоритма для первого подхода
def classify(sentence):
    cat_pred = category_model.predict([sentence])[0]
    category_label = le_cat.inverse_transform([cat_pred])[0]

    if category_label == 'JUDGEMENT':
        sub_pred = judgement_model.predict([sentence])[0]
        subcategory_label = le_jud.inverse_transform([sub_pred])[0]
    elif category_label == 'APPRECIATION':
        sub_pred = appreciation_model.predict([sentence])[0]
        subcategory_label = le_app.inverse_transform([sub_pred])[0]
    else:
        subcategory_label = 'NONE'

    return category_label, subcategory_label

In [15]:
df_test = df.loc[X_test_cat.index].copy()
df_test['sentence'] = X_test_cat
df_test['category'] = le_cat.inverse_transform(y_test_cat)

In [16]:
df_test['predicted_category'], df_test['predicted_subcategory'] = zip(*df_test['sentence'].map(classify))

In [17]:
df_test.to_csv("test_predictions.csv", index=False)

print(classification_report(df_test['category'], df_test['predicted_category']))

mask = df_test['category'] != 'NONE'
print(classification_report(
    df_test[mask]['subcategory'],
    df_test[mask]['predicted_subcategory']
))

              precision    recall  f1-score   support

APPRECIATION       0.44      0.49      0.47        77
   JUDGEMENT       0.51      0.52      0.51        67
        NONE       0.40      0.33      0.36        64

    accuracy                           0.45       208
   macro avg       0.45      0.45      0.45       208
weighted avg       0.45      0.45      0.45       208

              precision    recall  f1-score   support

    CAPACITY       0.43      0.43      0.43        21
 COMPOSITION       0.00      0.00      0.00         3
        NONE       0.00      0.00      0.00         0
   NORMALITY       0.50      0.50      0.50         2
   PROPRIETY       0.78      0.54      0.64        39
    REACTION       0.58      0.43      0.49        44
    TENACITY       0.25      0.33      0.29         3
    VERACITY       0.00      0.00      0.00         2
       WORTH       0.64      0.53      0.58        30

    accuracy                           0.47       144
   macro avg       0.35

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Второй подход: subcategory + NONE
X = df['sentence']
y = df['subcategory'].fillna('NONE')

le_sub = LabelEncoder()
y_enc = le_sub.fit_transform(y)

# Используем тот же train/test split
X_train_sub = X_train_cat
X_test_sub = X_test_cat
y_train_sub = le_sub.transform(df.loc[X_train_cat.index, 'subcategory'].fillna('NONE'))
y_test_sub = le_sub.transform(df.loc[X_test_cat.index, 'subcategory'].fillna('NONE'))

subcategory_model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
subcategory_model.fit(X_train_sub, y_train_sub)

In [19]:
y_pred_sub = subcategory_model.predict(X_test_sub)

print(classification_report(y_test_sub, y_pred_sub, target_names=le_sub.classes_))


              precision    recall  f1-score   support

    CAPACITY       0.29      0.38      0.33        21
 COMPOSITION       0.00      0.00      0.00         3
        NONE       0.40      0.33      0.36        64
   NORMALITY       0.00      0.00      0.00         2
   PROPRIETY       0.53      0.44      0.48        39
    REACTION       0.25      0.27      0.26        44
    TENACITY       0.00      0.00      0.00         3
    VERACITY       0.00      0.00      0.00         2
       WORTH       0.26      0.37      0.31        30

    accuracy                           0.33       208
   macro avg       0.19      0.20      0.19       208
weighted avg       0.34      0.33      0.33       208



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RoBERTA

In [1]:
pip install numpy==1.26.4 # Без этого падал trainer

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but yo

In [20]:
# Random oversampling
train_df = pd.DataFrame({
    'text': X_train_cat.astype(str),
    'label': df.loc[X_train_cat.index, 'subcategory'].fillna('NONE').astype(str)
})

max_size = train_df['label'].value_counts().max()
balanced_train_df = pd.concat([
    resample(group, replace=True, n_samples=max_size, random_state=42)
    for _, group in train_df.groupby('label')
])

balanced_train_df = balanced_train_df.sample(frac=1, random_state=42).reset_index(drop=True)

X_train_balanced = balanced_train_df['text'].tolist()
y_train_balanced = balanced_train_df['label'].tolist()
y_train_enc = le_sub.transform(y_train_balanced)

X_test_sub = X_test_cat.astype(str).tolist()
y_test_raw = df.loc[X_test_cat.index, 'subcategory'].fillna('NONE').astype(str)
y_test_enc = le_sub.transform(y_test_raw)


tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(le_sub.classes_)
)


train_dataset = Dataset.from_dict({
    "text": X_train_balanced,
    "labels": y_train_enc
})
test_dataset = Dataset.from_dict({
    "text": X_test_sub,
    "labels": y_test_enc
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def tokenize(batch):
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tokenized["labels"] = batch["labels"]
    return tokenized

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

training_args = TrainingArguments(
    output_dir="./results_roberta_oversampled",
    num_train_epochs=10, # Пробовала сначала 5, потом 10 -- разницы нет
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=4e-5,
    weight_decay=0.01,
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Map:   0%|          | 0/2295 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Step,Training Loss
10,2.2093
20,2.2018
30,2.1589
40,2.0658
50,1.7584
60,1.447
70,1.37
80,1.1164
90,1.0435
100,0.9457


TrainOutput(global_step=1440, training_loss=0.2036355051250818, metrics={'train_runtime': 530.1238, 'train_samples_per_second': 43.292, 'train_steps_per_second': 2.716, 'total_flos': 1509694558732800.0, 'train_loss': 0.2036355051250818, 'epoch': 10.0})

In [24]:
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)

y_pred_labels = le_sub.inverse_transform(y_pred)
y_test_labels = le_sub.inverse_transform(y_test_enc)

y_pred_labels = [str(label) for label in y_pred_labels]
y_test_labels = [str(label) for label in y_test_labels]
class_order = le_sub.classes_.tolist()

print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=class_order,
    target_names=class_order
))


              precision    recall  f1-score   support

    CAPACITY       0.33      0.48      0.39        21
 COMPOSITION       0.00      0.00      0.00         3
        NONE       0.58      0.61      0.60        64
   NORMALITY       0.00      0.00      0.00         2
   PROPRIETY       0.73      0.56      0.64        39
    REACTION       0.34      0.34      0.34        44
    TENACITY       0.00      0.00      0.00         3
    VERACITY       0.00      0.00      0.00         2
       WORTH       0.39      0.47      0.42        30

    accuracy                           0.48       208
   macro avg       0.26      0.27      0.27       208
weighted avg       0.48      0.48      0.48       208



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
