In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import classification_report
import os
import time
import re

In [5]:
df = pd.read_csv("dataset.csv")

In [6]:
# Предобработка данных
df.columns = df.columns.str.replace(' ', '')
df = df.replace('none', np.nan)

def create_text(row):
    symptoms = []
    for col in df.columns:
        if col.startswith('Symptom') and not pd.isna(row[col]):
            symptoms.append(row[col])
    return ' '.join(symptoms)

df['text_description'] = df.apply(create_text, axis=1)

df = df.dropna(subset=['text_description'])

class_counts = df['Disease'].value_counts()
rare_classes = class_counts[class_counts < 5].index
df = df[~df['Disease'].isin(rare_classes)]

le = LabelEncoder()
df['Disease'] = le.fit_transform(df['Disease'])

In [7]:
# Предобработка текста:
def preprocess_text(text):
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('?', ' <QUESTION> ')
    text = text.replace('_', ' ')
    text = text.lower()
    return text

df['text_description'] = df['text_description'].apply(preprocess_text)

In [8]:
# Подготовка данных для BERT
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 128

# Токенизация текста
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize_text(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf'
    )

X = [tokenize_text(text) for text in df['text_description']]
Y = df['Disease'].values

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [10]:
# Подготовка данных для TensorFlow
def create_dataset(tokenized_texts, labels):
    input_ids = [text['input_ids'] for text in tokenized_texts]
    attention_masks = [text['attention_mask'] for text in tokenized_texts]
    input_ids = np.concatenate(input_ids, axis=0)
    attention_masks = np.concatenate(attention_masks, axis=0)
    labels = np.array(labels)

    return tf.data.Dataset.from_tensor_slices(({'input_ids': input_ids, 'attention_mask': attention_masks}, labels))

train_dataset = create_dataset(X_train, y_train).batch(32)
test_dataset = create_dataset(X_test, y_test).batch(32)

In [11]:
# Создание BERT модели
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le.classes_))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Обучение модели
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

epochs = 3

# Замер времени обучения
start_time = time.time()
history = model.fit(train_dataset, epochs=epochs, validation_data=test_dataset, verbose=1)
training_time = time.time() - start_time
print(f"Training Time: {training_time:.4f} seconds")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Training Time: 362.0883 seconds


In [13]:
# Оценка модели
start_time = time.time()
loss, accuracy = model.evaluate(test_dataset, verbose=0)
prediction_time = time.time() - start_time

print(f'Accuracy: {accuracy:.4f}')
print(f"Prediction Time: {prediction_time:.4f} seconds")

Accuracy: 1.0000
Prediction Time: 8.2912 seconds


In [14]:
# Classification Report
y_pred = np.array([])
y_true = np.array([])

for batch in test_dataset:
    inputs = batch[0]
    labels = batch[1]
    predictions = model.predict(inputs)
    y_pred = np.concatenate([y_pred, np.argmax(predictions.logits, axis=-1)])
    y_true = np.concatenate([y_true, labels.numpy()])

print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        24
         1.0       1.00      1.00      1.00        24
         2.0       1.00      1.00      1.00        24
         3.0       1.00      1.00      1.00        24
         4.0       1.00      1.00      1.00        24
         5.0       1.00      1.00      1.00        24
         6.0       1.00      1.00      1.00        24
         7.0       1.00      1.00      1.00        24
         8.0       1.00      1.00      1.00        24
         9.0       1.00      1.00      1.00        24
        10.0       1.00      1.00      1.00        24
        11.0       1.00      1.00      1.00        24
        12.0       1.00      1.00      1.00        24
        13.0       1.00      1.00      1.00        24
        14.0       1.00      1.00      1.00        24
        15.0       1.00      1.00      1.00        24
        16.0       1.00      1.00      1.00        24
        17.0       1.00    

In [15]:
# Сохранение модели
model.save_pretrained("bert_model")

# Определение размера модели
model_size = os.path.getsize("bert_model/config.json") / (1024 * 1024)
model_size+= os.path.getsize("bert_model/tf_model.h5") / (1024 * 1024)
print(f"Model Size: {model_size:.4f} MB")

Model Size: 418.0388 MB
