In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical
import time
import re
import os

In [11]:
df = pd.read_csv("dataset.csv")

In [12]:
# 2. Предобработка данных
df.columns = df.columns.str.replace(' ', '')
df = df.replace('none', np.nan)

def create_text(row):
    symptoms = []
    for col in df.columns:
        if col.startswith('Symptom') and not pd.isna(row[col]):
            symptoms.append(row[col])
    return ' '.join(symptoms)

df['text_description'] = df.apply(create_text, axis=1)

df = df.dropna(subset=['text_description'])

class_counts = df['Disease'].value_counts()
rare_classes = class_counts[class_counts < 5].index
df = df[~df['Disease'].isin(rare_classes)]

le = LabelEncoder()
df['Disease'] = le.fit_transform(df['Disease'])

In [13]:
# 3. Предобработка текста:
def preprocess_text(text):
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('?', ' <QUESTION> ')
    text = text.replace('_', ' ')
    text = text.lower()
    return text

df['text_description'] = df['text_description'].apply(preprocess_text)

In [14]:
# 4. Подготовка данных для GRU
MAX_NB_WORDS = 30000
MAX_SEQUENCE_LENGTH = 300
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="<UNK>")
tokenizer.fit_on_texts(df['text_description'])

X = tokenizer.texts_to_sequences(df['text_description'])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
Y = df['Disease']

In [15]:
# 5. Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
y_train = to_categorical(y_train, num_classes=len(le.classes_))
y_test = to_categorical(y_test, num_classes=len(le.classes_))

In [16]:
# 6. Создание GRU модели
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(GRU(128))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', Precision(), Recall()])
print(model.summary())



None


In [17]:
# 7. Обучение модели
epochs = 3
batch_size = 32

# Замер времени обучения
start_time = time.time()
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1)
training_time = time.time() - start_time
print(f"Training Time: {training_time:.4f} seconds")

Epoch 1/3
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.5603 - loss: 3.0373 - precision_1: 0.4004 - recall_1: 0.0406 - val_accuracy: 0.9924 - val_loss: 0.2405 - val_precision_1: 1.0000 - val_recall_1: 0.9442
Epoch 2/3
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9940 - loss: 0.1348 - precision_1: 0.9975 - recall_1: 0.9804 - val_accuracy: 0.9975 - val_loss: 0.0413 - val_precision_1: 0.9975 - val_recall_1: 0.9949
Epoch 3/3
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9972 - loss: 0.0332 - precision_1: 0.9972 - recall_1: 0.9961 - val_accuracy: 0.9975 - val_loss: 0.0189 - val_precision_1: 0.9975 - val_recall_1: 0.9975
Training Time: 9.9229 seconds


In [18]:
# 8. Оценка модели
start_time = time.time()
loss, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
prediction_time = time.time() - start_time

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f"Prediction Time: {prediction_time:.4f} seconds")

Accuracy: 0.9980
Precision: 0.9980
Recall: 0.9980
Prediction Time: 0.3793 seconds


In [19]:
# 9. Classification Report
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
print(classification_report(y_test_labels, y_pred, zero_division=0))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        24
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        24
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        24
           9       0.92      1.00      0.96        24
          10       1.00      1.00      1.00        24
          11       1.00      1.00      1.00        24
          12       1.00      1.00      1.00        24
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        24
       

In [20]:
# Сохранение модели
model.save("gru_model.h5")

# Определение размера модели
model_size = os.path.getsize("gru_model.h5") / (1024 * 1024)
print(f"Model Size: {model_size:.4f} MB")



Model Size: 35.4366 MB


In [21]:
import pickle

# Сохранение токенизатора
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Сохранение LabelEncoder
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(le, handle, protocol=pickle.HIGHEST_PROTOCOL)
