In [27]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D
from sklearn.preprocessing import LabelEncoder

In [11]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Загрузка данных
df = pd.read_csv('labelled_newscatcher_dataset.csv', sep=";")

In [12]:
# Предобработка текстов
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

df['title'] = df['title'].apply(preprocess_text)

In [14]:
# Векторизация текстов
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['title'])
sequences = tokenizer.texts_to_sequences(df['title'])
padded = pad_sequences(sequences, maxlen=200)

In [25]:
# Преобразование меток в числовой формат
le = LabelEncoder()
labels = le.fit_transform(df['topic'])
labels = labels.reshape(-1)

In [24]:
labels.reshape(-1)

array([4, 4, 4, ..., 2, 1, 5])

In [28]:
# Создание модели
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    Dense(64, activation='relu'),
    GlobalAveragePooling1D(),
    # Dropout(0.2),
    Dense(len(set(labels)), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [29]:
# Обучение модели
model.fit(padded, labels, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m2720/2720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 27ms/step - accuracy: 0.4382 - loss: 1.5586 - val_accuracy: 0.7137 - val_loss: 0.8348
Epoch 2/5
[1m2720/2720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 25ms/step - accuracy: 0.7631 - loss: 0.7263 - val_accuracy: 0.7700 - val_loss: 0.6974
Epoch 3/5
[1m2720/2720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 26ms/step - accuracy: 0.8074 - loss: 0.5842 - val_accuracy: 0.7771 - val_loss: 0.6728
Epoch 4/5
[1m2720/2720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 25ms/step - accuracy: 0.8290 - loss: 0.5153 - val_accuracy: 0.7817 - val_loss: 0.6631
Epoch 5/5
[1m2720/2720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 26ms/step - accuracy: 0.8424 - loss: 0.4772 - val_accuracy: 0.7796 - val_loss: 0.6777


<keras.src.callbacks.history.History at 0x79e96ad40d90>

In [30]:
# Оценка модели
loss, accuracy = model.evaluate(padded, labels)
print(f'Accuracy: {accuracy:.3f}')

[1m3400/3400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - accuracy: 0.8655 - loss: 0.4193
Accuracy: 0.845
