In [11]:
!pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D
from sklearn.preprocessing import LabelEncoder
from data_prepare import preprocess_text

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Загрузка данных
df = pd.read_csv('data/labelled_newscatcher_dataset.csv', sep=";")

In [4]:
# Предобработка текстов
df['title'] = df['title'].apply(preprocess_text)

In [5]:
# Векторизация текстов
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['title'])
sequences = tokenizer.texts_to_sequences(df['title'])
padded = pad_sequences(sequences, maxlen=200)

In [6]:
# Преобразование меток
le = LabelEncoder()
labels = le.fit_transform(df['topic'])

In [7]:
# Модель
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    Dense(64, activation='relu'),
    GlobalAveragePooling1D(),
    Dropout(0.2),
    Dense(len(set(labels)), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
model.fit(padded, labels, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f117ee83940>

In [9]:
# Оценка модели
loss, accuracy = model.evaluate(padded, labels)
print(f'Accuracy: {accuracy:.3f}')

Accuracy: 0.839


In [10]:
# Сохранение модели
models_dir = 'models'
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    
model_path = os.path.join(models_dir, 'news_classification_model')
model.save(model_path)

INFO:tensorflow:Assets written to: models/news_classification_model/assets


INFO:tensorflow:Assets written to: models/news_classification_model/assets
