In [None]:
# Import modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
import cv2 as cv
import re
import nltk
from nltk.tokenize import RegexpTokenizer 
from collections import Counter

from tensorflow import keras
%matplotlib inline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [2]:
# Load the dataset

In [None]:
# Mount Google Drive at /content/gdrive so the dataset archive stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Extract the dataset archive from the mounted Drive into the current working directory.
# NOTE(review): when the files already exist (cell re-run) unzip presumably asks before
# overwriting; consider adding -o (overwrite) or -n (skip existing) — TODO confirm.
!unzip /content/gdrive/MyDrive/neural-network-datasets/text-style-dataset.zip

In [4]:
# Root folder of the dataset (relative path); it is passed to create_dataset().
DATASET = 'text-style-dataset'

In [5]:
def get_normal_array(arrays):
    """Unwrap every item of every row to its first element.

    Args:
        arrays: Iterable of rows, where each row is an iterable of
            indexable items (e.g. one-element lists such as ``[7]``).

    Returns:
        A new list of lists in which each item has been replaced by
        ``item[0]``; row order and item order are preserved.
    """
    return [[item[0] for item in row] for row in arrays]

In [6]:
def pad_arrays(arrays):
    """Right-pad every sequence with ``[0]`` items up to the longest length.

    Args:
        arrays: Iterable of sequences (lists of one-element lists such as
            ``[[3], [1], [4]]``). May be empty and may be a one-shot iterable.

    Returns:
        A new list of lists, all of the same length (the length of the
        longest input sequence). The input sequences are copied, never
        modified. An empty input yields an empty list.
    """
    # Materialise once: this copies each sequence and also makes the function
    # safe for generators, which would otherwise be exhausted by max().
    padded_arrays = [list(seq) for seq in arrays]
    if not padded_arrays:
        # max() of an empty collection raises ValueError; nothing to pad.
        return []

    target_length = max(len(seq) for seq in padded_arrays)
    for seq in padded_arrays:
        # A fresh [0] per slot, so padding items never alias each other.
        seq.extend([0] for _ in range(target_length - len(seq)))
    return padded_arrays

In [7]:
def tokenize_words(input):
    """Normalise a text to lowercase word tokens joined by single spaces.

    Args:
        input: The raw text to normalise.

    Returns:
        One string: the lowercased text's ``\\w+`` matches, in order,
        separated by a single space (punctuation and other non-word
        characters are dropped).
    """
    # Lowercase first so the same word always produces the same token.
    lowered = input.lower()
    words = RegexpTokenizer(r'\w+').tokenize(lowered)
    return " ".join(words)


In [8]:
def create_dataset(DATASET_PATH):
    """Build an integer-encoded, zero-padded dataset from a folder of text files.

    The folder is expected to contain one sub-directory per class; every file
    inside a sub-directory is read as UTF-8 text and labelled with the
    sub-directory name.

    Encoding details:
        * ``tokenize_words`` returns a single space-joined string, so the
          encoding below works per *character* of that string (spaces included).
        * The vocabulary is rebuilt for every file: characters are ranked by
          their frequency in that file and numbered from 1 (most frequent).
        * Shorter samples are right-padded with 0 to the longest sample.

    NOTE(review): because the vocabulary is per file, the same integer can
    stand for different characters in different samples. This looks
    unintended (a corpus-wide vocabulary is the usual choice), but it is kept
    as-is so the model inputs do not change — confirm before altering.

    Args:
        DATASET_PATH: Path of the dataset root folder.

    Returns:
        A tuple ``(text_data_array, class_name)``: a list of equal-length
        lists of ints, and the parallel list of class (directory) names.
    """
    text_data_array = []
    class_name = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore any seeded split of it) reproducible.
    for directory in sorted(os.listdir(DATASET_PATH)):
        class_dir = os.path.join(DATASET_PATH, directory)
        if not os.path.isdir(class_dir):
            # Stray files in the dataset root are not classes.
            continue
        for file_name in sorted(os.listdir(class_dir)):
            text_path = os.path.join(class_dir, file_name)
            # The context manager closes the handle even if reading fails.
            with open(text_path, encoding='utf-8', mode='r') as text_file:
                text = text_file.read()
            # Normalised text: lowercase words joined by single spaces.
            tokens = tokenize_words(text)
            # Rank this file's characters by frequency (ties keep first-seen order).
            symbol_counts = Counter(tokens)
            vocab = sorted(symbol_counts, key=symbol_counts.get, reverse=True)
            vocab_to_int = {symbol: ii for ii, symbol in enumerate(vocab, 1)}
            # One single-element list per character; unwrapped after padding.
            text_data_array.append([[vocab_to_int[symbol]] for symbol in tokens])
            class_name.append(directory)
    text_data_array = pad_arrays(text_data_array)
    text_data_array = get_normal_array(text_data_array)
    return text_data_array, class_name

In [9]:
DATA_TEXT, DATA_LABELS = create_dataset(DATASET) # build the dataset from the folder of text files

In [10]:
# Sorted unique class names; a name's position in this array is its integer id.
class_names = np.unique(DATA_LABELS)
# Lookup table: class name -> integer id.
target_dict_train = {name: idx for idx, name in enumerate(class_names)}
# Replace every textual label with its integer id.
DATA_LABELS = [target_dict_train[label] for label in DATA_LABELS]

In [298]:
# Split into train/test sets and preprocess the data

In [11]:
# Hold out 25% of the samples for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(DATA_TEXT, DATA_LABELS, test_size = 0.25, random_state = 12)
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
# One-hot encode with an explicit width. Without num_classes, to_categorical
# infers the width from the largest id present in each split separately, so
# y_train and y_test could end up with different shapes when one split lacks
# the highest class id.
y_train = to_categorical(y_train, num_classes=len(class_names))
y_test = to_categorical(y_test, num_classes=len(class_names))

In [None]:
# Build the model and add its layers

In [29]:
# Sequence classifier: embedding -> two stacked LSTMs -> class probabilities.
model = tf.keras.Sequential([
    # 691 is the embedding input_dim (vocabulary size); it must be larger than
    # every integer id that appears in the input sequences.
    tf.keras.layers.Embedding(691, 128),
    # return_sequences=True passes the full sequence on to the second LSTM.
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(32),
    # One output per class. Softmax (not sigmoid) yields a probability
    # distribution over mutually exclusive classes, which is what
    # categorical cross-entropy on one-hot labels expects.
    tf.keras.layers.Dense(len(class_names), activation='softmax')
])

In [None]:
# Compile the model

In [30]:
# Adam optimizer, categorical cross-entropy loss (one-hot labels), accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the model summary

In [None]:
# Print the model's summary.
model.summary()

In [None]:
# Train the model

In [None]:
# Train on the training split for 100 epochs; the returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=100,
)

In [None]:
test_loss, test_accuracy = model.evaluate(x_test, y_test) # loss and accuracy on the held-out test data
print('Test accuracy: ', test_accuracy)

In [None]:
# Predict on the test set

In [None]:
# Model outputs for every test sample; row i holds the per-class scores of x_test[i].
predictions = model.predict(x_test)

In [52]:
index_of_prediction = 0 # index of the test-set text sample to inspect (change it to view other predictions)

In [None]:
# Show the class with the highest predicted score for the selected test sample.
print(
    "Neural network's prediction is a",
    class_names[np.argmax(predictions[index_of_prediction])],
)

In [None]:
# Show the true class of the same test sample (decoded from its one-hot label).
print(
    'right answer is a',
    class_names[np.argmax(y_test[index_of_prediction])],
)