In [18]:
import pandas as pd
import numpy as np
import tensorflow.compat.v1 as tf
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

In [25]:
vocab = Counter()

text = "Добрый день. У ТН 2302177 рассчитались огромные суммы, а именно: сверхурочка (ВО1010, 1011), прошу указать причины такой ошибки, чтобы я перенаправила заказчику для исправления."

for word in text.split(' '):
    word_lowercase = word.lower()
    vocab[word_lowercase]+=1
        
def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word] = i
        
    return word2index

In [8]:
word2index = get_word_2_index(vocab)

total_words = len(vocab)
matrix = np.zeros((total_words),dtype=float)

for word in text.split():
    matrix[word2index[word.lower()]] += 1
    
print(text, matrix)

Добрый день. У ТН 2302177 рассчитались огромные суммы, а именно: сверхурочка (ВО1010, 1011), прошу указать причины такой ошибки, чтобы я перенаправила заказчику для исправления. [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [49]:
categories = ["comp.graphics","sci.space","rec.sport.baseball"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

In [20]:
#ИЗМЕНЕНИЯ НЕЙРОНКИ
df = pd.read_excel('05.2023.xls', index_col = 0)
copy_df = df
copy_df = copy_df.reset_index(drop = True)
data = copy_df['Описание']

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data)

In [21]:
train_data = train_data.reset_index(drop = True)
test_data = test_data.reset_index(drop = True)

In [22]:
from nltk.tokenize import RegexpTokenizer
vocab = Counter()
tokenizer = RegexpTokenizer(r'\w+')
for i in range(len(train_data)):
    sent = train_data[i]
    for word in tokenizer.tokenize(sent):
        vocab[word.lower()]+=1

In [56]:
vocab

Counter({'solman': 2052,
         'sm_back_usr': 2010,
         '05': 4372,
         '2023': 4349,
         '09': 536,
         '56': 163,
         '29': 368,
         'для': 941,
         'люкшина': 9,
         'д': 338,
         'е': 89,
         'изменение': 74,
         'оплаты': 69,
         'после': 118,
         'индексации': 42,
         'консультация': 84,
         'по': 3333,
         'нормативной': 4,
         'оплате': 33,
         'шт': 7,
         'должность': 71,
         '30683179': 1,
         '19': 299,
         '14': 423,
         '24': 367,
         '03': 453,
         'добрый': 1942,
         'день': 2015,
         'подскажите': 185,
         'пожалуйста': 419,
         'в': 4108,
         '2014': 8,
         'году': 17,
         'программе': 36,
         'sap': 1367,
         'hr': 173,
         '2': 384,
         'был': 131,
         'настроен': 2,
         'отчет': 175,
         'расчета': 201,
         'премии': 44,
         'за': 751,
         'преданность': 1

In [6]:
vocab = Counter()

for text in newsgroups_train.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1
        
for text in newsgroups_test.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

In [23]:
total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i
        
    return word2index

word2index = get_word_2_index(vocab)

print("Index of the word 'the':",word2index['the'])

Index of the word 'the': 997


In [28]:
def text_to_vector(text):
    layer = np.zeros(total_words,dtype=float)
    for word in tokenizer.tokenize(text): #ИЗМЕНЕНИЕ
        layer[word2index[word.lower()]] += 1
        
    return layer

def category_to_vector(category):
    y = np.zeros((3),dtype=float)
    if category == 0:
        y[0] = 1.
    elif category == 1:
        y[1] = 1.
    else:
        y[2] = 1.
        
    return y

In [9]:
def get_batch(df,i,batch_size):
    batches = []
    results = []
    texts = df[i][i*batch_size:i*batch_size+batch_size] #ИЗМЕНЕНИЕ
    categories = df[i][i*batch_size:i*batch_size+batch_size] #ИЗМЕНЕНИЕ
    
    for text in texts:
        layer = text_to_vector(text) 
        batches.append(layer)
        
    for category in categories:
        y = category_to_vector(category)
        results.append(y)  
     
    return np.array(batches),np.array(results)

In [11]:
# Parameters
learning_rate = 0.01
training_epochs = len(train_data) #ИЗМЕНЕНИЕ
batch_size = 150
display_step = 1

# Network Parameters
n_hidden_1 = 100      # 1st layer number of features
n_hidden_2 = 100       # 2nd layer number of features
n_input = total_words # Words in vocab
n_classes = 3         # Categories: graphics, sci.space and baseball

tf.compat.v1.disable_eager_execution()
input_tensor = tf.placeholder(tf.float32,[None, n_input],name="input")
output_tensor = tf.placeholder(tf.float32,[None, n_classes],name="output") 

In [12]:
def multilayer_perceptron(input_tensor, weights, biases):
    layer_1_multiplication = tf.matmul(input_tensor, weights['h1'])
    layer_1_addition = tf.add(layer_1_multiplication, biases['b1'])
    layer_1 = tf.nn.relu(layer_1_addition)
    
    # Hidden layer with RELU activation
    layer_2_multiplication = tf.matmul(layer_1, weights['h2'])
    layer_2_addition = tf.add(layer_2_multiplication, biases['b2'])
    layer_2 = tf.nn.relu(layer_2_addition)
    
    # Output layer 
    out_layer_multiplication = tf.matmul(layer_2, weights['out'])
    out_layer_addition = out_layer_multiplication + biases['out']
    
    return out_layer_addition

In [13]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
prediction = multilayer_perceptron(input_tensor, weights, biases)

# Define loss and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Initializing the variables
init = tf.global_variables_initializer()

# [NEW] Add ops to save and restore all the variables
saver = tf.train.Saver()

2926

In [37]:
# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(train_data[epoch])/batch_size) #ИЗМЕНЕНИЕ
        # Loop over all batches
        for i in range(total_batch):
            batch_x,batch_y = get_batch(train_data,i,batch_size) #ИЗМЕНЕНИЕ
            # Run optimization op (backprop) and cost op (to get loss value)
            c,_ = sess.run([loss,optimizer], feed_dict={input_tensor: batch_x,output_tensor:batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "loss=", \
                "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(output_tensor, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    total_test_data = len(newsgroups_test.target) #ИЗМЕНЕНИЕ
    batch_x_test,batch_y_test = get_batch(test_data,0,total_test_data) # ИЗМЕНЕНИЕ
    print("Accuracy:", accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))
    
    # [NEW] Save the variables to disk
    save_path = saver.save(sess, "/tmp/model.ckpt")
    print("Model saved in path: %s" % save_path)

Epoch: 0001 loss= 24.928164959
Epoch: 0002 loss= 0.495009672
Epoch: 0003 loss= 0.000000000
Epoch: 0004 loss= 0.000000000
Epoch: 0005 loss= 0.000000000
Epoch: 0006 loss= 0.000000000
Epoch: 0007 loss= 0.000000000
Epoch: 0008 loss= 0.000000000
Epoch: 0009 loss= 0.000000000
Epoch: 0010 loss= 0.000000000
Epoch: 0011 loss= 0.000000000
Epoch: 0012 loss= 0.000000000
Epoch: 0013 loss= 0.000000000
Epoch: 0014 loss= 0.000000000


ValueError: Cannot feed value of shape (0,) for Tensor output_1:0, which has shape (None, 3)