In [2]:
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from keras.layers import Embedding, LSTM, Dense, Flatten, Reshape
from keras.models import Sequential, load_model

import os
import numpy as np
import pandas as pd

In [3]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

MAX_WORDS = 10000
EMBEDDING_DIM = 300
NUM_CATEGORIES = 10

In [4]:
train_dir = "data/train/"
test_dir = "data/test/"

train_data = []
train_labels = []
test_data = []
test_labels = []

for filename in os.listdir(train_dir):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(train_dir, filename)
    with open(filepath) as f:
        for line in f:
            train_data.append(line.strip())
            train_labels.append(filename[:-4])

for filename in os.listdir(test_dir):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(test_dir, filename)
    with open(filepath) as f:
        for line in f:
            test_data.append(line.strip())
            test_labels.append(filename[:-4])

train_data = pd.DataFrame({'text': train_data, 'label': train_labels})
test_data = pd.DataFrame({'text': test_data, 'label': test_labels})

In [5]:
x_train = np.array(train_data.iloc[:, 0], dtype=object).reshape(-1, 1)
x_test = np.array(test_data.iloc[:, 0], dtype=object).reshape(-1, 1)

In [10]:
le = LabelEncoder()
train_labels = le.fit_transform(train_labels)
test_labels = le.transform(test_labels)

y_train = np.array(train_labels).reshape(-1, 1).flatten()
y_test = np.array(test_labels).reshape(-1, 1).flatten()

In [7]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(8200, 1)
(8200, 1)
(1900, 1)
(1900, 1)


In [50]:
encoder = OneHotEncoder(sparse=False)

encoder.fit([[c] for c in ''.join(x_train.flatten())])

x_train_encoded = np.array([encoder.transform([[c] for c in s]) for s in x_train.flatten()]).reshape(-1, 1)
x_test_encoded = np.array([encoder.transform([[c] for c in s]) for s in x_test.flatten()]).reshape(-1, 1)

  x_train_encoded = np.array([encoder.transform([[c] for c in s]) for s in x_train.flatten()]).reshape(-1, 1)
  x_test_encoded = np.array([encoder.transform([[c] for c in s]) for s in x_test.flatten()]).reshape(-1, 1)


In [48]:
print(x_train_encoded.shape)
print(y_train.shape)
print(x_test_encoded.shape)
print(y_test.shape)

print(x_test_encoded)

(8200, 1)
(8200,)
(1900, 1)
(1900,)
[[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [27]:
model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, input_shape=(1,)))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Flatten())
model.add(Dense(units=NUM_CATEGORIES, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1, 300)            3000000   
                                                                 
 lstm_3 (LSTM)               (None, 64)                93440     
                                                                 
 flatten_1 (Flatten)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 10)                650       
                                                                 
Total params: 3,094,090
Trainable params: 3,094,090
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
history = model.fit(x_train_encoded, y_train, validation_data=(x_test_encoded, y_test), epochs=10, batch_size=32)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [None]:
model.save('model')

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
loaded_model = load_model('model')
print(x_test)
prediction = model.predict(np.array(['y=123x',]))

In [None]:
category_mapping = {0: 'constant', 1: 'linear', 2: 'quadratic', 3: 'cubic', 4: 'polynomial', 5: 'exponential', 6: 'root', 7: 'rational', 8: 'power', 9: 'logarithmic'}
print(category_mapping[np.argmax(prediction[0])])