In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the English training set. NaN cells (in any column) become empty
# strings, so rows without text are kept as "" rather than removed.
df = pd.read_csv("train_input_data_eng.csv").fillna("")  # maybe drop?
print(len(df))

# Names of the feature column and the target column.
x_columns = "text"
y_columns = "label_id"
x = df[x_columns]
y = df[y_columns]

# Stratified split with a fixed seed so the same rows land in each split
# on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=4,
)

# Re-attach each split's labels to its texts for inspection.
training = pd.concat([x_train, y_train], axis=1)
testing = pd.concat([x_test, y_test], axis=1)

9014


In [2]:
# Display the training split (text column next to its label_id).
training

Unnamed: 0,text,label_id
39,,3
6956,invoice march nob tobacco industry national co...,1
1953,political camfaign contribution request date f...,1
3862,ace tlg lotresults fepdet crof verr tark esx p...,16
1833,rjr s june sefeet nobyl dcclassi fi-afian to h...,7
...,...,...
2122,ealshedeilier rqdrescrlellye hanufacturing cer...,17
3934,newman-stein inc broadway job new york ny augu...,13
6669,mar from slper arketing inc rjrpronotions clie...,6
8931,principal investigatoaprogran dikecidy arthur ...,14


In [3]:
# Display the held-out split (text column next to its label_id).
testing

Unnamed: 0,text,label_id
7453,sorillard memorandum max to mr goldbrenner fro...,8
6972,e n w h mhl k hh m l j ul n m l a ve g w ie h ...,6
1832,kantuety joumel o commets wanegrom dstt l tx g...,9
6385,auz leon g cooperman chairman omege advisors i...,14
7117,i h ie gglz l,9
...,...,...
8833,clealtk qec sacramento bee weds smoking and vi...,9
2276,,3
8088,borriston laboratories inc sponsor lorillard i...,16
1946,biographical sketch nane romaine r bruns posit...,14


# Tokenization

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Raw training texts as a plain Python list; this is what the tokenizer is fitted on.
training_sentences = x_train.to_list()

# Number of label classes: width of the one-hot label rows and of the
# softmax output layer.
LABEL_N = 21

# First approach (earlier hyperparameters, kept commented out for reference)
# LEARNING_RATE = 0.01
# VOCAB_SIZE = 10_000
# EMBEDDING_DIM = 16
# DENSE_LAYER_COUNT = 128
# max_length = 600
# trunc_type='post'
# padding_type='post'

def pad(sequences, maxlen=None, padding=None, truncating=None):
    """Pad or truncate token-id sequences to one common length.

    Args:
        sequences: List of integer sequences, e.g. the result of
            ``tokenizer.texts_to_sequences``.
        maxlen: Target sequence length. ``None`` (default) uses the
            notebook-level ``max_length``.
        padding: ``'pre'`` or ``'post'``. ``None`` (default) uses the
            notebook-level ``padding_type``.
        truncating: ``'pre'`` or ``'post'``. ``None`` (default) uses the
            notebook-level ``trunc_type``.

    Returns:
        The array produced by ``pad_sequences``: one row per input
        sequence, each of the chosen length.
    """
    # The notebook-level settings are looked up at call time, so they may be
    # defined (or changed) after this function; explicit arguments win.
    return pad_sequences(
        sequences,
        maxlen=max_length if maxlen is None else maxlen,
        padding=padding_type if padding is None else padding,
        truncating=trunc_type if truncating is None else truncating,
    )

# Optimized
# --- Model / optimiser settings ---
LEARNING_RATE = 0.002
EMBEDDING_DIM = 64
DENSE_LAYER_COUNT = 128  # units in the hidden dense layer

# --- Text encoding settings ---
VOCAB_SIZE = 2_000  # vocabulary cap for the tokenizer and the embedding table
max_length = 400  # every sequence is padded/truncated to this many tokens
padding_type = "post"
trunc_type = "post"

# The vocabulary is built from the training texts only; words outside it are
# encoded with the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(training_sentences)

In [6]:
# Encode the training texts as integer token ids, then pad/truncate them to
# a fixed length with pad().
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad(training_sequences)
# training_padded

In [7]:
# Encode the held-out texts with the tokenizer that was fitted on the
# training texts, then pad/truncate them the same way as the training set.
testing_sentences = x_test.to_list()
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad(testing_sequences)
# testing_padded

In [8]:
import numpy as np
def create_all_categories_columns(input_array: np.ndarray, n_classes=None) -> np.ndarray:
    """One-hot encode a 1-D vector of integer class ids.

    Args:
        input_array: 1-D array-like of integer labels, each in
            ``[0, n_classes)``.
        n_classes: Number of columns in the result. ``None`` (default)
            uses the notebook-level ``LABEL_N``.

    Returns:
        Integer array of shape ``(len(input_array), n_classes)`` with a
        single 1 per row, in the column given by that row's label.

    Raises:
        ValueError: If ``input_array`` is not one-dimensional.
        TypeError: If the labels are not of an integer dtype.
        IndexError: If any label lies outside ``[0, n_classes)``.
    """
    if n_classes is None:
        n_classes = LABEL_N

    labels = np.asarray(input_array)
    # An (n, 1) column vector would broadcast against the row indices and
    # mark every row with every label, so anything but 1-D is rejected.
    if labels.ndim != 1:
        raise ValueError(
            f"expected a 1-D array of labels, got shape {labels.shape}"
        )

    result = np.zeros(shape=(labels.shape[0], n_classes), dtype=int)
    if labels.size == 0:
        # Nothing to mark; also avoids indexing with an empty float array.
        return result

    if labels.dtype.kind not in "iu":
        raise TypeError(
            f"labels must have an integer dtype, got {labels.dtype}"
        )

    # Negative labels would silently wrap around to the last columns under
    # NumPy indexing, so they are rejected together with too-large labels.
    if labels.min() < 0 or labels.max() >= n_classes:
        raise IndexError(
            f"labels must lie in [0, {n_classes}); "
            f"got values from {labels.min()} to {labels.max()}"
        )

    result[np.arange(labels.shape[0]), labels] = 1
    return result

In [9]:
# Features: make sure both padded splits are NumPy arrays.
training_padded = np.array(training_padded)
testing_padded = np.array(testing_padded)

# Targets: one-hot encode the integer label ids of each split.
training_labels = create_all_categories_columns(np.array(y_train))
testing_labels = create_all_categories_columns(np.array(y_test))

In [10]:
# Seed TensorFlow's global RNG before any weights are created so that
# re-running the notebook from a fresh kernel is repeatable; the data split
# is already seeded via random_state=4, so the same value is used here.
MODEL_SEED = 4
tf.random.set_seed(MODEL_SEED)

# Bag-of-embeddings classifier: token ids -> embeddings -> average over the
# sequence -> hidden dense layer -> softmax over LABEL_N classes.
# NOTE(review): the embedding is created without mask_zero=True, so the zero
# padding added by pad() is averaged in with the real tokens — confirm this
# is intended.
model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(DENSE_LAYER_COUNT, activation="relu"),
        tf.keras.layers.Dense(LABEL_N, activation='softmax'),
])
adam = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
# categorical_crossentropy matches the one-hot label matrices built by
# create_all_categories_columns().
model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          128000    
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 21)                2709      
                                                                 
Total params: 139,029
Trainable params: 139,029
Non-trainable params: 0
_________________________________________________________________


In [11]:
num_epochs = 25

# The held-out split is supplied as validation data, so the val_* metrics in
# the log are measured on testing_padded / testing_labels.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/25
212/212 - 2s - loss: 2.5490 - accuracy: 0.1737 - val_loss: 2.3870 - val_accuracy: 0.2094 - 2s/epoch - 11ms/step
Epoch 2/25
212/212 - 1s - loss: 2.2161 - accuracy: 0.2735 - val_loss: 2.0777 - val_accuracy: 0.3381 - 1s/epoch - 5ms/step
Epoch 3/25
212/212 - 1s - loss: 1.9355 - accuracy: 0.3753 - val_loss: 1.8412 - val_accuracy: 0.4401 - 1s/epoch - 6ms/step
Epoch 4/25
212/212 - 1s - loss: 1.6957 - accuracy: 0.4799 - val_loss: 1.6394 - val_accuracy: 0.4938 - 1s/epoch - 5ms/step
Epoch 5/25
212/212 - 1s - loss: 1.4864 - accuracy: 0.5533 - val_loss: 1.5602 - val_accuracy: 0.5089 - 1s/epoch - 5ms/step
Epoch 6/25
212/212 - 1s - loss: 1.3431 - accuracy: 0.5957 - val_loss: 1.4183 - val_accuracy: 0.5856 - 1s/epoch - 6ms/step
Epoch 7/25
212/212 - 1s - loss: 1.2298 - accuracy: 0.6278 - val_loss: 1.3823 - val_accuracy: 0.5768 - 1s/epoch - 5ms/step
Epoch 8/25
212/212 - 1s - loss: 1.1398 - accuracy: 0.6571 - val_loss: 1.3181 - val_accuracy: 0.5843 - 1s/epoch - 6ms/step
Epoch 9/25
212/212 - 1s

In [12]:
def get_model_prediction(text: str) -> int:
    """Return the predicted class index for a single piece of text.

    The text is encoded with the notebook's fitted ``tokenizer``, padded
    with ``pad`` and passed through ``model``.

    Args:
        text: The raw text to classify.

    Returns:
        Index of the highest-scoring output unit (first one on ties), as a
        plain Python ``int``.
    """
    # pad() already returns a 2-D array holding a batch of one sequence, so
    # no additional np.array() wrapping is needed.
    batch = pad(tokenizer.texts_to_sequences([text]))
    scores = model.predict(batch)[0]
    # np.argmax replaces the list conversion + index(max(...)) scan; int()
    # keeps the return type a built-in int as before.
    return int(np.argmax(scores))


In [13]:
# Persist the trained model under the path "model_eng".
# NOTE(review): the recorded cell output mentions "model.eng", which does not
# match this path — the output looks stale; re-run the cell to confirm.
model.save("model_eng")



INFO:tensorflow:Assets written to: model.eng\assets


INFO:tensorflow:Assets written to: model.eng\assets


In [14]:
import pickle

# Persist the fitted tokenizer next to the saved model. The context manager
# closes the file even if pickling fails, so the data is flushed and the
# handle is not leaked.
with open('tokenizer_eng.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)