## Start of the notebook

### Read the txt files

In [2]:
def _read_lines(path):
    """Read the text file at `path` and return its content split on newlines.

    The file is opened in a `with` block so the handle is closed as soon as
    the content has been read, instead of being left open.
    """
    with open(path) as text_file:
        return text_file.read().split('\n')


x_train_txt = _read_lines('../xtrain_obfuscated.txt')
x_test_txt = _read_lines('../xtest_obfuscated.txt')
y_train_txt = _read_lines('../ytrain.txt')

# Sentences and labels are paired by line number, so both files must
# contain the same number of lines.
assert len(x_train_txt) == len(y_train_txt)


In [3]:
# Drop potential empty lines (such as the trailing '' that split('\n') yields
# when a file ends with a newline)
x_train_txt_cleaned, y_train_txt_cleaned = [], []

for idx, sentence in enumerate(x_train_txt):
    label = y_train_txt[idx]
    if sentence != '' and label != '':
        x_train_txt_cleaned.append(sentence)
        y_train_txt_cleaned.append(label)
    else:
        # An empty entry is only acceptable when it is empty in both files
        assert sentence == label

# The test set has no labels, so empty lines are simply filtered out
x_test_txt_cleaned = [sentence for sentence in x_test_txt if sentence != '']

summary = "length of xtrain and ytrain are respectively: {} and {}, and length of xtest is {}"
print(summary.format(len(x_train_txt_cleaned),
                     len(y_train_txt_cleaned),
                     len(x_test_txt_cleaned)))

assert len(x_train_txt_cleaned) == len(y_train_txt_cleaned)


length of xtrain and ytrain are respectively: 32513 and 32513, and length of xtest is 3000


### Check whether the obfuscated text contains any character beyond the 26 letters (which could help with de-obfuscation)


In [4]:
import collections
# Count how often each character occurs across all training sentences
dictionary_xtrain = collections.Counter()
for sentence in x_train_txt_cleaned:
    dictionary_xtrain.update(sentence)

print(dictionary_xtrain, "\n\nthe length of dictonary is", len(dictionary_xtrain))

# The distinct characters, in the order they were first seen
dictionary_xtrain_list = list(dictionary_xtrain)


Counter({'u': 1744926, 'h': 1239246, 'm': 1118878, 'v': 1095970, 'e': 951258, 'l': 875856, 'w': 861268, 't': 726263, 'i': 618660, 'p': 538704, 'a': 499071, 'k': 477126, 'q': 441865, 'n': 430088, 's': 367312, 'r': 315292, 'g': 247287, 'z': 217798, 'd': 137797, 'f': 137040, 'y': 129408, 'x': 121384, 'c': 59054, 'o': 31849, 'j': 29340, 'b': 27788}) 

the length of dictonary is 26


### Divide the training data to training and validation

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled data for validation.
# - random_state makes the split reproducible across kernel restarts.
# - stratify keeps the class proportions the same in both parts, which matters
#   because the classes are imbalanced.
# NOTE: this cell rebinds x_train_txt_cleaned to the training part only, so it
# must be run exactly once per kernel session.
x_train_txt_cleaned, x_val_txt_cleaned, y_train, y_val = train_test_split(
    x_train_txt_cleaned,
    y_train_txt_cleaned,
    test_size=0.3,
    random_state=42,
    stratify=y_train_txt_cleaned,
)


### The text uses exactly 26 distinct characters, so there is no additional letter

## Now to prepare data for training

In [6]:
#Here I replace the characters with integers (in fact the index from the alphabet, but it can be anything else)
import string

def integarize_characters(x_txt_cleaned):
    """Encode each string as a list of 1-based alphabet positions.

    Every character is replaced by its index in `string.ascii_lowercase`
    plus one ('a' -> 1, ..., 'z' -> 26), which leaves 0 unused by the encoding.

    Parameters
    ----------
    x_txt_cleaned : list of str
        Strings made of lowercase ASCII letters.

    Returns
    -------
    list of list of int
        One list of integer codes per input string, in the same order.

    Raises
    ------
    ValueError
        If a character is not a lowercase ASCII letter.
    """
    alphabet = string.ascii_lowercase
    return [[alphabet.index(char) + 1 for char in text] for text in x_txt_cleaned]

# Encode the training, validation and test texts with the same mapping
x_train_list, x_val_list, x_test_list = [
    integarize_characters(texts)
    for texts in (x_train_txt_cleaned, x_val_txt_cleaned, x_test_txt_cleaned)
]


In [7]:
#Here I am converting to numpy array and padding zeros so that we have equal lengths
import numpy as np

#max_length is implemented because x_train and test should be the same, so test should adapt to train
def pad_zeros_to_numpy(input, max_length=''):
    """Right-pad (or truncate) sequences with zeros into one 2-D array.

    Parameters
    ----------
    input : sequence of sequences of numbers
        The rows to pack. May be empty.
    max_length : int, optional
        Number of columns of the result. When left at its default ('') or
        given as None, the width is the length of the longest row in `input`.
        Rows longer than `max_length` are cut to their first `max_length`
        items; shorter rows are filled with zeros on the right.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(input), width) with the default dtype of np.zeros.
    """
    if max_length is None or max_length == '':
        # default=0 keeps an empty `input` from raising in max()
        width = max((len(seq) for seq in input), default=0)
    else:
        width = max_length

    x_padded = np.zeros([len(input), width])
    for row, seq in enumerate(input):
        # One rule covers both cases: copy as many items as fit into the row
        kept = min(len(seq), width)
        x_padded[row, :kept] = seq[:kept]
    return x_padded

# The training data determines the width; validation and test data are padded
# or truncated to that same width so all three arrays have equal column counts.
x_train = pad_zeros_to_numpy(x_train_list)
x_val = pad_zeros_to_numpy(x_val_list, x_train.shape[1])
x_test = pad_zeros_to_numpy(x_test_list, x_train.shape[1])


In [8]:
# Convert the labels from lists of strings to NumPy integer arrays.
# Both splits are converted so they have the same type before any further use.
y_train = np.array(y_train, dtype=np.int64)
y_val = np.array(y_val, dtype=np.int64)


In [9]:
# Check the balance of the dataset: number of training samples per class
classes, class_occur = np.unique(y_train, return_counts=True)
for label, count in zip(classes, class_occur):
    print(label, ':', count)
      

0 : 381
1 : 2384
2 : 1029
3 : 2822
4 : 1588
5 : 1632
6 : 3026
7 : 3587
8 : 2502
9 : 689
10 : 2123
11 : 996


## So this is an imbalanced dataset: the largest class (7) has 3587 samples, while the smallest class (0) has 381, i.e. roughly 1/10 of the largest class

### Now to convert the target to onehot encoded labels

In [10]:
from keras.utils import to_categorical

# Fix the number of classes explicitly. Without it, to_categorical infers the
# width from the largest label present in each array separately, so the two
# one-hot matrices could end up with different widths if one split happened to
# lack the highest class. 12 matches the model's output layer.
NUM_CLASSES = 12

y_train_onehot = to_categorical(y_train, num_classes=NUM_CLASSES)
y_val_onehot = to_categorical(y_val, num_classes=NUM_CLASSES)

Using TensorFlow backend.


### Go from simple to advanced: first 1D CNNs, then GRUs, then LSTMs (code is included only for the CNN to save space, as all of them were very slow to converge)

In [11]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, Flatten, GRU, SimpleRNN, RNN
from keras import optimizers
from keras.callbacks import Callback, ModelCheckpoint, TensorBoard, LearningRateScheduler
from keras import regularizers
import keras.backend as K


print('Loading data...')

batch_size = 256
weight_decay = 1  # L2 penalty factor applied to the kernel of the first dense layer
print(weight_decay)

# Character-level 1-D CNN: embedding -> four convolutions -> dense classifier
model = Sequential()
# 27 input codes: the letters are encoded as 1..26 and 0 is the padding value
model.add(Embedding(27, 15, input_length=x_train.shape[1]))
model.add(Conv1D(filters=10, kernel_size=30, strides=1, activation='relu'))
model.add(Conv1D(filters=20, kernel_size=5, strides=2, activation='relu'))
model.add(Conv1D(filters=30, kernel_size=5, strides=2, activation='relu'))
model.add(Conv1D(filters=50, kernel_size=5, strides=2, activation='relu'))

model.add(Flatten())
model.add(Dense(1000, kernel_regularizer=regularizers.l2(weight_decay), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(12, activation='softmax'))  # one output per class

for layer in model.layers:
    print("Layer: " + str(layer)[0:25] + ", Input shape: "+str(layer.input_shape)+". Output shape: "+str(layer.output_shape))

# The string 'adam' selects the Adam optimizer with Keras' default settings,
# so no separate optimizer object is needed.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Train...')
# validation_data is passed as a tuple (inputs, targets), the form documented by Keras
model.fit(x_train, y_train_onehot, batch_size=batch_size, epochs=500,
          validation_data=(x_val, y_val_onehot), shuffle=True,
          verbose=1)

Loading data...
1
Layer: <keras.layers.embeddings., Input shape: (None, 452). Output shape: (None, 452, 15)
Layer: <keras.layers.convolution, Input shape: (None, 452, 15). Output shape: (None, 423, 10)
Layer: <keras.layers.convolution, Input shape: (None, 423, 10). Output shape: (None, 210, 20)
Layer: <keras.layers.convolution, Input shape: (None, 210, 20). Output shape: (None, 103, 30)
Layer: <keras.layers.convolution, Input shape: (None, 103, 30). Output shape: (None, 50, 50)
Layer: <keras.layers.core.Flatte, Input shape: (None, 50, 50). Output shape: (None, 2500)
Layer: <keras.layers.core.Dense , Input shape: (None, 2500). Output shape: (None, 1000)
Layer: <keras.layers.core.Dropou, Input shape: (None, 1000). Output shape: (None, 1000)
Layer: <keras.layers.core.Dense , Input shape: (None, 1000). Output shape: (None, 12)
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22759 samples, validate on 9754 samples
Epoch 1/500

KeyboardInterrupt: 

# Now to use another approach for the embeddings

In [12]:
from keras.preprocessing import sequence, text

# Character-level tokenizer: every distinct character gets its own integer index
token = text.Tokenizer(num_words=None, char_level=True)
# Fit on the cleaned training split only, so that empty lines and validation
# sentences do not take part in building the vocabulary.
token.fit_on_texts(x_train_txt_cleaned)
char_index = token.word_index

#convert the training, validation and test texts to integer sequences
x_train_seq = token.texts_to_sequences(x_train_txt_cleaned)
x_val_seq = token.texts_to_sequences(x_val_txt_cleaned)
x_test_seq = token.texts_to_sequences(x_test_txt_cleaned)

# Labels as NumPy integer arrays
y_train = np.array(y_train, dtype=np.int64)
y_val = np.array(y_val, dtype=np.int64)

In [14]:
# Pad every sequence to the length of the longest training sequence
max_len = max(len(seq) for seq in x_train_seq)

x_train_pad = sequence.pad_sequences(x_train_seq, maxlen=max_len)
x_val_pad = sequence.pad_sequences(x_val_seq, maxlen=max_len)
x_test_pad = sequence.pad_sequences(x_test_seq, maxlen=max_len)


# Now download the GloVe vectors and use them for the character dictionary we have

In [18]:
from tqdm import tqdm
# Maps each character of our vocabulary to its GloVe vector (if GloVe has one)
char_vec_index = {}

# The file is assumed to be UTF-8 text with one token followed by its vector
# components per line. `with` guarantees the handle is closed even on errors.
with open('../glove.840B.300d.txt', encoding='utf-8') as glove_file:
    for line in tqdm(glove_file):
        values = line.split()
        # Skip blank lines and every token that is not one of our characters
        # before doing the (much more expensive) float conversion.
        if len(values) < 2 or values[0] not in char_index:
            continue
        try:
            char_vec = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # The remainder of the line is not purely numeric; ignore the line
            continue
        char_vec_index[values[0]] = char_vec

print("Added {} char vectors".format(len(char_vec_index)))

2196017it [00:41, 52371.28it/s]

Added 26 char vectors





## Create our embedding matrix

In [19]:
import numpy as np

char_index = token.word_index

if not char_vec_index:
    raise ValueError("No GloVe vectors were loaded; cannot build the embedding matrix")

# Take the vector size from the loaded vectors themselves rather than from a
# loop variable left over from the loading cell.
embedding_dim = len(next(iter(char_vec_index.values())))

# One row per character index plus row 0, which the loop below never assigns
# and which therefore stays all-zero.
embeddings = np.zeros((len(char_index) + 1, embedding_dim))
for char, index in char_index.items():
    char_embedding = char_vec_index.get(char)
    # Characters without a GloVe vector keep their all-zero row; assigning
    # None would fill the row with NaN instead.
    if char_embedding is not None:
        embeddings[index] = char_embedding
        

In [20]:
# Preview of the embedding matrix: as the last expression of the cell it is
# displayed as the cell output (NumPy abbreviates large arrays with "...")
embeddings


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.32642001, -0.047426  , -0.30886999, ...,  0.32141   ,
         0.16011   ,  0.58876997],
       [-0.29438001,  1.21449995, -0.69893003, ...,  0.081675  ,
        -0.12504999,  0.19485   ],
       ...,
       [-0.14567   , -0.68028003, -0.47473001, ..., -0.023379  ,
         0.13245   , -0.041758  ],
       [-0.048405  ,  0.40928   , -0.084347  , ..., -0.58788002,
        -0.09176   ,  0.14101   ],
       [-0.51170999, -0.10681   , -0.40689   , ..., -0.19557001,
         0.094377  ,  0.14286   ]])

### Now to train the model

In [22]:
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, Flatten, GRU, SimpleRNN, RNN
from keras import optimizers
from keras.callbacks import Callback, ModelCheckpoint, TensorBoard, LearningRateScheduler
from keras import regularizers
import keras.backend as K


print('Loading data...')

batch_size = 512

# Take the layer dimensions from the matrix itself so the embedding layer
# always matches the weights it is initialised with.
vocab_size, embedding_dim = embeddings.shape

model = Sequential()
# We use here the embeddings we got from glove for our characters and we make them trainable to
# fine-tune to our obfuscated data
model.add(Embedding(vocab_size, embedding_dim, weights=[embeddings],
                    trainable=True, input_length=x_train_pad.shape[1]))

# return_sequences=False: only the final LSTM output is passed to the classifier
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))
model.add(Dense(1000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(12, activation='softmax'))  # one output per class

for layer in model.layers:
    print("Layer: " + str(layer)[0:25] +
          ", Input shape: "+str(layer.input_shape)+". Output shape: "+str(layer.output_shape))

# The string 'adam' selects the Adam optimizer with Keras' default settings,
# so no separate optimizer object is needed.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Train...')
# validation_data is passed as a tuple (inputs, targets), the form documented by Keras
model.fit(x_train_pad, y_train_onehot, batch_size=batch_size, epochs=250,
          validation_data=(x_val_pad, y_val_onehot),
          verbose=1)


Loading data...
Layer: <keras.layers.embeddings., Input shape: (None, 452). Output shape: (None, 452, 300)
Layer: <keras.layers.recurrent.L, Input shape: (None, 452, 300). Output shape: (None, 100)
Layer: <keras.layers.core.Dense , Input shape: (None, 100). Output shape: (None, 1000)
Layer: <keras.layers.core.Dropou, Input shape: (None, 1000). Output shape: (None, 1000)
Layer: <keras.layers.core.Dense , Input shape: (None, 1000). Output shape: (None, 12)
Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22759 samples, validate on 9754 samples
Epoch 1/250
  512/22759 [..............................] - ETA: 2:51 - loss: 2.5037 - accuracy: 0.0840

KeyboardInterrupt: 

## Same model design but the LSTM is bidirectional

In [None]:
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, Flatten, GRU, SimpleRNN, RNN
from keras import optimizers
from keras.callbacks import Callback, ModelCheckpoint, TensorBoard, LearningRateScheduler
from keras import regularizers
import keras.backend as K


print('Loading data...')

batch_size = 512

# Take the layer dimensions from the matrix itself so the embedding layer
# always matches the weights it is initialised with.
vocab_size, embedding_dim = embeddings.shape

model = Sequential()
# GloVe-initialised character embeddings, left trainable for fine-tuning
model.add(Embedding(vocab_size, embedding_dim, weights=[embeddings],
                    trainable=True, input_length=x_train_pad.shape[1]))

# Same design as the plain LSTM model, but the LSTM is wrapped in Bidirectional
model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=False)))
model.add(Dense(1000, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(12, activation='softmax'))  # one output per class

for layer in model.layers:
    print("Layer: " + str(layer)[0:25] + ", Input shape: "+str(layer.input_shape)+". Output shape: "+str(layer.output_shape))

# The string 'adam' selects the Adam optimizer with Keras' default settings,
# so no separate optimizer object is needed.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Train...')
# validation_data is passed as a tuple (inputs, targets), the form documented by Keras
model.fit(x_train_pad, y_train_onehot, batch_size=batch_size, epochs=250,
          validation_data=(x_val_pad, y_val_onehot),
          verbose=1)


### Predictions

In [26]:
# Class probabilities for the padded test sequences. The array built by the
# padding cell is named x_test_pad; `xtest_pad` is not defined anywhere in the
# notebook and raises NameError on a fresh kernel.
predictions = model.predict(x_test_pad)


In [30]:
# Predicted class = index of the largest value in each row of `predictions`
ytest = np.argmax(predictions, axis=1)


### Save ytest in a file as requested

In [32]:
# Write one predicted label per line
with open("ytest.txt", "w") as txt_file:
    txt_file.writelines("{}".format(label) + "\n" for label in ytest)
        