### Read training, dev and unlabeled test data

The following starter code (Python 3) shows how to read the labeled training and dev ciphertext, and the unlabeled test ciphertext, into lists.

In [1]:
# Empty containers for the three data splits; they are populated by the
# file-reading cells that follow.
train = []
dev = []
test = []

In [2]:
# Load the labeled training set: every line is "<label>\t<ciphertext sentence>".
# The list is emptied first so that re-running this cell does not append the
# same rows a second time.
train.clear()
# `with` closes the file handle as soon as reading finishes instead of leaving
# it open until garbage collection.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    for line in train_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1), fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        train.append(fields)

In [3]:
# Load the labeled dev set: every line is "<label>\t<ciphertext sentence>".
# The list is emptied first so that re-running this cell does not append the
# same rows a second time.
dev.clear()
# `with` closes the file handle as soon as reading finishes instead of leaving
# it open until garbage collection.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    for line in dev_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1), fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        dev.append(fields)

#### Unlike 'train' and 'dev', which are both lists of [label, sentence] pairs, 'test' is just a list of sentences.

In [4]:
# Load the unlabeled test set: every line is one ciphertext sentence.
# The list is emptied first so that re-running this cell does not append the
# same sentences a second time.
test.clear()
# `with` closes the file handle as soon as reading finishes instead of leaving
# it open until garbage collection.
with open('./test_enc_unlabeled.tsv', encoding='utf-8') as test_file:
    for line in test_file:
        test.append(line.rstrip('\n\r'))

#### You can split every sentence into a list of words on white space.

In [5]:
# Show the first two records of each split as a quick check of the parsing.
for preview_split in (train, dev, test):
    print(preview_split[:2])

[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .']]
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .']]
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .']


In [6]:
# Tokenize each sentence on single spaces. Labeled rows keep their label in
# position 0 and get the token list in position 1.
train_split, dev_split = [], []
for labeled_rows, tokenized_rows in ((train, train_split), (dev, dev_split)):
    for row in labeled_rows:
        tokenized_rows.append([row[0], row[1].split(' ')])

# The test split has no labels, so each entry is just the token list.
test_split = [sentence.split(' ') for sentence in test]

In [7]:
# Show the first two tokenized records of each split.
for preview_split in (train_split, dev_split, test_split):
    print(preview_split[:2])

[[0, ['lkêcê', 'yoúc', 'cêêö', 'y#êjl', 'lw', 'mówám', 'Újám', 'j', 'Úêê#', 'ütlk', 'Úol', 'lkêú', 'z#ê', 'ctöé8ú', 'ówl', 'xoóóú', 'éê#xw#öê#c', '.']], [0, ['6êcétlê', 'jolêot8', 'zc', 'éê#xw#öjóáê', ',', 'tl', 'zc', 'j', '#jlkê#', '8tcl8êcc', 'jöÚ8ê', '6wüó', 'lkê', 'öt668ê', 'wx', 'lkê', '#wj6', ',', 'ükê#ê', 'lkê', 'lkêöjltá', 't#wótêc', 'j#ê', 'lww', 'wÚ2twoc', 'jó6', 'lkê', 'cê+oj8', 'éw8tltác', 'lww', 'cöoy', '.']]]
[[1, ['ów8jó', 'Ú#j2ê8ú', 'l#êj6c', 'ükê#ê', 'xêü', 'jöê#tájó', 'xt8öc', '6j#ê', 'lw', '6ê82ê', '77', 'tólw', 'lkê', 'üw#86', 'wx', 'jöÚt2j8êóáê', 'jó6', 'jöÚtyotlú', '<<<']], [0, ['ê2êó', 'öo#ékú', 'zc', 'ê+éê#l', 'áwötá', 'ltötóy', 'jó6', 'xjöê6', 'ákj#tcöj', 'áj', 'ózl', '#êcáoê', 'lktc', 'êxxw#l', '.']]]
[['j', '6t6jáltá', 'jó6', '6o88', '6wáoöêólj#ú', 'y8w#txútóy', 'cwxlüj#ê', 'jój#ákú', '.'], ['ówlktóy', 'cltámc', ',', '#êj88ú', ',', 'ê+áêél', 'j', '8tóyê#tóy', 'á#êêétóêcc', 'wóê', 'xêê8c', 'x#wö', 'Úêtóy', '6#jyyê6', 'lk#woyk', 'j', 'cj6', ',', 'cw#6t6', 'oót2

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes here. We will use your predictions on the unlabeled test data for grading, and will check this part to understand how your method produced the predictions.

In [27]:
# import required library
import numpy as np
np.random.seed(0)
from gensim.models import Word2Vec
from keras.models import Sequential
from collections import defaultdict
from tensorflow.keras import optimizers
from keras.initializers import Constant
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers import LSTM, Dropout, Dense, Flatten, Bidirectional



In [28]:
# Train Word2Vec (skip-gram, sg=1) on the tokenized training sentences to get
# a 100-dimensional vector for every training token (min_count=1 keeps all).
train_sentences = [tokens for _, tokens in train_split]

# Longest training sentence; its length is later used as the padding length.
maxList = max(train_sentences, key=len)
max_sentence_length = len(maxList)

word2Vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,
    window=10,
    min_count=1,
    sg=1,
)
word2Vec_weights = word2Vec_model.wv.vectors
word2Vec_vocab_size, word2Vec_embedding_size = word2Vec_weights.shape
vocabulary = list(word2Vec_model.wv.index_to_key)

# Plain dict from token to its trained vector, for lookups by word.
word2Vec_dict = {word: word2Vec_model.wv.get_vector(word) for word in vocabulary}

In [29]:
# Separate token lists (X_*) from labels (Y_*); the test split has no labels.
X_train = [tokens for _, tokens in train_split]
Y_train = [label for label, _ in train_split]
X_dev = [tokens for _, tokens in dev_split]
Y_dev = [label for label, _ in dev_split]
X_test = test_split

# Report the size and the first two sentences of the dev and test inputs.
for preview_split in (X_dev, X_test):
    print(len(preview_split))
    print(preview_split[:2])


2027
[['ów8jó', 'Ú#j2ê8ú', 'l#êj6c', 'ükê#ê', 'xêü', 'jöê#tájó', 'xt8öc', '6j#ê', 'lw', '6ê82ê', '77', 'tólw', 'lkê', 'üw#86', 'wx', 'jöÚt2j8êóáê', 'jó6', 'jöÚtyotlú', '<<<'], ['ê2êó', 'öo#ékú', 'zc', 'ê+éê#l', 'áwötá', 'ltötóy', 'jó6', 'xjöê6', 'ákj#tcöj', 'áj', 'ózl', '#êcáoê', 'lktc', 'êxxw#l', '.']]
2028
[['j', '6t6jáltá', 'jó6', '6o88', '6wáoöêólj#ú', 'y8w#txútóy', 'cwxlüj#ê', 'jój#ákú', '.'], ['ówlktóy', 'cltámc', ',', '#êj88ú', ',', 'ê+áêél', 'j', '8tóyê#tóy', 'á#êêétóêcc', 'wóê', 'xêê8c', 'x#wö', 'Úêtóy', '6#jyyê6', 'lk#woyk', 'j', 'cj6', ',', 'cw#6t6', 'oót2ê#cê', 'wx', 'yoóc', ',', '6#oyc', ',', 'j2j#táê', 'jó6', '6jöjyê6', '6#êjöc', '.']]


In [30]:
# data preprocessing
# Separate token lists (X_*) from labels (Y_*); the test split has no labels.
X_train = [tokens for _, tokens in train_split]
Y_train = [label for label, _ in train_split]
X_dev = [tokens for _, tokens in dev_split]
Y_dev = [label for label, _ in dev_split]
X_test = test_split

# Count how many times each token occurs in the training sentences.
token_counts = defaultdict(int)
for sentence in X_train:
    for token in sentence:
        token_counts[token] += 1

# Keep only tokens whose training frequency lies in [min_freq, max_freq].
min_freq, max_freq = 1, 8000
train_dict = {
    token: count
    for token, count in token_counts.items()
    if min_freq <= count <= max_freq
}

# Rank the kept tokens by descending frequency, starting at 1, so that index 0
# is never assigned to a real token.
ranked_tokens = sorted(train_dict, key=train_dict.get, reverse=True)
rank_words = {token: rank for rank, token in enumerate(ranked_tokens, start=1)}
vocabulary_size = len(rank_words)

In [31]:
# Compare the frequency-filtered vocabulary size with the Word2Vec vocabulary size.
for size in (len(train_dict), len(vocabulary)):
    print(size)
print()

20854
20860



In [32]:
# create embedding matrix
# One row per ranked token plus row 0, which stays all zeros. Each token's
# Word2Vec vector is written to the row given by its rank in rank_words.
embedding_matrix = np.zeros(shape=(vocabulary_size + 1, word2Vec_embedding_size))
for word, rank in rank_words.items():
    vector = word2Vec_dict.get(word)
    if vector is None:
        # No trained vector for this token: leave its row as zeros.
        continue
    embedding_matrix[rank] = vector

In [33]:
# Sanity check: expected shape is (vocabulary_size + 1, word2Vec_embedding_size).
print(embedding_matrix.shape)

(20855, 100)


In [34]:
# data encoding for training
def _encode_sentences(sentences, word_to_index, unknown_index=0):
    """Convert tokenized sentences into lists of integer word indices.

    Args:
        sentences: iterable of token lists.
        word_to_index: mapping from token to its integer index.
        unknown_index: index used for tokens missing from ``word_to_index``.

    Returns:
        A list with one list of indices per input sentence, in the same order.
    """
    return [
        [word_to_index.get(word, unknown_index) for word in sentence]
        for sentence in sentences
    ]


# All three splits are encoded with the same training-derived rank table;
# tokens that are not in rank_words fall back to index 0.
X_train_encoded = _encode_sentences(X_train, rank_words)
X_dev_encoded = _encode_sentences(X_dev, rank_words)
X_test_encoded = _encode_sentences(X_test, rank_words)



In [35]:
# data padding
# NOTE(review): max_sentence_length is derived from the training sentences
# only; confirm how longer dev/test sentences should be handled by pad_sequences.
def _pad_to_max_length(encoded_sentences):
    """Run pad_sequences on the encoded sentences with maxlen=max_sentence_length."""
    return sequence.pad_sequences(encoded_sentences, maxlen=max_sentence_length)


X_train_pad, X_dev_pad, X_test_pad = map(
    _pad_to_max_length, (X_train_encoded, X_dev_encoded, X_test_encoded)
)

# Array views of the padded inputs and of the label lists.
X_train_sample, X_dev_sample, X_test_sample = map(
    np.asarray, (X_train_pad, X_dev_pad, X_test_pad)
)
Y_train_sample, Y_dev_sample = np.asarray(Y_train), np.asarray(Y_dev)

# Train and dev stacked together, available for a final fit on all labeled data.
X_total_sample = np.concatenate([X_train_sample, X_dev_sample], axis=0)
Y_total_sample = np.concatenate([Y_train_sample, Y_dev_sample], axis=0)

In [36]:
# create the model
# Architecture: embedding initialised from the Word2Vec matrix -> dropout ->
# bidirectional LSTM (64 units per direction) -> dropout -> single sigmoid unit.
# NOTE(review): the import cell seeds only NumPy; confirm whether the Keras
# backend also needs seeding for run-to-run reproducibility.
embedding_layer = Embedding(
    input_dim=vocabulary_size + 1,
    output_dim=word2Vec_embedding_size,
    input_length=max_sentence_length,
    embeddings_initializer=Constant(embedding_matrix),
)
model = Sequential([
    embedding_layer,
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train on the training split and report dev-set metrics after every epoch.
model.fit(
    X_train_sample,
    Y_train_sample,
    validation_data=(X_dev_sample, Y_dev_sample),
    epochs=4,
    batch_size=16,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x12d3cdd50>

### Output Prediction Result File

In [37]:
# `results` must end up as a list of 2028 values, each 0 or 1.
# Predict a probability for every test sentence and threshold it at 0.5.
result_probabilities = model.predict(X_test_sample)
results = (result_probabilities.ravel() >= 0.5).astype(int).tolist()

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [38]:
# `results` must hold exactly one prediction for each of the 2028 test cases
# read from test_enc_unlabeled.tsv.
EXPECTED_NUM_PREDICTIONS = 2028
assert len(results) == EXPECTED_NUM_PREDICTIONS

In [39]:
# Coerce every prediction to a plain integer (0 or 1, not a float), then
# print how many test cases were assigned to each class.
results = list(map(int, results))
zero, one = results.count(0), results.count(1)
print(zero)
print(one)

1023
1005


In [155]:
# Write the predictions, one per line, to 'upload_predictions1.txt'.
# NOTE(review): the starter comment named 'upload_predictions.txt'; confirm
# which filename is expected before uploading.
with open('upload_predictions1.txt', 'w', encoding = 'utf-8') as prediction_file:
    prediction_file.writelines(f"{prediction}\n" for prediction in results)