### Read training, dev and unlabeled test data

The following starter code (Python 3) shows how to read the labeled training and dev ciphertext, and the unlabeled test ciphertext, into lists.

In [210]:
# Empty containers that the loading cells below fill in.
train = []
dev = []
test = []

In [211]:
# Load the labeled training set: one example per line, "<label>\t<ciphertext>".
# The list is rebuilt (not appended to) so re-running this cell cannot
# duplicate the examples.
train = []
# The context manager closes the file handle as soon as reading is done.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    for line in train_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        train.append(fields)
print(len(train))
# print(train[:3])

16220


In [212]:
# Load the labeled dev set: one example per line, "<label>\t<ciphertext>".
# The list is rebuilt (not appended to) so re-running this cell cannot
# duplicate the examples.
dev = []
# The context manager closes the file handle as soon as reading is done.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    for line in dev_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        dev.append(fields)
print(len(dev))
# print(dev[:3])

2027


#### Unlike 'train' and 'dev', which are both lists of [label, sentence] pairs, 'test' is just a list of sentences.

In [213]:
# Load the unlabeled test set: one ciphertext sentence per line, no label column.
# The list is rebuilt (not appended to) so re-running this cell cannot
# duplicate the sentences.
test = []
# The context manager closes the file handle as soon as reading is done.
with open('./test_enc_unlabeled.tsv', encoding='utf-8') as test_file:
    for line in test_file:
        test.append(line.rstrip('\n\r'))
print(len(test))
# print(test[:3])

2028


#### You can split every sentence into a list of words on whitespace.

In [214]:
def _split_labeled(rows):
    """Return [label, words] pairs, splitting each sentence on single spaces."""
    return [[row[0], row[1].split(' ')] for row in rows]


train_split = _split_labeled(train)
dev_split = _split_labeled(dev)
# Test sentences carry no label; each entry is a one-element list holding the words.
test_split = [[sentence.split(' ')] for sentence in test]
# Uncomment to inspect:
# train_split[:2]
# dev_split[:2]
# test_split[:2]

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes here. We will use your predictions on the unlabeled test data for grading, and we will check this part to understand how your method produced the predictions.

In [215]:
# Placeholder for the final predictions: it must end up as a list of 2028 values, each 0 or 1.
results = list()

In [216]:
# Third-party imports for the LSTM classifier, plus a fixed NumPy seed.
import numpy
# Seed NumPy's global random number generator.
numpy.random.seed(7)
from keras.models import Sequential
# NOTE(review): `optimizers` is not referenced in the visible cells (the model
# is compiled with the string 'adam') - confirm whether it is still needed.
from tensorflow.keras import optimizers
from keras.initializers import Constant
# `sequence` provides pad_sequences, used to pad the encoded sentences.
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
# NOTE(review): `Flatten` is not referenced in the visible cells - confirm.
from keras.layers import LSTM, Dropout, Dense, Flatten

In [217]:
# Keep only the tokenised sentences from the training pairs (labels dropped).
train_sentences = [words for _label, words in train_split]
# train_sentences[:2]

In [218]:
from gensim.models import Word2Vec

# Train word embeddings on the tokenised training sentences.
word2Vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=10,        # context window size
    min_count=1,      # keep every word, even those seen once
    sg=1,             # 1 selects skip-gram
)

In [219]:
# Pull the learned vectors and vocabulary out of the trained Word2Vec model.
word2Vec_weights = word2Vec_model.wv.vectors
word2Vec_vocab_size, word2Vec_embedding_size = word2Vec_weights.shape
vocabulary = list(word2Vec_model.wv.index_to_key)

# word -> embedding vector lookup table.
word2Vec_dict = {token: word2Vec_model.wv.get_vector(token) for token in vocabulary}

print(len(word2Vec_dict))
# Not the last expression of the cell, so nothing is displayed; it still
# raises KeyError if '.' is missing from the vocabulary.
len(word2Vec_dict['.'])
print(word2Vec_vocab_size, word2Vec_embedding_size)
print(len(vocabulary))

20860
20860 100
20860


In [175]:
# Earlier draft, kept commented out: it uses the old names `word_vec_dict`,
# `vocab` and `word2vec_model`, so running it as-is would not match the
# `word2Vec_*` names defined elsewhere in this notebook.
# word_vec_dict = {}
# for word in vocab:
#     word_vec_dict[word] = word2vec_model.wv.get_vector(word) 
# print(len(word_vec_dict))
# len(word_vec_dict['.'])



20860


100

In [220]:
# getting maximum length sentence
maxList = max(train_sentences, key = lambda i: len(i))
max_sentence_length = len(maxList)
print(max_sentence_length, maxList)

56 ['lkê', 'xt8ö', 'tc', 'xjtlkxo8', 'lw', 'ükjl', 'wóê', 'é#êcoöêc', 'j#ê', 'lkê', 'Úwwm', 'zc', 'lütó', 'é#êötcêc', '77', 'lkjl', 'üê', 'Úêáwöê', 'ükw', 'üê', 'j#ê', 'wó', 'lkê', 'Újámc', 'wx', 'wo#', 'éj#êólc', ',', 'Úol', 'üê', 'kj2ê', 'ów', 't6êj', 'ükw', 'lkêú', 'üê#ê', 'jl', 'wo#', 'jyê', '.', 'jó6', 'lkjl', 'ltöê', 'tc', 'j', 'x8êêltóy', 'jó6', 'é#êátwoc', 'áwööw6tlú', 'ów', 'öjllê#', 'kwü', 'w86', 'úwo', 'j#ê', '.']


In [223]:
# Separate the training pairs into token lists (inputs) and labels (targets).
X_train = [words for _label, words in train_split]
Y_train = [label for label, _words in train_split]
# print(X_train[:2])
# print(Y_train[:4])
print(len(X_train))
len(Y_train)

16220


16220

In [224]:
from collections import defaultdict

# Count how often each word occurs across all training sentences.
train_dict = defaultdict(int)
for sentence in X_train:
    for token in sentence:
        train_dict[token] += 1

print(len(train_dict))

20860


In [225]:
# Keep only words whose training frequency lies within [min_freq, max_freq].
min_freq, max_freq = 1, 10000
train_dict = {
    word: count
    for word, count in train_dict.items()
    if min_freq <= count <= max_freq
}
print(len(train_dict))

20856


In [226]:
# Map each kept word to its frequency rank: 1 = most frequent. Rank 0 is left
# unassigned here.
words_by_frequency = sorted(train_dict, key=train_dict.get, reverse=True)
rank_words = {word: position for position, word in enumerate(words_by_frequency, start=1)}
vocabulary_size = len(rank_words)
print(vocabulary_size)

20856


In [227]:
# Replace every training word by its frequency rank; words without a rank become 0.
X_train_encoded = [
    [rank_words.get(token, 0) for token in sentence]
    for sentence in X_train
]

len(X_train_encoded[0])


18

In [228]:
# Separate the dev pairs into token lists and labels.
# Note that `y_test` holds the *dev* labels.
X_dev = [words for _label, words in dev_split]
y_test = [label for label, _words in dev_split]
len(X_dev)

2027

In [229]:
# Encode dev sentences with the ranks learned from the training set;
# words without a rank become 0.
X_dev_encoded = [
    [rank_words.get(token, 0) for token in sentence]
    for sentence in X_dev
]

len(X_dev_encoded)

2027

In [230]:
# Pad (or truncate) the encoded train and dev sentences to max_sentence_length.
# Note that `X_test` holds the padded *dev* sentences.
X_train, X_test = (
    sequence.pad_sequences(encoded, maxlen=max_sentence_length)
    for encoded in (X_train_encoded, X_dev_encoded)
)
X_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  194, 4529,   10,  163,    0,    5,   10,    0,  212,
       2069, 9249,  184,    0,  828,    2,    0,  862,    0,  196,    0,
       2529, 6078,   25,   42,  397,    1,    0,  804,  886,   42, 1651,
          0], dtype=int32)

In [231]:
import numpy as np

# Convert the inputs and labels to NumPy arrays.
X_train, X_test, Y_train, y_test = (
    np.asarray(values) for values in (X_train, X_test, Y_train, y_test)
)

In [232]:
# Build the initial weights for the embedding layer: row r holds the Word2Vec
# vector of the word whose rank is r. Row 0 is never written here and stays zero.
embedding_matrix = np.zeros(shape=(vocabulary_size + 1, word2Vec_embedding_size))
for word, rank in rank_words.items():
    vector = word2Vec_dict.get(word)
    if vector is None:
        # Word unknown to the Word2Vec model: leave its row as zeros.
        continue
    embedding_matrix[rank] = vector
print(embedding_matrix.shape)

(20857, 100)


In [233]:
# Sanity check: the matrix row for a word should equal its Word2Vec vector.
probe_word = 'y#êjl'
print(word2Vec_model.wv.get_vector(probe_word))
embedding_matrix[rank_words[probe_word]]

[-0.3859272   0.29502597  0.5025096  -0.00436226 -0.27154326 -0.10293099
  0.32060978  0.2423444  -0.1385685  -0.50439054 -0.0077074  -0.1548165
 -0.02441715  0.08030856  0.1301237   0.07277843  0.42257386  0.11222274
 -0.14543763 -0.68495     0.36744437 -0.13969915  0.28065568 -0.2201954
 -0.16848989 -0.01974002 -0.26097727 -0.04706571 -0.04970746  0.38222367
  0.22387798 -0.08790313  0.10107074 -0.48340166  0.17058302  0.26408052
  0.28522012  0.0994729  -0.514254   -0.05467613  0.24629626 -0.38426045
 -0.21269956  0.356397   -0.09157103 -0.12774144  0.14562462 -0.30157048
  0.40542898  0.22734295  0.06030554 -0.22287962 -0.26009795 -0.15791777
 -0.46795398 -0.01119609  0.15155265 -0.00257432 -0.252003    0.41979232
 -0.01622939  0.00529151  0.41018394 -0.12429683 -0.14171688  0.56592745
  0.12852614  0.15051155 -0.42725107 -0.00219187 -0.17311488  0.07736689
  0.18298632  0.29209015  0.32596895  0.06871334  0.04326439  0.24928945
 -0.3235982   0.02513141 -0.23781468 -0.02458365 -0.2

array([-0.3859272 ,  0.29502597,  0.50250959, -0.00436226, -0.27154326,
       -0.10293099,  0.32060978,  0.24234439, -0.13856851, -0.50439054,
       -0.0077074 , -0.15481649, -0.02441715,  0.08030856,  0.1301237 ,
        0.07277843,  0.42257386,  0.11222274, -0.14543763, -0.68494999,
        0.36744437, -0.13969915,  0.28065568, -0.2201954 , -0.16848989,
       -0.01974002, -0.26097727, -0.04706571, -0.04970746,  0.38222367,
        0.22387798, -0.08790313,  0.10107074, -0.48340166,  0.17058302,
        0.26408052,  0.28522012,  0.0994729 , -0.51425397, -0.05467613,
        0.24629626, -0.38426045, -0.21269956,  0.356397  , -0.09157103,
       -0.12774144,  0.14562462, -0.30157048,  0.40542898,  0.22734295,
        0.06030554, -0.22287962, -0.26009795, -0.15791777, -0.46795398,
       -0.01119609,  0.15155265, -0.00257432, -0.25200301,  0.41979232,
       -0.01622939,  0.00529151,  0.41018394, -0.12429683, -0.14171688,
        0.56592745,  0.12852614,  0.15051155, -0.42725107, -0.00

In [234]:
# LSTM classifier whose embedding layer starts from the Word2Vec vectors.
model = Sequential([
    Embedding(
        input_dim=vocabulary_size + 1,
        output_dim=word2Vec_embedding_size,
        input_length=max_sentence_length,
        embeddings_initializer=Constant(embedding_matrix),
    ),
    Dropout(0.2),
    LSTM(50),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())
# X_test / y_test hold the dev set, used here as validation data.
model.fit(X_train, Y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15e42ffa0>

In [235]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Probability cut-off for predicting class 1; the same value is used for the
# dev evaluation and for the final test predictions.
DECISION_THRESHOLD = 0.49

# --- Evaluate on the dev set (X_test / y_test hold the dev data) ---
y_dev_prob = model.predict(X_test)
y_dev_lstm = (y_dev_prob.ravel() >= DECISION_THRESHOLD).astype(int).tolist()
# print(y_dev_lstm)
print(confusion_matrix(y_test, y_dev_lstm))
print(classification_report(y_test, y_dev_lstm))
print(accuracy_score(y_test, y_dev_lstm))
with open('lstm.txt', 'w') as wf:
    for item in y_dev_lstm:
        wf.write(str(item) + '\n')

# --- Predict the unlabeled test set ---
# Without this step `results` stays empty and the submission cells fail.
# The test sentences get the same treatment as train/dev: split on spaces,
# map words to their training-frequency rank (0 if unranked), then pad or
# truncate to max_sentence_length.
X_unlabeled_encoded = [
    [rank_words.get(word, 0) for word in sentence.split(' ')]
    for sentence in test
]
X_unlabeled = sequence.pad_sequences(X_unlabeled_encoded, maxlen=max_sentence_length)
test_prob = model.predict(X_unlabeled)
# Plain Python ints (0 or 1), one per test sentence, in file order.
results = (test_prob.ravel() >= DECISION_THRESHOLD).astype(int).tolist()
print(len(results))



[[835 117]
 [106 969]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       952
           1       0.89      0.90      0.90      1075

    accuracy                           0.89      2027
   macro avg       0.89      0.89      0.89      2027
weighted avg       0.89      0.89      0.89      2027

0.889985199802664


In [26]:
# Comparison run: same architecture, but the embedding layer has no
# embeddings_initializer, so it is not seeded with the Word2Vec vectors.
#
# This cell used to reference `w2v_vocab_size`, `w2v_embedding_size` and
# `y_train`, which are not defined in this notebook; it now uses the
# `word2Vec_*` / `Y_train` names that are. The network is bound to
# `baseline_model` so that running the notebook top to bottom does not
# overwrite the trained `model`.
baseline_model = Sequential()
# word2Vec_vocab_size covers every training word, so word2Vec_vocab_size + 1
# is at least as large as the biggest word index (vocabulary_size).
baseline_model.add(Embedding(input_dim=word2Vec_vocab_size + 1,
                             output_dim=word2Vec_embedding_size,
                             input_length=max_sentence_length))
baseline_model.add(Dropout(0.2))
baseline_model.add(LSTM(50))
baseline_model.add(Dropout(0.2))
baseline_model.add(Dense(1, activation='sigmoid'))
baseline_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(baseline_model.summary())
# X_test / y_test hold the dev set, used here as validation data.
baseline_model.fit(X_train, Y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x131edc730>

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [None]:
# 'results' must hold your model's predictions for the 2028 test cases read
# from test_enc_unlabeled.tsv, one entry per case.
assert len(results) == 2028

In [None]:
# Make sure the results are integers (0 and 1), not floating-point numbers.
results = list(map(int, results))

In [None]:
# Write one prediction per line to 'upload_predictions.txt'; upload that file later.
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    fp.writelines(str(prediction) + '\n' for prediction in results)