### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [1]:
train, dev, test = [], [], []

In [2]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))

16220


In [3]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))

2027


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [4]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))

2028


#### You can split every sentence into lists of words by white spaces.

In [5]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]

In [6]:
train_sentences = [item[1] for item in train]
Y_train = [item[0] for item in train]
dev_sentences = [item[1] for item in dev]
Y_dev = [item[0] for item in dev]
print(len(Y_train))
print(len(Y_dev))

16220
2027


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(min_df = 5, max_df = 0.90)
X_train = tfidfconverter.fit_transform(train_sentences)
tfidfconverter = TfidfVectorizer(vocabulary=tfidfconverter.vocabulary_, min_df = 5, max_df = 0.90)
X_dev = tfidfconverter.fit_transform(dev_sentences)
print(X_train.shape)
print(X_dev.shape)

(16220, 5631)
(2027, 5631)


In [8]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf', tol = 1e-6, C = 8.5)
svm.fit(X_train, Y_train)


SVC(C=8.5, tol=1e-06)

In [9]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
Y_pred_svm = svm.predict(X_dev)
print(confusion_matrix(Y_dev,Y_pred_svm))
print(classification_report(Y_dev,Y_pred_svm))
print(accuracy_score(Y_dev, Y_pred_svm))

[[851 101]
 [109 966]]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       952
           1       0.91      0.90      0.90      1075

    accuracy                           0.90      2027
   macro avg       0.90      0.90      0.90      2027
weighted avg       0.90      0.90      0.90      2027

0.8963986186482487


In [37]:
X_test = tfidfconverter.fit_transform(test)
Y_test = svm.predict(X_test)
results = [ x for x in Y_test]
zero = results.count(1)
one = results.count(0)
print(zero)
print(one)

989
1039


### Main Code Body

In [13]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [None]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [None]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [None]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')