### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [7]:
train, dev, test = [], [], []

In [8]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))

16220


In [9]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))

2027


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [10]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))

2028


#### You can split every sentence into lists of words by white spaces.

In [11]:
train = train + dev
X_train_sentences = [item[1] for item in train]
Y_train = [item[0] for item in train]
print(len(X_train_sentences))
print(len(Y_train))

18247
18247


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(min_df = 5, max_df = 0.90)
X_train = tfidfconverter.fit_transform(X_train_sentences)
tfidfconverter = TfidfVectorizer(vocabulary = tfidfconverter.vocabulary_, min_df = 2, max_df = 0.90)
X_test = tfidfconverter.fit_transform(test)
print(X_train.shape)
print(X_test.shape)

(18247, 6050)
(2028, 6050)


In [14]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svm = SVC(kernel='rbf', C = 8)
# svm.fit(X_train, Y_train)
scores = cross_val_score(svm, X_train, Y_train, cv = 40)
print(scores)


In [None]:
Y_pred_test = svm.predict(X_test)
print(len(Y_pred_test))

### Main Code Body

In [9]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []
results = [ pred for pred in Y_pred_test]
print(results.count(0))
print(results.count(1))

1031
997


### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [10]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [11]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [12]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')