### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [36]:
train, dev, test = [], [], []

In [37]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))

16220


In [38]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))

2027


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [39]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))

2028


#### You can split every sentence into lists of words by white spaces.

In [None]:
### Main Code Body

In [5]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]

In [6]:
train_sentences = [item[1] for item in train]
y_train = [item[0] for item in train]
print(len(y_train))

16220


In [7]:
dev_sentences = [item[1] for item in dev]
y_dev = [item[0] for item in dev]
print(len(y_dev))

2027


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(min_df = 5, max_df = 0.90)
X_train = tfidfconverter.fit_transform(train_sentences).toarray()
print(len(X_train[0]))

5631


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(vocabulary=tfidfconverter.vocabulary_, min_df = 5, max_df = 0.90)
X_dev = tfidfconverter.fit_transform(dev_sentences).toarray()
print(len(X_dev[0]))

5631


In [22]:
from sklearn.linear_model import Perceptron
from sklearn.ensemble import AdaBoostClassifier
base_perceptron = Perceptron(penalty = 'l2', alpha = 0.00001, tol = 1e-5, max_iter = 10000)
perceptron_adaBoost = AdaBoostClassifier(base_estimator = base_perceptron, n_estimators = 20000, learning_rate = 0.1, algorithm='SAMME')
perceptron_adaBoost.fit(X_train, y_train)


AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=Perceptron(alpha=1e-05, max_iter=10000,
                                             penalty='l2', tol=1e-05),
                   learning_rate=0.1, n_estimators=20000)

In [26]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred_ab_perceptron = perceptron_adaBoost.predict(X_dev)
print(confusion_matrix(y_dev,y_pred_ab_perceptron))
print(classification_report(y_dev,y_pred_ab_perceptron))
print(accuracy_score(y_dev, y_pred_ab_perceptron))
with open('adaboost_perceptron.txt', 'w') as wf:
    for item in y_pred_ab_perceptron:
        wf.write(str(item) + '\n')

[[808 144]
 [153 922]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       952
           1       0.86      0.86      0.86      1075

    accuracy                           0.85      2027
   macro avg       0.85      0.85      0.85      2027
weighted avg       0.85      0.85      0.85      2027

0.8534780463739516


In [None]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf', tol=1e-6)
svm.fit(X_train, y_train)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred_svm = svm.predict(X_dev)
print(confusion_matrix(y_dev,y_pred_svm))
print(classification_report(y_dev,y_pred_svm))
print(accuracy_score(y_dev, y_pred_svm)
with open('svm.txt', 'w') as wf:
    for item in y_pred_svm:
        wf.write(str(item) + '\n')

### Main Code Body

In [13]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [None]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [None]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [None]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')