In [1]:
import pandas as pd
import textdistance
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score

In [2]:
# Input files, one per data split. Only the train and dev files are read with
# a label column; the test file is read without one.
train_filename = "train_with_label.txt"
dev_filename = "dev_with_label.txt"
test_filename = "test_without_label.txt"

In [3]:
def readdf(filename, has_label=True):
    """Load a tab-separated sentence-pair file into a DataFrame.

    Each non-blank line is expected to hold ``id<TAB>s1<TAB>s2`` and, when
    ``has_label`` is true, a trailing ``<TAB>label`` field.

    Args:
        filename: Path of the UTF-8 encoded text file to read.
        has_label: Whether each line carries a fourth, integer label field.

    Returns:
        A DataFrame with string columns ``id``, ``s1``, ``s2`` (both sentences
        lower-cased) and, if ``has_label``, an integer ``label`` column.

    Raises:
        ValueError: If a line has more fields than expected (raised by the
            DataFrame constructor) or a label is not an integer literal.
    """
    col_names = ['id', 's1', 's2']
    if has_label:
        col_names.append('label')

    with open(filename, encoding='UTF-8') as file:
        # Strip the line terminator *before* splitting. Otherwise the last
        # field keeps its '\n': harmless for labelled files (int() ignores
        # it) but it used to leak into `s2` of unlabelled files, so their
        # similarity features were computed on different text than the
        # labelled splits. Whitespace-only lines (e.g. a trailing blank line)
        # are skipped instead of breaking the column count.
        rows = [line.rstrip('\r\n').split('\t') for line in file if line.strip()]

    df = pd.DataFrame(rows, columns=col_names)

    if has_label:
        # int() tolerates surrounding whitespace, matching the old rstrip().
        df['label'] = df['label'].map(int)

    # Item assignment avoids pandas' attribute-assignment pitfalls.
    df['s1'] = df['s1'].str.lower()
    df['s2'] = df['s2'].str.lower()

    return df

In [4]:
# Load the three splits; only train and dev carry labels.
train_df = readdf(train_filename)
dev_df = readdf(dev_filename)
test_df = readdf(test_filename, has_label=False)

# Oversample the label == 1 rows: append two full copies plus a random draw
# of 201 more. The draw is seeded so that Restart & Run All rebuilds exactly
# the same training set (it used to change on every run).
# NOTE(review): 201 is presumably chosen to balance the class counts - confirm.
positive_df = train_df[train_df.label == 1]
train_df = pd.concat([train_df,
                      positive_df,
                      positive_df,
                      positive_df.sample(201, random_state=0)])

In [5]:
def feature(df, fn):
    """Score every sentence pair of ``df`` with ``fn``.

    ``fn`` is called as ``fn(s1, s2)`` for each row, in row order, and the
    collected results are wrapped in a new DataFrame with a fresh default
    index (one row per pair).
    """
    scores = list(map(fn, df.s1, df.s2))
    return pd.DataFrame(scores)

In [6]:
def features(df):
    """Build the similarity feature matrix for the sentence pairs in ``df``.

    Returns a DataFrame with one row per pair and one column per textdistance
    measure, in this order: Jaccard, Sorensen, cosine with ``qval=3``, cosine
    with ``qval=4``, textdistance's default cosine, and Levenshtein. Every
    column comes from ``feature`` and is therefore labelled ``0``; callers
    should rely on column position, not on column names.
    """
    similarity_fns = (
        textdistance.jaccard.normalized_similarity,
        textdistance.sorensen.normalized_similarity,
        textdistance.Cosine(qval=3).normalized_similarity,
        textdistance.Cosine(qval=4).normalized_similarity,
        textdistance.cosine.normalized_similarity,
        textdistance.levenshtein.normalized_similarity,
    )
    columns = [feature(df, similarity) for similarity in similarity_fns]
    return pd.concat(columns, axis=1)

In [7]:
# Feature matrices for the three splits as float tensors (one row per pair).
X_train, X_dev, X_test = (
    torch.FloatTensor(features(split_df).values)
    for split_df in (train_df, dev_df, test_df)
)

In [8]:
# Integer class labels for the two labelled splits, as long tensors
# (the dtype nn.CrossEntropyLoss expects for class-index targets).
y_train, y_dev = (
    torch.LongTensor(split_df.label.values)
    for split_df in (train_df, dev_df)
)

In [9]:
# Sizes used when constructing the network.
num_samples = train_df.shape[0]  # rows in the (oversampled) training frame
num_input = X_train.shape[1]     # one input unit per feature column
num_hidden = 20                  # width of each hidden layer
num_output = 2                   # number of output scores per sample
class Net(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers and a linear output layer.

    Args:
        n_input: Number of input features. Defaults to the notebook-level
            ``num_input``.
        n_hidden: Width of both hidden layers. Defaults to the notebook-level
            ``num_hidden``.
        n_output: Number of output scores. Defaults to the notebook-level
            ``num_output``.

    The notebook-level defaults are looked up when the network is
    instantiated, so ``Net()`` builds exactly the same architecture as before,
    while explicit sizes let the class be used without those globals.
    """

    def __init__(self, n_input=None, n_hidden=None, n_output=None):
        super().__init__()
        # Fall back to the notebook globals only for sizes not given explicitly.
        n_input = num_input if n_input is None else n_input
        n_hidden = num_hidden if n_hidden is None else n_hidden
        n_output = num_output if n_output is None else n_output

        self.fc1 = nn.Linear(n_input, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the output-layer scores for ``x`` (no softmax is applied here)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.out(x)
        return x

In [10]:
# Seed PyTorch's RNG before building the network so the random weight
# initialisation - and therefore the trained model - is the same on every
# Restart & Run All. (Scores recorded from earlier, unseeded runs may differ.)
torch.manual_seed(0)

clf = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(clf.parameters(), lr=0.1)
epochs = 12000

# Full-batch gradient descent: every step uses the whole training set.
clf.train()
for i in range(epochs):
    # Call the module itself rather than .forward() so registered hooks run.
    y_hat = clf(X_train)
    loss = criterion(y_hat, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [11]:
# Predict the dev split in one batched forward pass instead of one Python-level
# call per row, and call the module itself rather than .forward() so hooks run.
clf.eval()
with torch.no_grad():
    y_hat = clf(X_dev)
    # Predicted class = index of the largest score in each row.
    y_hat_dev = y_hat.argmax(dim=1).tolist()
f1_score(y_dev, y_hat_dev)

0.8101694915254237

In [12]:
# Predict the unlabelled test split in one batched forward pass instead of one
# Python-level call per row, and call the module itself rather than .forward().
clf.eval()
with torch.no_grad():
    y_hat = clf(X_test)
    # Predicted class = index of the largest score in each row.
    y_test = y_hat.argmax(dim=1).tolist()

# Write "id<TAB>label" lines, without a header row or index column.
results_df = pd.DataFrame({'id': test_df.id, 'label': y_test})
results_df.to_csv('EricNguyen_test_result.txt', sep='\t', header=False, index=False)