In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim.downloader as api
import textdistance
import rapidfuzz

In [2]:
stop_words = stopwords.words('english')
model = api.load('glove-wiki-gigaword-50')

In [3]:
train_filename = 'train_with_label.txt'
dev_filename = 'dev_with_label.txt'
test_filename = 'test_without_label.txt'

In [4]:
def preprocess(s):
    s = s.replace(' ,', ',')
    s = s.replace('$ ', '$')
    s = s.replace(" '", "'")
    s = s.replace('``', '"')
    s = s.replace("''", '"')
    s = s.replace(' .', '.')
    s = s.replace(' ?', '?')
    return s

In [5]:
def readdf(filename, has_label=True):
    with open(filename) as file:
        buffer = file.readlines()
        
        col_names = ['id', 'sentence1', 'sentence2']
        if has_label:
            col_names.append('similar')
            
        df = pd.DataFrame([row.split('\t') for row in buffer], columns=col_names)
        if has_label:
            df.similar = df.similar.apply(lambda x: int(x.rstrip()))
        
        df.sentence1 = df.sentence1.apply(lambda x: preprocess(x.lower()))
        df.sentence2 = df.sentence2.apply(lambda x: preprocess(x.lower()))
        
        return df

In [6]:
train_df = readdf(train_filename)
dev_df = readdf(dev_filename)
test_df = readdf(test_filename, has_label=False)

In [7]:
def wmd(s1, s2):
    s1_split = [x for x in word_tokenize(s1) if x not in stop_words and len(x) > 1]
    s2_split = [x for x in word_tokenize(s2) if x not in stop_words and len(x) > 1]
    return model.wmdistance(s1_split, s2_split)

def bleu(s1, s2):
    return sentence_bleu([s1], s2)

def meteor(s1, s2):
    return single_meteor_score(s1.split(), s2.split())

In [8]:
def feature(df, name, fn):
    return pd.DataFrame(
        [fn(s1,s2) for (s1,s2) in zip(df.sentence1, df.sentence2)],
        columns=[name]
    )

In [9]:
def features(df):
    return pd.concat([
        feature(df, 'wmd', wmd),
        feature(df, 'bleu', bleu),
        feature(df, 'meteor', meteor),
        feature(df, 'jaccard', textdistance.jaccard.normalized_similarity),
        feature(df, 'damerau', rapidfuzz.distance.DamerauLevenshtein.normalized_similarity),
    ], axis = 1)

In [10]:
X_train = features(train_df)
X_dev = features(dev_df)
X_test = features(test_df)

y_train = train_df.similar
y_dev = dev_df.similar

In [11]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_dev, y_dev)

0.6367403314917127

In [12]:
y_test = lr.predict(X_test)

results_df = pd.concat([test_df.id, pd.DataFrame(y_test, columns=['similar'])], axis=1)
results_df.to_csv('EricNguyen_test_result.txt', sep='\t', header=False, index=False)