In [1]:
import re # Regular expression package.
import pandas as pd # Data analysis library used in processing data and features.
from nltk.corpus import stopwords # Used in pre-processing to remove stopwords.
from fuzzywuzzy import fuzz # String matching package used in extracting fuzzy features.
import distance # Used to compare sequences of tokens.
from nltk.stem.wordnet import WordNetLemmatizer # Used in pre-processing to stem words.
from collections import Counter # Used in processing strings.



In [2]:
# Small constant added to denominators to prevent division-by-zero errors.
SAFE_DIV = 0.000001 
# Words that appear fewer than RARE_WORD_LIMIT times are considered rare words.
RARE_WORD_LIMIT = 100 
# Rare words are replaced by this placeholder token. The word itself is taken
# from the article "Wonderful Words That You're Not Using (Yet)".
magic_word = 'biblioklept' 

In [3]:
# Lemmatize words of four or more characters; shorter words are left as they are.
def cutter(word):
    """Reduce ``word`` to its WordNet lemma.

    The word is lemmatized as a noun first and the result is then lemmatized
    as a verb. Words shorter than four characters are returned unchanged.

    Args:
        word: A single token (``str``).

    Returns:
        The lemmatized token, or ``word`` itself when it has fewer than four
        characters.
    """
    if len(word) < 4:
        return word
    # One lemmatizer instance serves both passes.
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(lemmatizer.lemmatize(word, "n"), "v")

In [4]:
# Cleaning texts
def preprocess(string):
    """Normalise a question string before feature extraction.

    Steps, in order:
      1. lower-case the text;
      2. apply literal substitutions (number suffixes, curly apostrophes,
         contractions, currency and math symbols);
      3. canonicalise a few specific terms (e-mail, usa, the us, uk, c#);
      4. replace punctuation characters with spaces;
      5. abbreviate trailing zeros in numbers ("5000" -> "5k");
      6. lemmatise every whitespace-separated token with ``cutter``.

    Args:
        string: The raw question text (``str``).

    Returns:
        The cleaned text, with tokens separated by single spaces.
    """
    string = string.lower()  # Lower capitalizations.

    # Order matters: specific contractions must be expanded before the
    # generic "n't" / "'s" rules, and apostrophe variants must be unified
    # before any contraction is looked up.
    literal_replacements = (
        (",000,000", "m"),
        (",000", "k"),
        ("′", "'"),
        ("’", "'"),
        ("won't", "will not"),
        ("cannot", "can not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("what's", "what is"),
        ("that's", "that is"),
        ("it's", "it is"),
        ("'ve", " have"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("'s", " own"),
        ("%", " percent "),
        ("₹", " rupee "),
        ("$", " dollar "),
        ("€", " euro "),
        ("'ll", " will"),
        ("'d", " would"),
        ("=", " equal "),
        ("+", " plus "),
    )
    # str.replace returns a new string (strings are immutable), so the result
    # has to be assigned back for the substitution to take effect.
    for old, new in literal_replacements:
        string = string.replace(old, new)

    string = re.sub(r"e-mail", "email", string)
    string = re.sub(r" usa ", " america ", string)
    # Word boundaries keep this from firing inside words such as "the use".
    string = re.sub(r"\bthe us\b", "america", string)
    string = re.sub(r" uk ", " england ", string)
    string = re.sub(r"c#", "c sharp", string)

    # Replace punctuation with spaces. Raw string avoids invalid-escape warnings.
    string = re.sub(r"""[“”('…)!^".;:,\-?？{}\[\]/*@]""", " ", string)
    string = re.sub(r"([0-9]+)000000", r"\1m", string)
    string = re.sub(r"([0-9]+)000", r"\1k", string)

    string = " ".join([cutter(w) for w in string.split()])

    return string

In [5]:
def get_token_features(q1, q2):
    """Compute ten token-overlap features for a pair of questions.

    Both questions are split on whitespace. Tokens are separated into stop
    words and non-stop words (each passed through ``cutter``) and compared.

    Args:
        q1: First question text (``str``).
        q2: Second question text (``str``).

    Returns:
        A list of ten values:
          [0] common non-stop words / size of the smaller non-stop-word set
          [1] common non-stop words / size of the larger non-stop-word set
          [2] common stop words / size of the smaller stop-word set
          [3] common stop words / size of the larger stop-word set
          [4] common tokens / token count of the shorter question
          [5] common tokens / token count of the longer question
          [6] 1 if the last tokens are equal, else 0
          [7] 1 if the first tokens are equal, else 0
          [8] absolute difference of the token counts
          [9] mean of the token counts
        If either question has no tokens, ten zeros are returned.
    """
    token_features = [0.0] * 10

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Loaded once and kept as a frozenset: this function runs once per row,
    # and a list lookup per token would be needlessly slow.
    stop_words = _english_stop_words()

    # Words in each question excluding stop words.
    q1_words = {cutter(word) for word in q1_tokens if word not in stop_words}
    q2_words = {cutter(word) for word in q2_tokens if word not in stop_words}
    # Stop words in each question.
    q1_stops = {cutter(word) for word in q1_tokens if word in stop_words}
    q2_stops = {cutter(word) for word in q2_tokens if word in stop_words}

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))

    # Ratios use min/max of the two sizes rather than "question 1" and
    # "question 2", so the features do not depend on the question order.
    # SAFE_DIV keeps the denominators non-zero.
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    # Equality of the last word.
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    # Equality of the first word.
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    # Difference in lengths of two questions.
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    # Average length of two questions.
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2
    return token_features


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached

In [6]:
def get_longest_substr_ratio(a, b):
    """Ratio of a common substring's length to the shorter input's length plus one.

    The substring is the first entry returned by ``distance.lcsubstrings(a, b)``.

    Args:
        a: First sequence (a string in this notebook).
        b: Second sequence.

    Returns:
        ``0`` when ``distance.lcsubstrings`` yields nothing; otherwise the
        length of its first result divided by ``min(len(a), len(b)) + 1``.
    """
    common = list(distance.lcsubstrings(a, b))
    if not common:
        return 0
    shorter_len = min(len(a), len(b))
    # The "+ 1" keeps the denominator non-zero.
    return len(common[0]) / (shorter_len + 1)

In [7]:
# Words that appear fewer than RARE_WORD_LIMIT times are considered "rare words"
# and are replaced by a special token.
def replace_rare(df, rare_word_limit=None, replacement=None):
    """Replace rare words in both question columns with a placeholder token.

    Word frequencies are counted over the whitespace-separated tokens of
    ``df['question1']`` and ``df['question2']`` together. Every token whose
    count is below the limit is replaced; each question is then re-joined
    with single spaces so the columns still hold strings.

    Args:
        df: DataFrame with string columns ``question1`` and ``question2``.
            Both columns are overwritten in place.
        rare_word_limit: Tokens occurring fewer than this many times are
            treated as rare. Defaults to the module-level ``RARE_WORD_LIMIT``.
        replacement: Token substituted for rare words. Defaults to the
            module-level ``magic_word``.

    Returns:
        None. ``df`` is modified in place.
    """
    if rare_word_limit is None:
        rare_word_limit = RARE_WORD_LIMIT
    if replacement is None:
        replacement = magic_word

    question_columns = ("question1", "question2")

    counts = Counter()
    for column in question_columns:
        for question in df[column]:
            counts.update(question.split())

    # A set gives constant-time membership tests in the replacement pass.
    rare_words = {word for word, count in counts.items() if count < rare_word_limit}

    def _mask(question):
        # Iterate over words (not characters) and return a string again.
        return " ".join(
            replacement if word in rare_words else word for word in question.split()
        )

    for column in question_columns:
        df[column] = [_mask(question) for question in df[column]]

In [8]:
def extract_features(df):
    """Clean both question columns and append token and fuzzy features.

    The frame is modified in place: ``question1`` / ``question2`` have missing
    values replaced by ``""`` and are passed through ``preprocess``, rare
    words are masked via ``replace_rare``, and fifteen feature columns are
    added.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    for column in ("question1", "question2"):
        df[column] = df[column].fillna("").apply(preprocess)

    replace_rare(df)  # Replace rare words with a special token.

    print("Token features.")
    per_row_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]), axis=1
    )
    # Column names in the order get_token_features returns its values.
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in per_row_features]

    print("Fuzzy features.")
    pair_scorers = (
        # Ratio of removing duplicates.
        ("token_set_ratio", fuzz.token_set_ratio),
        # Ratio of ordered words.
        ("token_sort_ratio", fuzz.token_sort_ratio),
        # Ratio of original questions.
        ("fuzz_ratio", fuzz.QRatio),
        # Ratio of similar substring.
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        # Ratio of longer substring over the shorter one.
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["question1"], row["question2"]),
            axis=1,
        )
    return df

In [None]:
# Build the feature table for the training set and write it to CSV.
print("Extracting features for training data:")
train_df = extract_features(pd.read_csv("data/train.csv"))
# Keep only the engineered feature columns in the exported file.
train_df.drop(
    columns=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
    inplace=True,
)
train_df.to_csv("data/nlp_stemmed_features_train.csv", index=False)

Extracting features for training data:


In [None]:
# Build the feature table for the test set and write it to CSV.
print("Extracting features for testing data:")
test_df = extract_features(pd.read_csv("data/test.csv"))
# Keep only the engineered feature columns in the exported file.
test_df.drop(
    columns=["test_id", "question1", "question2"],
    inplace=True,
)
test_df.to_csv("data/nlp_stemmed_features_test.csv", index=False)