# Quora Question Duplicates - Feature Engineering

Our goal is to identify which questions asked on [Quora](https://www.quora.com/), a quasi-forum website with over 100 million visitors a month, are duplicates of questions that have already been asked. This could be useful, for example, to instantly provide answers to questions that have already been answered. We are tasked with predicting whether the two questions in each pair are duplicates of one another, and our submission for this binary classification task is scored with the log loss metric.

In [15]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, jaccard, euclidean
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import re
# English stop-word list from NLTK; get_new_features uses it to separate
# stop-word tokens from the remaining tokens of each question.
STOP_WORDS = stopwords.words('english')
# Small constant added to the ratio denominators in get_new_features so that
# an empty token group does not cause a division by zero.
DELTA = 0.0001
%matplotlib inline

In [62]:
# Lazily-built resources shared by every preprocess_text call. Loading the
# stop-word list and constructing the stemmer once avoids repeating that work
# for each question in the dataset.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stops"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["stemmer"] = SnowballStemmer("english")
    return _PREPROCESS_RESOURCES["stops"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalise a question string for feature extraction.

    Steps, in order:
      1. lower-case the text and drop NLTK English stop words;
      2. spell out the ``%``, ``$`` and ``€`` symbols as words;
      3. expand the ``'re``, ``'s`` and ``'ve`` contractions;
      4. abbreviate trailing ``000000`` / ``000`` in numbers to ``m`` / ``k``;
      5. strip every character that is neither a word character nor
         whitespace;
      6. stem each remaining word with the English Snowball stemmer.

    Args:
        text: the raw question text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string of stems.
    """
    stops, stemmer = _get_preprocess_resources()

    # Remove stop words
    words = [w for w in text.lower().split() if w not in stops]
    text = " ".join(words)

    # Clean the text of special characters
    # NOTE(review): the rupee pattern is a doubled sign ("₹₹"), so a single
    # "₹" is not translated here and is later stripped as punctuation --
    # confirm whether the doubling is intentional.
    text = (
        text.replace("%", " percent ")
        .replace("₹₹", " rupee ")
        .replace("$", " dollar ")
        .replace("€", " euro ")
    )

    # Expand abbreviations
    # NOTE(review): expansion runs after stop-word removal, so words such as
    # "is", "are" and "have" produced here stay in the output.
    text = re.sub(r"\b([A-Za-z]+)'re\b", '\\1 are', text)
    text = re.sub(r"\b([A-Za-z]+)'s\b", '\\1 is', text)
    text = re.sub(r"\b([A-Za-z]+)'ve\b", '\\1 have', text)
    # The (?![0-9]) lookahead restricts the rewrite to zeros that END the
    # number; without it "10001" was turned into "1k1".
    text = re.sub(r"([0-9]+)000000(?![0-9])", r"\1m", text)
    text = re.sub(r"([0-9]+)000(?![0-9])", r"\1k", text)

    # Strip punctuation / symbols, then split into words
    text = re.sub(r'[^\w\s]', '', text)
    words = text.lower().split()

    # Shorten words to their stems
    stemmed_words = [stemmer.stem(word) for word in words]

    return " ".join(stemmed_words)

In [63]:
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer

def _avg_len(words, eps):
    """Return the mean length of *words*, with *eps* added to the denominator."""
    return sum(len(word) for word in words) / (len(words) + eps)


def get_new_features(q1, q2, stop_words=None, delta=None):
    """Compute pairwise lexical features for two preprocessed questions.

    Args:
        q1: first question, as a whitespace-separated string.
        q2: second question, as a whitespace-separated string.
        stop_words: optional collection of stop words; defaults to the
            module-level ``STOP_WORDS``.
        delta: optional constant added to every ratio denominator; defaults
            to the module-level ``DELTA``. Passing ``0`` can raise
            ``ZeroDivisionError`` when a question has no stop words or no
            non-stop words.

    Returns:
        A list of 15 values:
          0, 1   shared non-stop words / (min, max) non-stop word count
          2, 3   shared stop words / (min, max) stop word count
          4, 5   shared tokens / (min, max) token count
          6, 7   1 if the first / last tokens are equal, else 0
          8      absolute difference in token counts
          9      absolute difference in mean non-stop word length
          10     absolute difference in mean stop word length
          11     mean token count of the two questions
          12     SequenceMatcher ratio over the non-stop words
          13     SequenceMatcher ratio over the stop words
          14     not populated by this function; always 0.0
        If either question has no tokens, all 15 values are 0.0.
    """
    stops = STOP_WORDS if stop_words is None else stop_words
    # Set membership is O(1); a plain list would be scanned once per token.
    if not isinstance(stops, (set, frozenset)):
        stops = frozenset(stops)
    eps = DELTA if delta is None else delta

    new_features = [0.0] * 15

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return new_features

    q1_words = {word for word in q1_tokens if word not in stops}
    q2_words = {word for word in q2_tokens if word not in stops}

    q1_stop_words = {word for word in q1_tokens if word in stops}
    q2_stop_words = {word for word in q2_tokens if word in stops}

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stop_words & q2_stop_words)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))

    q1_avg_word_len = _avg_len(q1_words, eps)
    q2_avg_word_len = _avg_len(q2_words, eps)

    q1_avg_stop_len = _avg_len(q1_stop_words, eps)
    q2_avg_stop_len = _avg_len(q2_stop_words, eps)

    # Common words, stop words, and token ratios
    new_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + eps)
    new_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + eps)
    new_features[2] = common_stop_count / (min(len(q1_stop_words), len(q2_stop_words)) + eps)
    new_features[3] = common_stop_count / (max(len(q1_stop_words), len(q2_stop_words)) + eps)
    new_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + eps)
    new_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + eps)

    # Matching first or last token
    new_features[6] = int(q1_tokens[0] == q2_tokens[0])
    new_features[7] = int(q1_tokens[-1] == q2_tokens[-1])

    # Difference in token lengths, average word lengths
    new_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    new_features[9] = abs(q1_avg_word_len - q2_avg_word_len)
    # Compare q1 against q2; subtracting q1 from itself always gave 0.
    new_features[10] = abs(q1_avg_stop_len - q2_avg_stop_len)

    # Average of token length
    new_features[11] = (len(q1_tokens) + len(q2_tokens)) / 2

    # Similarity measures. The words are sorted before joining because set
    # iteration order for strings varies between interpreter runs, which
    # would make these two features non-reproducible.
    new_features[12] = SequenceMatcher(
        None, " ".join(sorted(q1_words)), " ".join(sorted(q2_words))
    ).ratio()
    new_features[13] = SequenceMatcher(
        None, " ".join(sorted(q1_stop_words)), " ".join(sorted(q2_stop_words))
    ).ratio()

    return new_features


In [64]:
def nlp_features(df):
    """Add the pairwise NLP feature columns to *df* and return it.

    The ``question1`` and ``question2`` columns are overwritten in place with
    their preprocessed text (missing values are treated as empty strings).
    ``get_new_features`` is then evaluated for every row, and each position of
    its result is stored in its own column.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.

    Returns:
        The same DataFrame object, with the 15 feature columns added.
    """
    for question_col in ("question1", "question2"):
        df[question_col] = df[question_col].fillna("").apply(preprocess_text)

    per_row = df.apply(
        lambda row: get_new_features(row["question1"], row["question2"]),
        axis=1,
    )

    # Column i receives element i of every per-row feature list, so the order
    # here must match the layout returned by get_new_features.
    feature_columns = (
        "min_common_word_count",
        "max_common_word_count",
        "min_common_stop_count",
        "max_common_stop_count",
        "min_common_token_count",
        "max_common_token_count",
        "same_first_word",
        "same_last_word",
        "abs_len_diff",
        "avg_word_len_diff",
        "avg_stop_len_diff",
        "mean_len",
        "word_similarity",
        "stop_similarity",
        "tf_idf",
    )
    for position, column in enumerate(feature_columns):
        df[column] = [values[position] for values in per_row]

    return df


In [65]:
# --- Training set: build the features, keep only the engineered columns ---
print("Running NLP features on train..")
train_df = nlp_features(pd.read_csv("input/train.csv"))
train_df.drop(
    ["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
    axis=1,
    inplace=True,
)
train_df.to_csv("input/nlp_features_train.csv", index=False)

# --- Test set: same pipeline, different identifier column to drop ---
print("Running NLP features on test..")
test_df = nlp_features(pd.read_csv("input/test.csv"))
test_df.drop(
    ["test_id", "question1", "question2"],
    axis=1,
    inplace=True,
)
test_df.to_csv("input/nlp_features_test.csv", index=False)

Running NLP features on train..
Running NLP features on test..


  interactivity=interactivity, compiler=compiler, result=result)
