In [54]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import difflib
import utils
import math
import re, string
import nltk
import itertools
from nltk import bigrams
from nltk import trigrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models, similarities
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn import model_selection
# from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, log_loss

In [2]:
# Import

train = pd.read_csv('train.csv')

In [3]:
# simple baseline model...compare words in [Andrea's]

def find_similarity(wl1, wl2):
    # send 2 word lists to find matching sequence
    sm = difflib.SequenceMatcher(None, wl1,wl2)
    sm = sm.ratio()
    return sm

In [4]:
# Preprocessing, SM loop
# code below runs through all IDs, comparing Q1 to Q2, storing similarity measurement [Andrea's]
# I played around with a bunch of preprocessing improvements here, but none actually performed better than what we have currently, for log_loss

sm_results = []

for id in range(0, len(train)):
# test with a smaller loop first
# for id in range(0, 20):

    q1 = str(train['question1'][id])
    q2 = str(train['question2'][id])
    
    q1 = q1.translate(None, string.punctuation).lower()
    q2 = q2.translate(None, string.punctuation).lower()

    q1words = q1.split()
    q2words = q2.split()
    
    sm_results.append([id, find_similarity(q1words, q2words)])
# print(sm_results)

In [5]:
actuals = np.array(train['is_duplicate'])
n_sm_results = np.array(sm_results)
predictions_sm = n_sm_results[:,1]
score_sm = log_loss(actuals, predictions_sm)
# accuracy_score_sm = accuracy_score(actuals, predictions_sm)
# print(accuracy_score_sm)
print(score_sm)

0.637853951947


In [6]:
# I want to binarize here before running log loss.

# sm_binarized_results = []

# for i, t in sm_results:
#     if t >= 0.75:
#         sm_binarized_results.append([i, 1])
#     else:
#         sm_binarized_results.append([i, 0])

# print(sm_binarized_results)

In [7]:
# Applied binarization, which netted us much worse results. Binarization will become more important for actual submissions, though.

# actuals1 = np.array(train['is_duplicate'])
# n_sm_results1 = np.array(sm_binarized_results)
# predictions_sm1 = n_sm_results1[:,1]
# score_sm1 = log_loss(actuals1, predictions_sm1)
# print(score_sm1)

The approach below comes from https://github.com/ab-bh/Quora-Duplicate-Question-Pairs/blob/master/TF-IDF%20Approach%20.ipynb, which I wanted to try re: learning. I think the approach is pretty elegant re: setup and can be extended, edited, nicely.

In [8]:
# def clean_text(content):
#     if type(content) == str:
#         text = content.lower()
# #         text = re.sub(r'[^\x00-\x7f]',r' ',text)
# #         text = re.sub("["+string.punctuation+"]", " ", text)
#         text = content.translate(None, string.punctuation).lower()
#         words=text.split()
# #         stop_word=set(stopwords.words('english'))
# #         words=list(word for word in words if not word in stop_word)
# #         words=[word for word in words if len(word)>1 ]
# #         words=[WordNetLemmatizer().lemmatize(word) for word in words]
#         return ( " ".join(words) )
#     else:
#         return ""

In [9]:
# # this take a somewhat long time

# train.question1 = train.question1.map(clean_text)
# train.question2 = train.question2.map(clean_text)

In [10]:
# # Looking into TF-IDF -- this takes a really long time.

# tfidf_vectorizer = TfidfVectorizer(analyzer='word', max_df=1.0, min_df=1)
# tfidf_results = []
    
# for i in train.id:
#     try:
#         tfidf_matrix = tfidf_vectorizer.fit_transform([train.loc[i]['question1'], train.loc[i]['question2']])
#         tfidf_results.append([i, round(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1], 3)])
#     except:
#         tfidf_results.append([i, 0])
# # print(tfidf_results)

In [11]:
# actuals_tfidf = np.array(train['is_duplicate'])
# n_tfidf_results = np.array(tfidf_results)
# predictions_tfidf = n_tfidf_results[:,1]
# score_tfidf = log_loss(actuals_tfidf, predictions_tfidf)
# # accuracy_score_tfidf = accuracy_score(actuals_tfidf, predictions_tfidf)
# # print(accuracy_score_tfidf)
# print(score_tfidf)

Next up, N-grams. Bigrams and trigrams are both poor predictors pre-binarization, as they're mostly syntactical and not strong for semantic meaning.

In [12]:
# Taking bi-grams for each question.
# Must tokenize before starting.

id=2
# bigram1 = list(bigrams(str(train['question1'][id])))
# bigram2 = list(bigrams(str(train['question2'][id])))

q1 = str(train['question1'][id])
q2 = str(train['question2'][id])
    
q1 = q1.translate(None, string.punctuation).lower()
q2 = q2.translate(None, string.punctuation).lower()

q1words = q1.split()
q2words = q2.split()

bigram1 = list(bigrams(q1words))
bigram2 = list(bigrams(q2words))

print bigram1
print bigram2

# # list(bigrams(['more', 'is', 'said', 'than', 'done']))

[('how', 'can'), ('can', 'i'), ('i', 'increase'), ('increase', 'the'), ('the', 'speed'), ('speed', 'of'), ('of', 'my'), ('my', 'internet'), ('internet', 'connection'), ('connection', 'while'), ('while', 'using'), ('using', 'a'), ('a', 'vpn')]
[('how', 'can'), ('can', 'internet'), ('internet', 'speed'), ('speed', 'be'), ('be', 'increased'), ('increased', 'by'), ('by', 'hacking'), ('hacking', 'through'), ('through', 'dns')]


In [13]:
# def bigrammer(words):
#     return zip(words, words[1:])

In [14]:
# Create loop to go through for each id and come up w/ a percentage of bigram overlap
# Removing stopwords increased log_loss quite a bit.

bigram_similarity_results = []

for id in range(0, len(train)):
# test with a smaller loop first
# for id in range(0, 20):

    q1 = str(train['question1'][id])
    q2 = str(train['question2'][id])
    
    q1 = q1.translate(None, string.punctuation).lower()
    q2 = q2.translate(None, string.punctuation).lower()

    q1words = q1.split()
    q2words = q2.split()
    
#     stop_word=set(stopwords.words('english'))
#     q1words=list(word for word in q1words if not word in stop_word)
#     q2words=list(word for word in q2words if not word in stop_word)
    
    bigram1 = list(bigrams(q1words))
    bigram2 = list(bigrams(q2words))
    
#     print bigram1, bigram2
    
    bigram_similarity_results.append([id, find_similarity(bigram1, bigram2)])
    
# print(bigram_similarity_results)

In [15]:
# bigram log_loss

actuals = np.array(train['is_duplicate'])
n_bigram_results = np.array(bigram_similarity_results)
predictions_bigram = n_bigram_results[:,1]
score_bigram = log_loss(actuals, predictions_bigram)
# accuracy_score_bigram = accuracy_score(actuals, predictions_bigram)
# print(accuracy_score_bigram)
print(score_bigram)

1.85594647997


In [16]:
# trigrams
# removing stopwords was not at all effective in lowering log_loss for ngram models

trigram_similarity_results = []

for id in range(0, len(train)):
# test with a smaller loop first
# for id in range(0, 20):

    q1 = str(train['question1'][id])
    q2 = str(train['question2'][id])
    
    q1 = q1.translate(None, string.punctuation).lower()
    q2 = q2.translate(None, string.punctuation).lower()

    q1words = q1.split()
    q2words = q2.split()
    
#     stop_word=set(stopwords.words('english'))
#     q1words=list(word for word in q1words if not word in stop_word)
#     q2words=list(word for word in q2words if not word in stop_word)    
    
    trigram1 = list(trigrams(q1words))
    trigram2 = list(trigrams(q2words))
    
#     print trigram1, trigram2
    
    trigram_similarity_results.append([id, find_similarity(trigram1, trigram2)])

# print(trigram_similarity_results)

In [17]:
# trigram log_loss

actuals = np.array(train['is_duplicate'])
n_trigram_results = np.array(trigram_similarity_results)
predictions_trigram = n_trigram_results[:,1]
score_trigram = log_loss(actuals, predictions_trigram)
# accuracy_score_trigram = accuracy_score(actuals, predictions_trigram)
# print(accuracy_score_sm)
print(score_trigram)

5.29866130866


In [18]:
# character bi-gram similarity

char_bi_similarity_results = []

for id in range(0, len(train)):
# test with a smaller loop first
# for id in range(0, 20):

    q1 = str(train['question1'][id])
    q2 = str(train['question2'][id])
    
    q1 = q1.translate(None, string.punctuation).lower()
    q2 = q2.translate(None, string.punctuation).lower()

#     q1words = q1.split()
#     q2words = q2.split()
    
#     stop_word=set(stopwords.words('english'))
#     q1words=list(word for word in q1words if not word in stop_word)
#     q2words=list(word for word in q2words if not word in stop_word)
    
    char_bigram1 = list(bigrams(q1))
    char_bigram2 = list(bigrams(q2))
    
#     print bigram1, bigram2
    
    char_bi_similarity_results.append([id, find_similarity(char_bigram1, char_bigram2)])
    
# print(char_tri_similarity_results)

In [19]:
# character bigram log_loss

actuals = np.array(train['is_duplicate'])
n_char_bigram_results = np.array(char_bi_similarity_results)
predictions_char_bigram = n_char_bigram_results[:,1]
score_char_bigram = log_loss(actuals, predictions_char_bigram)
# accuracy_score_trigram = accuracy_score(actuals, predictions_trigram)
# print(accuracy_score_sm)
print(score_char_bigram)

0.650028562634


In [20]:
# character tri-gram similarity

char_tri_similarity_results = []

for id in range(0, len(train)):
# test with a smaller loop first
# for id in range(0, 20):

    q1 = str(train['question1'][id])
    q2 = str(train['question2'][id])
    
    q1 = q1.translate(None, string.punctuation).lower()
    q2 = q2.translate(None, string.punctuation).lower()

#     q1words = q1.split()
#     q2words = q2.split()
    
#     stop_word=set(stopwords.words('english'))
#     q1words=list(word for word in q1words if not word in stop_word)
#     q2words=list(word for word in q2words if not word in stop_word)
    
    char_trigram1 = list(trigrams(q1))
    char_trigram2 = list(trigrams(q2))
    
#     print bigram1, bigram2
    
    char_tri_similarity_results.append([id, find_similarity(char_trigram1, char_trigram2)])
    
# print(char_tri_similarity_results)

In [21]:
# character trigram log_loss

actuals = np.array(train['is_duplicate'])
n_char_trigram_results = np.array(char_tri_similarity_results)
predictions_char_trigram = n_char_trigram_results[:,1]
score_char_trigram = log_loss(actuals, predictions_char_trigram)
# accuracy_score_trigram = accuracy_score(actuals, predictions_trigram)
# print(accuracy_score_sm)
print(score_char_trigram)

0.620707028792


In [22]:
# I want to add some simple features that will likely be beneficial

def common_words(x):
    q1, q2 = x
    return len(set(str(q1).lower().split()) & set(str(q2).lower().split()))

def words_count(question):
    return len(str(question).split())

def length(question):
    return len(str(question))

In [31]:
def tokenize(word):
    return word.split()

In [33]:
# I want to look into LSI and LDA similarities

In [42]:
# Adding features to train

train['trigram-word-similarity'] = predictions_trigram
train['bigram-word-similarity'] = predictions_bigram
train['trigram-char-similarity'] = predictions_char_trigram
train['bigram-char-similarity'] = predictions_char_bigram
train['q1_num_words'] = train['question1'].map(words_count)
train['q2_numwords'] = train['question2'].map(words_count)
train['q1_sent_length'] = train['question1'].map(length)
train['q2_sent_length'] = train['question2'].map(length)
train['common_words'] = train[['question1', 'question2']].apply(common_words, axis=1)

In [57]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,trigram-word-similarity,bigram-word-similarity,trigram-char-similarity,bigram-char-similarity,q1_words_num,q2_words_num,q1_length,q2_length,common_words,q1_num_words,q2_numwords,q1_sent_length,q2_sent_length
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.909091,0.916667,0.923077,0.92437,14,12,66,57,10,14,12,66,57
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.117647,0.210526,0.512,0.582677,8,13,51,88,4,8,13,51,88
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.0,0.090909,0.269841,0.3125,14,10,73,59,4,14,10,73,59
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.0,0.0,0.0,0.039604,11,9,50,65,0,11,9,50,65
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.0,0.0,0.224299,0.293578,13,7,76,39,2,13,7,76,39


In [76]:
# Run random forest at the end

train, test = train_test_split(train, test_size = 0.2)

Y = train.is_duplicate
X = train[['trigram-word-similarity', 'bigram-word-similarity', 'trigram-char-similarity', 'bigram-char-similarity', 'q1_words_num', 'q2_words_num', 'q1_length', 'q2_length', 'common_words', 'q1_num_words', 'q2_numwords', 'q1_sent_length', 'q2_sent_length']]
Y1 = test.is_duplicate
X1 = test[['trigram-word-similarity', 'bigram-word-similarity', 'trigram-char-similarity', 'bigram-char-similarity', 'q1_words_num', 'q2_words_num', 'q1_length', 'q2_length', 'common_words', 'q1_num_words', 'q2_numwords', 'q1_sent_length', 'q2_sent_length']]

clf = RandomForestClassifier(n_jobs=2)
clf.fit(X, Y)

preds = clf.predict(X1)
# clf.predict_proba(X)

In [77]:
list(zip(X, clf.feature_importances_))

[('trigram-word-similarity', 0.052654594077853292),
 ('bigram-word-similarity', 0.085351440061041728),
 ('trigram-char-similarity', 0.1841751874576136),
 ('bigram-char-similarity', 0.16940546297003919),
 ('q1_words_num', 0.029112562437234028),
 ('q2_words_num', 0.031861099535370419),
 ('q1_length', 0.073100233361969164),
 ('q2_length', 0.077853735173448291),
 ('common_words', 0.080629270199144121),
 ('q1_num_words', 0.030288919563345635),
 ('q2_numwords', 0.035607484999337538),
 ('q1_sent_length', 0.073116444640596689),
 ('q2_sent_length', 0.076843565523006377)]

In [79]:
actuals = np.array(Y1)
score_rfc = log_loss(actuals, preds)
# accuracy_score = accuracy_score(actuals, preds)
# print(accuracy_score)
print(score_rfc)

9.93879567684
