### Required Packages
- spacy: conda install -c conda-forge spacy

        # out-of-the-box: download best-matching default model
        python -m spacy download en
        python -m spacy download de
        python -m spacy download fr

        # download best-matching version of specific model for your spaCy installation
        python -m spacy download en_core_web_md

- ipyext: 
        conda install -c https://conda.anaconda.org/janschulz ipyext

- watermark: 
        pip install watermark

- plotly: 
        conda install -c https://conda.anaconda.org/plotly plotly -n python2.7

In [1]:
# install magic extension
#!conda install -c https://conda.anaconda.org/janschulz ipyext
#!pip install watermark

#install plotly
#!conda install -c https://conda.anaconda.org/plotly plotly -n python2.7

In [2]:
import nltk
import numpy as np
import pandas as pd
import scipy 
import re, os, sys

import spacy
import seaborn as sns
import matplotlib.pyplot as plt

from subject_object_extraction import findSVOs

%matplotlib inline

#### Print the timestamp, server, python version information

In [3]:
%load_ext watermark

%watermark -u -n -t -z -v -m -p nltk,scipy,pandas,spacy,numpy

last updated: Sat Aug 19 2017 02:47:41 CST

CPython 3.5.3
IPython 6.1.0

nltk 3.2.4
scipy 0.19.1
pandas 0.20.3
spacy 1.9.0
numpy 1.13.1

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 4.4.0-89-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 6
interpreter: 64bit


In [4]:
# Plotly imports.
# import plotly.offline as plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
#from plotly.graph_objs import *

init_notebook_mode(connected= True)

In [None]:
# Source: https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

# Show the output of every expression statement in a cell.
# By default, IPython only shows output for the last expression in the cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' 
# InteractiveShell.ast_node_interactivity = 'last'  # uncomment to show only the last expression again

### Download stopwords of nltk

In [None]:
# Fetch the NLTK 'stopwords' corpus; it is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
# Read both splits the same way: UTF-8 text, with missing cells replaced by "".
training_data, testing_data = (
    pd.read_csv(csv_path, encoding='utf-8').fillna("")
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Preview the first and last rows of the training split.
training_data.head()
training_data.tail()

In [None]:
# Preview the first and last rows of the testing split.
testing_data.head()
testing_data.tail()

In [None]:
# Summary statistics for every column (numeric and non-numeric) of both splits.
training_data.describe(include='all')
testing_data.describe(include='all')

# NLP Parse

In [None]:
# Load the spaCy 'en_core_web_md' model and build the NLTK English stop-word set.
nlp = spacy.load('en_core_web_md')
nltk_stops = set(nltk.corpus.stopwords.words("english"))
print('nltk stopwords lenth',len(nltk_stops))

# spaCy has more stop words than NLTK; print the size of its list for comparison.
# NOTE(review): `spacy.en.word_sets` matches the spaCy 1.9.0 install recorded by
# watermark above — confirm the module path before upgrading spaCy.
print('spacy stopword lenth',len(spacy.en.word_sets.STOP_WORDS))

In [None]:
# Re-register IS_STOP with a case-insensitive test so that lower/upper/title-cased
# variants (him/HIM/Him) are all flagged. An earlier attempt used s.lower():
# nlp.vocab.add_flag(lambda s: s.lower() in spacy.en.word_sets.STOP_WORDS, spacy.attrs.IS_STOP)
# NOTE(review): the original remark read "en_core_web_md does include stopword" —
# presumably explaining why this override is needed; confirm the intended meaning.

nlp.vocab.add_flag(lambda s: s.casefold() in spacy.en.word_sets.STOP_WORDS, spacy.attrs.IS_STOP)

In [None]:
def _question_frame(source, id_col, question_col, dataset, slot):
    """Return a copy of one question column in the common long layout.

    Columns of the result, in order: 'id', 'question', 'dataset' (1 = training,
    2 = testing) and 'q1_or_q2' (1 = question1, 2 = question2).
    """
    frame = source[[id_col, question_col]].copy()
    frame.columns = ['id', 'question']
    frame['dataset'] = dataset
    frame['q1_or_q2'] = slot
    return frame


df_train_q1 = _question_frame(training_data, 'id', 'question1', dataset=1, slot=1)
df_train_q2 = _question_frame(training_data, 'id', 'question2', dataset=1, slot=2)
df_test_q1 = _question_frame(testing_data, 'test_id', 'question1', dataset=2, slot=1)
df_test_q2 = _question_frame(testing_data, 'test_id', 'question2', dataset=2, slot=2)

# Inspect the tail of each per-column frame.
df_train_q1.tail()
df_train_q2.tail()

df_test_q1.tail()
df_test_q2.tail()

# Stack the four frames into one long table, then renumber the rows 0..n-1.
df_all = pd.concat([df_train_q1, df_train_q2, df_test_q1, df_test_q2])
df_all.head()
df_all.reset_index(drop=True, inplace=True)

In [None]:
# Clean the question text: collapse every run of whitespace (spaces, tabs,
# "\n", "\r") into a single space and remove leading/trailing whitespace.
# The previous pattern "\s\s+" only touched runs of two or more characters, so
# single newlines and leading/trailing blanks survived despite the stated intent.
df_all['q'] = df_all.question.map(lambda q: re.sub(r"\s+", " ", q).strip())



In [None]:
# Character count of each cleaned question.
df_all['q_len'] = df_all['q'].apply(len)
df_all.tail()

In [None]:
# Number of questions of each character length, per dataset and question slot.
df_all.pivot_table(values='q', index=['dataset','q1_or_q2'], columns = ['q_len'], 
               fill_value = 0,
               aggfunc='count')

# For every distinct question text: how many rows carry it in each dataset
# (one column per dataset code, 0 where the text does not occur).
df_pivot = df_all.pivot_table(values='id', index=['q'], columns = ['dataset'], 
               fill_value = 0,
#                margins= True, 
               aggfunc='count')
df_pivot

In [None]:
# Number of training pairs for each value of the is_duplicate label.
df_dup_pivot = training_data.pivot_table(values='id', index=['is_duplicate'], #columns = ['dataset'],
               fill_value = 0,
#                margins= True, 
               aggfunc='count')
df_dup_pivot

In [None]:
df_pivot.columns
# Rename the dataset-code columns to strings: '1' = training, '2' = testing.
df_pivot.columns = ['1','2']

# Chart 1: question pairs per dataset. Each pair contributes two rows to df_all
# (question1 and question2), so the occurrence total is halved.
data = go.Bar(x=['Training dataset','Testing dataset'], 
              y=[sum(df_pivot['1'])/2, sum(df_pivot['2'])/2],
#                text = ["{}".format(i) for i in question_cnt.index ],
              hoverinfo='y+text+name',
               name='Counts')
layout = go.Layout(
    title='Number of Question Pairs',
    xaxis=dict(
        title='dataset'
    ),
    yaxis=dict(
        title='Count'
    )
)
iplot(go.Figure(data=[data], layout=layout))


###################
# Chart 2: distinct question texts per dataset, i.e. the number of pivot rows
# with a non-zero count in that dataset's column.
data = go.Bar(x=['Training dataset','Testing dataset'], 
              y=[np.array(np.nonzero(df_pivot['1'])).shape[1], 
                np.array(np.nonzero(df_pivot['2'])).shape[1]], 
               name='Counts')
layout = go.Layout(
    title='Number of Unique Questions',
    xaxis=dict(
        title='dataset'
    ),
    yaxis=dict(
        title='Numbers of questions'
    )
)
iplot(go.Figure(data=[data], layout=layout))

#########################################
# Chart 3: occurrences of the empty-string question in each dataset.
# NOTE(review): the '' row lookup presumably fails if no empty question exists — confirm.
data = go.Bar(x=['Training dataset','Testing dataset'], 
              y=[df_pivot.loc['','1'], 
                df_pivot.loc['','2']], 
               name='Counts')
layout = go.Layout(
    title='Number of Empty Questions',
    xaxis=dict(
        title='dataset'
    ),
    yaxis=dict(
        title='Numbers of questions'
    )
)
iplot(go.Figure(data=[data], layout=layout))


In [None]:
# df_pivot = training_data.pivot_table(values='id', index=['is_duplicate'],
#                fill_value = 0,
#                aggfunc='count')

In [None]:

# Number of most-frequent questions to plot; later cells reuse this value.
top_n = 50

# question_val_cnt = df_pivot['1'][df_pivot['1']>0].sort_values(ascending=False)
# Occurrence count of every distinct training question (dataset code 1).
question_val_cnt =  df_all.q[df_all.dataset == 1].value_counts()


question_cnt = question_val_cnt[:top_n]

# Top panel: the top_n questions by rank; the question text is attached as hover text.
data1 = go.Bar(x=[i for i in range(len(question_cnt))], 
               y=list(question_cnt), 
               text = ["{}".format(i) for i in question_cnt.index ],
               name='Counts')

# How many distinct questions occur exactly k times, for each k.
appearance_cnt = pd.Series(data=question_val_cnt).value_counts() 

# Bottom panel: distribution of occurrence counts.
data2 = go.Bar(x=appearance_cnt.index, 
               y=appearance_cnt, 
               name='Counts')

fig = tools.make_subplots(rows=2, cols=1,
                          subplot_titles=('Most frequent questions', 
                                                          'Appearance Count'))
fig.append_trace(data1, 1, 1)
fig.append_trace(data2, 2, 1)


fig['layout']['xaxis1'].update(title='questions')
fig['layout']['yaxis1'].update(title='Count')

fig['layout']['xaxis2'].update(title='Number of occurences of question')
# Log scale on the y axis of the second panel.
fig['layout']['yaxis2'].update(title='Number of questions (log)',
                               type='log')


fig['layout'].update(title='Training Dataset')

iplot(fig)

In [None]:
# The ten most frequent questions from the counts computed in the previous cell.
question_cnt[:10]

In the training and testing datasets, many questions appear multiple times. In this section, we analyze how many times each question appears in each of the following datasets:

- training dataset
- testing dataset
- training + testing dataset

### Observations - Training dataset

In training dataset, the top frequent questions are 

1. weight loss
2. social - Instagram
3. weight loss
4. money - personal
5. social - Instagram
6. job
7. money - public policy
8. education
9. health
10. social - Instagram

If the questions are randomly sampled from Quora, then weight loss and Instagram (social) seem to be the topics that users ask about most.

----

In [None]:
# Occurrence count of every distinct testing question (dataset code 2).
question_val_cnt =  df_all.q[df_all.dataset == 2].value_counts()

question_cnt = question_val_cnt[:top_n]

# Top panel: the top_n questions by rank; the question text is attached as hover text.
data1 = go.Bar(x=[i for i in range(len(question_cnt))], 
               y=list(question_cnt), 
               text = ["{}".format(i) for i in question_cnt.index ],
               name='Counts')


# How many distinct questions occur exactly k times, for each k.
appearance_cnt = pd.Series(data=question_val_cnt).value_counts() 

# Bottom panel: distribution of occurrence counts.
data2 = go.Bar(x=appearance_cnt.index, 
               y=appearance_cnt, 
               name='Counts')

fig = tools.make_subplots(rows=2, cols=1,
                          subplot_titles=('Most frequent questions', 
                                                          'Appearance Count'))
fig.append_trace(data1, 1, 1)
fig.append_trace(data2, 2, 1)


fig['layout']['xaxis1'].update(title='questions')
fig['layout']['yaxis1'].update(title='Count')

fig['layout']['xaxis2'].update(title='Number of occurences of question')
# Log scale on the y axis of the second panel.
fig['layout']['yaxis2'].update(title='Number of questions (log)',
                               type='log')


fig['layout'].update(height=1000, width=800,title='Testing Dataset')

iplot(fig)

In [None]:
# The fifteen most frequent questions from the counts computed in the previous cell.
question_cnt[:15]

### Observations - Testing dataset

In the testing dataset, the top questions are meaningless. Most of them are WH-word questions without a noun phrase referring to a subject or object. In addition, they are very short, containing only one or a few words, and several don't have a question mark (?). Only #10 has a subject: "I".

Apparently, these single WH-word questions are not valid questions on Quora. It is likely that these questions were added to the test dataset to prevent "cheating" (i.e. overfitting). They are "noise" added to the dataset to test the generalization capability of the classification model.

From these observations, we could use the word count of a question and its punctuation (e.g. whether the question contains a question mark) as features.

-----

In [None]:
# Occurrence count of every distinct question across training and testing together.
question_val_cnt =  df_all.q.value_counts()


question_cnt = question_val_cnt[:top_n]

# Top panel: the top_n questions by rank; the question text is attached as hover text.
data1 = go.Bar(x=[i for i in range(len(question_cnt))], 
               y=list(question_cnt), 
               text = ["{}".format(i) for i in question_cnt.index ],
               name='Counts')


# How many distinct questions occur exactly k times, for each k.
appearance_cnt = pd.Series(data=question_val_cnt).value_counts() 

# Bottom panel: distribution of occurrence counts.
data2 = go.Bar(x=appearance_cnt.index, 
               y=appearance_cnt, 
               name='Counts')

fig = tools.make_subplots(rows=2, cols=1,
                          subplot_titles=('Most frequent questions', 
                                                          'Appearance Count'))
fig.append_trace(data1, 1, 1)
fig.append_trace(data2, 2, 1)


fig['layout']['xaxis1'].update(title='questions')
fig['layout']['yaxis1'].update(title='Count')

fig['layout']['xaxis2'].update(title='Number of occurences of question')
# Log scale on the y axis of the second panel.
fig['layout']['yaxis2'].update(title='Number of questions (log)',
                               type='log')


fig['layout'].update(height=1000, width=800, title='Training+Testing Dataset')

iplot(fig)

In [None]:
# The fifteen most frequent questions from the counts computed in the previous cell.
question_cnt[:15]

### Observations - Training+Testing dataset

WH-words occupy the top rankings. In addition, "What", "How", etc. appear only in the testing dataset. The intuition is that we should examine the syntactic validity and grammar of the questions. We could use dependency parsing to analyze the sentence structure and the relationships among words.

--------

In [None]:
# Preview the per-question character counts.
df_all['q_len'].head()

In [None]:
# Both traces share the same normalisation and transparency so they can be overlaid.
shared_trace_style = dict(histnorm='probability', opacity=0.7)

is_train = df_all.dataset == 1
is_test = df_all.dataset == 2

train_q_len = go.Histogram(x=df_all.q_len[is_train], name='train data', **shared_trace_style)
test_q_len = go.Histogram(x=df_all.q_len[is_test], name='test data', **shared_trace_style)

data = [train_q_len, test_q_len]

layout = go.Layout(
    title='Normalized histogram of character count in questions',
    xaxis={'title': 'Number of characters'},
    yaxis={'title': 'Probability'},
)
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='overlaid histogram')

In [None]:
def nlp_parse(q):
    """Parse one question with the module-level spaCy pipeline `nlp`.

    Returns a 9-tuple:
    (token texts, lemmas, coarse POS tags, fine-grained tags, dependency labels,
     stop-word flags, token count, findSVOs(doc) result, [(entity label, entity text), ...]).
    The first six items are lists with one entry per token.
    """
    doc = nlp(q)

    token = [w.text for w in doc]
    lemma = [w.lemma_ for w in doc]
    pos = [w.pos_ for w in doc]
    tag = [w.tag_ for w in doc]
    dep = [w.dep_ for w in doc]
    stop = [w.is_stop for w in doc]

    word_cnt = len(token)
    svo = findSVOs(doc)
    ents = [(e.label_, e.text) for e in doc.ents]
    return token, lemma, pos, tag, dep, stop, word_cnt, svo, ents

In [None]:
# Work on an independent copy of the training rows only (dataset code 1).
df_ = df_all[(df_all['dataset'] == 1) ].copy()

df_.head()
len(df_)

In [None]:

# Run nlp_parse on every cleaned question and transpose the resulting 9-tuples
# (zip(*...)) so that each tuple position becomes its own column.
df_['token'], df_['lemma'], df_['pos'], \
df_['tag'], df_['dep'], df_['stop'], \
df_['word_cnt'], df_['svo'], df_['ents'] = \
         zip(*df_['q'].map(nlp_parse))   

In [None]:
# df_[['q','tag','dep','svo','ents']]

# Show the parsed training rows whose cleaned question is non-empty.
df_.query('(dataset == 1) & (q_len >0)')

In [None]:
def _share(questions, predicate):
    """Return the fraction of `questions` for which `predicate(text)` is truthy."""
    return np.mean(questions.apply(lambda text: 1 if predicate(text) else 0))


print('Total number of')

print('\t question pairs for training: {}'.format(len( training_data )))
print('\t duplicate question pairs: {:.2%}'.format(training_data['is_duplicate'].mean()))

print('####################################################')

# Pool the ids of both questions of every pair to count distinct / repeated questions.
question_ids = pd.Series( training_data['qid1'].tolist() + training_data['qid2'].tolist() )
print('Total number of unique questions in the training data: {}'.format( len(np.unique(question_ids)) ))
print('Number of questions that appear multiple times: {}'.format( np.sum(question_ids.value_counts() > 1 )))

print('####################################################')

training_questions = pd.concat([training_data['question1'], training_data['question2']],
                               axis=0, ignore_index=True)

testing_questions = pd.concat([testing_data['question1'], testing_data['question2']],
                              axis=0, ignore_index=True)

# (label, predicate) for each surface feature reported below. The regexes are raw
# strings so that "\d" is not parsed as an (invalid) string escape sequence.
surface_checks = [
    ('question marks', lambda text: '?' in text),
    ('[math] tags', lambda text: '[math]' in text),
    ('full stops', lambda text: '.' in text),
    ('numbers', lambda text: re.search(r'\d+', text)),
    ('Capital letters', lambda text: re.search(r'[A-Z]', text)),
    ('capitalised first letters', lambda text: re.search(r'^[A-Z]', text)),
]

print('Training questions with')
for label, predicate in surface_checks:
    print('\t {}: {:.2%}'.format(label, _share(training_questions, predicate)))

# 1 for an empty question, 0 otherwise, so sum = count and mean = share.
empty_q = training_questions.apply(lambda x: 0 if len(x) else 1)
print('\t empty question: {}, {:.4%}'.format(np.sum(empty_q), np.mean(empty_q)))
print('####################################################')


----------------

# Word Share


In [None]:
def word_share(q1, q2):
    """Return the set of items that occur in both `q1` and `q2`."""
    return set(q1) & set(q2)
    
# Attach the set of shared tokens to every question pair of each split.
# NOTE(review): `df_train`, `df_test` and their `q1_token` / `q2_token` columns are
# not created anywhere in the visible notebook — confirm where they come from
# before running this cell.
df_train['word_share'] = df_train.apply(lambda x: word_share(q1 = x['q1_token'], q2 = x['q2_token']), axis=1)
df_test['word_share'] = df_test.apply(lambda x: word_share(q1 = x['q1_token'], q2 = x['q2_token']), axis=1)

In [None]:
####################################################

# Pool question1 and question2 of each split into one Series of strings.
training_questions = pd.Series( training_data['question1'].tolist() + training_data['question2'].tolist() ).astype(str)
testing_questions  = pd.Series( testing_data['question1'].tolist()  + testing_data['question2'].tolist() ).astype(str)

# Word count = number of pieces after splitting on single spaces
# (so an empty question counts as 1, and consecutive spaces add empty pieces).
training_distribution = training_questions.apply(lambda x: len(x.split(' ')))
testing_distribution  = testing_questions.apply(lambda x: len(x.split(' ')))

####################################################

# Overlay the two normalised histograms; the testing one is semi-transparent.
# NOTE(review): `normed` was replaced by `density` in later Matplotlib releases —
# confirm against the installed version.
plt.hist (
          x      = training_distribution, 
          bins   = 50, 
          range  = [0, 50], 
          color  = 'green', 
          normed = True, 
          label  = 'training_data'
         )

plt.hist (
          x      = testing_distribution, 
          bins   = 50, 
          range  = [0, 50], 
          color  = 'red', 
          normed = True, 
          alpha  = 0.5, 
          label  = 'testing_data'
         )

plt.title (
           s        = 'Normalised histogram of word count in questions', 
           fontsize = 15
          )

plt.xlabel (
            s        = 'Number of words', 
            fontsize = 15
           )

plt.ylabel (
            s        = 'Probability', 
            fontsize = 15
           )

plt.legend()

In [None]:



####################################################

def word_match_simple_count(row, stop_words=None):
    """Return the share of non-stop words that two questions have in common.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'question1' and 'question2' (e.g. a DataFrame row
        as a Series, or a dict). Values are converted with str().
    stop_words : container of str, optional
        Lower-case words to ignore. Defaults to the notebook-level `nltk_stops`
        set (the original code referenced an undefined name, `stops`).

    Returns
    -------
    0 if either question has no non-stop words; otherwise
    2 * |shared words| / (|words in question1| + |words in question2|),
    where words are distinct, lower-cased, whitespace-separated tokens.
    """
    if stop_words is None:
        stop_words = nltk_stops

    def _content_words(text):
        # Distinct lower-cased tokens that are not stop words.
        return {word for word in str(text).lower().split() if word not in stop_words}

    question1_words = _content_words(row['question1'])
    question2_words = _content_words(row['question2'])

    if not question1_words or not question2_words:
        return 0

    # Each shared word is counted once from each side, hence the factor 2.
    shared_words = question1_words & question2_words
    return 2 * len(shared_words) / (len(question1_words) + len(question2_words))

####################################################

# Score every training pair. The scorer indexes each row by column name, so the
# rows must arrive as Series (the default); raw=True would pass plain ndarrays,
# which cannot be indexed with 'question1' / 'question2'.
training_data_word_match = training_data.apply (
                                                func = word_match_simple_count, 
                                                axis = 1
                                               )

# Overlay the score distributions of non-duplicate and duplicate pairs.
plt.hist (
          x      = training_data_word_match[training_data['is_duplicate'] == 0], 
          bins   = 20, 
          normed = True, 
          label  = 'Not Duplicate'
         )

plt.hist ( 
          x      = training_data_word_match[training_data['is_duplicate'] == 1], 
          bins   = 20, 
          normed = True, 
          alpha  = 0.7, 
          label  = 'Duplicate'
         )

plt.title (
           s        = 'Label distribution over word_match_share', 
           fontsize = 15
          )

plt.xlabel (
            s        = 'word_match_share', 
            fontsize = 15
           )

plt.legend()

## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Toy TF-IDF demonstration on a hand-written count matrix (6 rows x 3 columns);
# the question data is not used here yet.
transformer = TfidfTransformer(smooth_idf=False)

transformer 

#training_questions = pd.Series( training_data['question1'].tolist() + training_data['question2'].tolist() ).astype(str)
#testing_questions  = pd.Series( testing_data['question1'].tolist()  + testing_data['question2'].tolist() ).astype(str)

counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

# Fit on the toy counts and transform them in one step.
tfidf = transformer.fit_transform(counts)

# Convert the result to a dense array for display.
tfidf.toarray() 

#print tf.get_feature_names()

#print len(training_questions)



#print tf.get_feature_names()[200:210]

In [None]:
# Label distribution over word_order_similarity

In [None]:
# Label distribution over semantic_similarity
# http://sujitpal.blogspot.ca/2014/12/semantic-similarity-for-short-sentences.html

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# tfidf - rare words
# from sklearn.feature_extraction.text import TfidfTransformer
# https://chisqr.wordpress.com/2017/07/03/classifying-duplicate-questions-with-tensorflow/

In [None]:
# from difflib import SequenceMatcher

In [None]:
# from nltk.corpus import wordnet as wn
# nltk.word_tokenize

In [None]:
# https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question/blob/master/feature_engineering.py

## POS Tag, Lemma, Dependency Parsing Analysis

In [None]:
# Load the 'en_core_web_md' spaCy model and rebind `nlp` for this section.
nlp = spacy.load('en_core_web_md')

In [None]:
# Preview the first and last rows of the training split.
training_data.head()
training_data.tail()

In [None]:
# Preview the first and last rows of the testing split.
testing_data.head()
testing_data.tail()

### 1. Combine training and test data, and remove duplicated questions 

In [None]:
# Stack all four question columns into ONE Series of question text.
# NOTE: this rebinds `df_all`, which earlier in the notebook held a DataFrame
# with id/question/dataset columns; cells above will not work after this point
# without re-running them.
df_all = pd.concat([training_data.question1, training_data.question2, 
                    testing_data.question1, testing_data.question2], 
                   axis =0, ignore_index = True) 

# Renumber the index 0..n-1 in place.
df_all.reset_index(drop=True, inplace = True)

In [None]:
# Preview the first entries of df_all.
df_all.head()

In [None]:
# Summary statistics for df_all.
df_all.describe(include='all')

In [None]:
# Keep the first occurrence of every question text and renumber the rows 0..n-1.
df_no_dup = df_all.drop_duplicates().reset_index(drop=True)
df_no_dup.describe(include='all')

##### Extract Named Entity information



In [None]:
# df[361520:361530]

# for row in tqdm(range(361557,361530)):
#     doc = nlp(unicode(df[row], errors='ignore')) 

##  Warning, the following code block takes 3 hours to run

In [None]:
from tqdm import tqdm
from collections import defaultdict
# entity label -> entity text -> occurrence count (missing entries start at 0).
ents_dict = defaultdict(lambda : defaultdict(int))

df = df_no_dup
iter_len = len(df)
# Parse every non-empty unique question and tally its named entities by label.
for row in tqdm(range(0,iter_len)):
    try:
        if len(df[row]) > 0:
            doc = nlp(df[row]) 
            for ent in doc.ents:
                ents_dict[ent.label_][ent.text] += 1
    except TypeError:
        # Report the offending row (index and value) and continue with the next one.
        print(row, df[row])
        

In [None]:
ents_dict.keys()

# Entity texts containing any of these characters are discarded.
# Built once instead of on every inner-loop iteration.
forbidden_chars = set('[]~!@#$%^&*()_+{}":;\'+-<>?')

# Flatten the per-label tallies into one set of "clean" entity texts.
ents_set = set()
for label in ents_dict.keys():
    for text in ents_dict[label].keys():
        if not forbidden_chars.intersection(text):
            ents_set.add(text)

# ents_dict
len(ents_set)

# Remove 'US'; discard() is a no-op when 'US' was never extracted,
# whereas remove() would raise KeyError and abort the cell.
ents_set.discard('US')
ents_dict

In [None]:
# Check whether the lower-case spelling 'india' is in the entity set.
'india' in ents_set

In [None]:
def preprocess_ent(sent, entities=None):
    """Rewrite case-insensitive mentions of known entities in their stored spelling.

    Parameters
    ----------
    sent : str
        The sentence to normalise.
    entities : iterable of str, optional
        Entity spellings to restore. Defaults to the notebook-level `ents_set`.

    Returns
    -------
    str
        `sent` with every whole-word, case-insensitive occurrence of each entity
        replaced by that entity's spelling (e.g. 'india' -> 'India').

    Notes
    -----
    * The entity text is escaped with re.escape, so characters such as '.' or '|'
      are matched literally instead of being interpreted as regex syntax.
    * re.X (verbose mode) is not used: it strips whitespace from the pattern,
      which prevented multi-word entities such as 'New York' from matching.
    * Lookarounds are used instead of '\\b' so entities that start or end with a
      non-word character (e.g. 'U.S.') can still match.
    """
    if entities is None:
        entities = ents_set

    sent_new = sent
    for ent in entities:
        if not ent:
            # An empty pattern would match at every position; nothing to restore.
            continue
        pattern = r'(?<!\w)' + re.escape(ent) + r'(?!\w)'
        # A callable replacement inserts `ent` verbatim (no backslash processing).
        sent_new = re.sub(pattern, lambda _match, ent=ent: ent, sent_new,
                          flags=re.IGNORECASE)
    return sent_new


In [None]:
from tqdm import tqdm

# Try the entity normaliser on the first two training rows only.
# Slice first, then copy, so `df` is an independent frame and adding a column
# does not write into a slice of another object.
df = training_data[:2].copy()

# Register `progress_apply` / `progress_map` on pandas objects; without this
# call the `progress_apply` attribute used below does not exist.
# (`tqdm_gui`, `tqdm_notebook` and optional kwargs can be used the same way.)
tqdm.pandas(desc='preprocess_ent')

df['sent1'] = df.question1.progress_apply(preprocess_ent)
# df['sent2'] = df.question2.apply(preprocess_ent)

In [None]:
# Display the current contents of df.
df

In [None]:
# Print the value at column position 6 of every row.
# NOTE(review): presumably this is the 'sent1' column — verify the column order.
# print is a function on the Python 3 kernel this notebook runs on; .iloc makes
# the positional lookup explicit.
for i, row in df.iterrows():
    print(row.iloc[6])

In [None]:
# Take the question1 value stored under index label 0 as a sample sentence.
sent = df.question1[0]
sent

In [None]:
# Single-entity check: rewrite any casing of the whole word "india" in `sent` as 'India'.
re.sub('\\b'+'India'+'\\b', 'India', sent, flags=re.IGNORECASE|re.MULTILINE|re.X)