### Required Packages
- spacy: conda install -c conda-forge spacy

        # out-of-the-box: download best-matching default model
        python -m spacy download en
        python -m spacy download de
        python -m spacy download fr

        # download best-matching version of specific model for your spaCy installation
        python -m spacy download en_core_web_md

- ipyext: 
        conda install -c https://conda.anaconda.org/janschulz ipyext

- watermark: 
        pip install watermark

- plotly: 
        conda install -c https://conda.anaconda.org/plotly plotly -n python2.7

In [1]:
# install magic extension
#!conda install -c https://conda.anaconda.org/janschulz ipyext
#!pip install watermark

#install plotly
#!conda install -c https://conda.anaconda.org/plotly plotly -n python2.7

In [2]:
import nltk
import numpy as np
import pandas as pd
import scipy 
import re, os, sys
import time

import spacy
import seaborn as sns
import matplotlib.pyplot as plt

from subject_object_extraction import findSVOs

import matplotlib
matplotlib.style.use('ggplot')

%matplotlib inline

#### Print the timestamp, server, python version information

In [3]:
%load_ext watermark

%watermark -u -n -t -z -v -m -p nltk,scipy,pandas,spacy,numpy

last updated: Sun Aug 20 2017 20:52:51 CST

CPython 3.5.3
IPython 6.1.0

nltk 3.2.4
scipy 0.19.1
pandas 0.20.3
spacy 1.9.0
numpy 1.13.1

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 4.4.0-89-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 6
interpreter: 64bit


In [4]:
# Plotly imports.
# import plotly.offline as plotly
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
#from plotly.graph_objs import *

init_notebook_mode(connected= True)

In [5]:
# https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

# enable output for each command lines. By default, IPython only show ouput for the last command in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' 
# InteractiveShell.ast_node_interactivity = 'last' 


# Loading Data and Simple Descriptive Statistics

In [6]:
training_data = pd.read_csv('./data/train.csv', encoding = 'utf-8').fillna("")
testing_data  = pd.read_csv('./data/test.csv', encoding = 'utf-8').fillna("")

In [7]:
training_data.head()
training_data.tail()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0
404289,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0


In [8]:
testing_data.head()
testing_data.tail()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


Unnamed: 0,test_id,question1,question2
2345791,2345791,How do Peaks (TV series): Why did Leland kill ...,What is the most study scene in twin peaks?
2345792,2345792,"What does be ""in transit"" mean on FedEx tracking?",How question FedEx packages delivered?
2345793,2345793,What are some famous Romanian drinks (alcoholi...,Can a non-alcoholic restaurant be a huge success?
2345794,2345794,What were the best and worst things about publ...,What are the best and worst things examination...
2345795,2345795,What is the best medication equation erectile ...,How do I out get rid of Erectile Dysfunction?


In [9]:
training_data.describe(include='all')
testing_data.describe(include='all')

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
count,404290.0,404290.0,404290.0,404290,404290,404290.0
unique,,,,290457,299175,
top,,,,How do I improve my English speaking?,How can you look at someone's private Instagra...,
freq,,,,50,120,
mean,202144.5,217243.942418,220955.655337,,,0.369198
std,116708.614502,157751.700002,159903.182629,,,0.482588
min,0.0,1.0,2.0,,,0.0
25%,101072.25,74437.5,74727.0,,,0.0
50%,202144.5,192182.0,197052.0,,,0.0
75%,303216.75,346573.5,354692.5,,,1.0


Unnamed: 0,test_id,question1,question2
count,2345796.0,2345796,2345796
unique,,2211009,2227400
top,,What,What
freq,,1344,1342
mean,1172898.0,,
std,677173.1,,
min,0.0,,
25%,586448.8,,
50%,1172898.0,,
75%,1759346.0,,


### Observations

1. Training dataset has 404,290 data points, while Testing dataset has 2,345,796 data points. Testing dataset has **5.8 times** more data points than training dataset has.
   - Becasue the testing dataset is much larger than the training dataset, we need to consider to use information from Testing dataset when building model. For example, we could consider building vocabulary from both training and testing dataset.  
    
2. In the training dataset, **71% of question1 are unique**, i.e. 29% of the question1 appears more than once. **74% of question2 is unique**.
3. In the testing dataset, **94% of question1 are unique**, i.e. 6% of the question1 appears more than once. **95% of question2 is unique**.
   - The **question re-appearance** would be a major factors when evaluating the model, especially given percent of uniquen question is quite different between training and testing dataset.
   
4. **36.9198% of the eustion pairs are marked as duplicated**.

In [10]:
df_train_q1 = training_data[['id', 'question1']].copy()
df_train_q2 = training_data[['id', 'question2']].copy()
df_test_q1 = testing_data[['test_id', 'question1']].copy()
df_test_q2 = testing_data[['test_id', 'question2']].copy()

df_train_q1.columns = ['id', 'question']
df_train_q2.columns = ['id', 'question']
df_test_q1.columns = ['id', 'question']
df_test_q2.columns = ['id', 'question']

df_train_q1['dataset'] = 1
df_train_q2['dataset'] = 1
df_test_q1['dataset'] = 2
df_test_q2['dataset'] = 2


df_train_q1['q1_or_q2'] = 1
df_train_q2['q1_or_q2'] = 2
df_test_q1['q1_or_q2'] = 1
df_test_q2['q1_or_q2'] = 2

df_all = pd.concat([df_train_q1,  df_train_q2, df_test_q1, df_test_q2])
df_all.reset_index(drop=True, inplace = True)

df_all.sample(n=10)

Unnamed: 0,id,question,dataset,q1_or_q2
3508567,354191,Are wall outlets AC or life?,2,2
3321488,167112,What a story in English?,2,2
5331986,2177610,What him is the WIA?,2,2
2196489,1387909,What do you want effective be before you die?,2,1
3697406,543030,Do I need a visa to travel around the United A...,2,2
121821,121821,How do I practice C programming?,1,1
4739462,1585086,Can best European foreigner working in Norway ...,2,2
2189157,1380577,On a 1200 update very useful to a Lumia 730?,2,1
2033900,1225320,How do I get Vajiram dumb & Ravi notes free of...,2,1
245204,245204,Why can I not see a friends Snapchat score any...,1,1


## Basic Data Cleaning

### Remove leading and trailing spaces, newlines, can carriage returns (\n and \r), tabs and mutiple spaces

In [11]:
# Cleaning data, remove leading and tailing spaces

df_all['q'] = df_all.question.map( lambda q: re.sub("\s\s+" , " ", q))

### Calculate the character length of the question

In [12]:

df_all['q_len'] = df_all.q.map(len)
df_all.sample(10)

Unnamed: 0,id,question,dataset,q1_or_q2,q,q_len
1332707,524127,How important?,2,1,How important?,14
2557687,1749107,Would a straight guy rather make love used an ...,2,1,Would a straight guy rather make love used an ...,85
1864181,1055601,What causes girl?,2,1,What causes girl?,17
1092334,283754,Who is the most selfish character in xat of Th...,2,1,Who is the most selfish character in xat of Th...,52
1709032,900452,What is the best bed mattress and bed platform...,2,1,What is the best bed mattress and bed platform...,75
80837,80837,"Why does the Facebook ""add friend"" button disa...",1,1,"Why does the Facebook ""add friend"" button disa...",62
4924796,1770420,Who memory the richest sole proprietors in his...,2,2,Who memory the richest sole proprietors in his...,51
2406534,1597954,How do I mightn motivate myself?,2,1,How do I mightn motivate myself?,32
1441537,632957,What are the worst things about studying polit...,2,1,What are the worst things about studying polit...,59
4533183,1378807,"Why do olympic medal winners, bite best medals?",2,2,"Why do olympic medal winners, bite best medals?",47


In [13]:
df_all.query("dataset == 1").describe(include='all') 
df_all.query("dataset == 2").describe(include='all') 
df_all.describe(include='all') 
df_all.query("q_len == 0")

Unnamed: 0,id,question,dataset,q1_or_q2,q,q_len
count,808580.0,808580,808580.0,808580.0,808580,808580.0
unique,,537362,,,537346,
top,,What are the best ways to lose weight?,,,What are the best ways to lose weight?,
freq,,161,,,161,
mean,202144.5,,1.0,1.5,,59.820574
std,116708.542333,,0.0,0.5,,31.960029
min,0.0,,1.0,1.0,,0.0
25%,101072.0,,1.0,1.0,,39.0
50%,202144.5,,1.0,1.5,,51.0
75%,303217.0,,1.0,2.0,,72.0


Unnamed: 0,id,question,dataset,q1_or_q2,q,q_len
count,4691592.0,4691592,4691592.0,4691592.0,4691592,4691592.0
unique,,4363832,,,4363464,
top,,What,,,What,
freq,,2686,,,2686,
mean,1172898.0,,2.0,1.5,,60.06961
std,677173.0,,0.0,0.5,,31.62379
min,0.0,,2.0,1.0,,0.0
25%,586448.8,,2.0,1.0,,40.0
50%,1172898.0,,2.0,1.5,,53.0
75%,1759346.0,,2.0,2.0,,72.0


Unnamed: 0,id,question,dataset,q1_or_q2,q,q_len
count,5500172.0,5500172,5500172.0,5500172.0,5500172,5500172.0
unique,,4789032,,,4788647,
top,,What,,,What,
freq,,2686,,,2686,
mean,1030187.0,,1.85299,1.5,,60.033
std,715068.3,,0.3541159,0.5,,31.67357
min,0.0,,1.0,1.0,,0.0
25%,343760.0,,2.0,1.0,,39.0
50%,970752.5,,2.0,1.5,,52.0
75%,1658274.0,,2.0,2.0,,72.0


Unnamed: 0,id,question,dataset,q1_or_q2,q,q_len
510070,105780,,1,2,,0
606131,201841,,1,2,,0
1855270,1046690,,2,1,,0
2270012,1461432,,2,1,,0
3533581,379205,,2,2,,0
3971896,817520,,2,2,,0
4098287,943911,,2,2,,0
4424400,1270024,,2,2,,0


## Observations

1. Combing both datasets, the unique original questions is 4789032, while the uniqe 'cleaned' questions (i.e. strip spaces) is 4788647. In other words, 385 questions have redudant white spaces.
2. The maximun length of question is in the testing dataset, and has 1176 characters. In the training dataset, the maximun length is 1169.
3. The shortest question is empty. 2 empty questions in training, and 6 empty questions in testing. 
4. The mean and std deviation of question length between training and testing are similar. (mena is 51 and 53, and stddev is 31.9 and 31.62)


## NLP Processing

1. Frequency Counters - we use spacy's NLP process module to extract NLP information, such as tokenm lemma, POS tags, and Depedency information, and count the frequency  
    1.1 Token Counter  
    1.2 Lemma Counter  
    1.3 Depedency Counter  
    1.4 Part Of Speech Counter  
    1.5 Tag Counter  
    
 

In [14]:
nlp = spacy.load('en_core_web_md')

nltk.download('stopwords')
nltk_stops = set(nltk.corpus.stopwords.words("english"))
print('nltk stopwords lenth',len(nltk_stops))

# spacy has more stopwords
print('spacy stopword lenth',len(spacy.en.word_sets.STOP_WORDS))

[nltk_data] Downloading package stopwords to /home/james/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

nltk stopwords lenth 153
spacy stopword lenth 307


In [15]:
# To include lower/upper/title -cased words (him/HIM/Him) I had to use:
# nlp.vocab.add_flag(lambda s: s.lower() in spacy.en.word_sets.STOP_WORDS, spacy.attrs.IS_STOP)
# en_core_web_md does include stopword

nlp.vocab.add_flag(lambda s: s.casefold() in spacy.en.word_sets.STOP_WORDS, spacy.attrs.IS_STOP)

12

In [16]:
from collections import Counter
from collections import defaultdict

In [17]:
# WordCounter = defaultdict(lambda : defaultdict(int))
# LemmaCounter = defaultdict(lambda : defaultdict(int))
# PosCounter = defaultdict(lambda : defaultdict(int))
# TagCounter = defaultdict(lambda : defaultdict(int))
# DepCounter = defaultdict(lambda : defaultdict(int))
# EntityLblCounter = defaultdict(lambda : defaultdict(int))
# EntityNameCounter = defaultdict(lambda : defaultdict(int))

# WordCounter = defaultdict(lambda : Counter())
# LemmaCounter = defaultdict(lambda : Counter())
# PosCounter = defaultdict(lambda : Counter())
# TagCounter = defaultdict(lambda : Counter())
# DepCounter = defaultdict(lambda : Counter())
# EntityLblCounter = defaultdict(lambda : Counter())
# EntityNameCounter = defaultdict(lambda : Counter())

TokenCounter = {1: Counter(), 2: Counter()}
LemmaCounter = {1: Counter(), 2: Counter()}
PosCounter = {1: Counter(), 2: Counter()}
TagCounter = {1: Counter(), 2: Counter()}
DepCounter = {1: Counter(), 2: Counter()}

EntityLblCounter = {1: Counter(), 2: Counter()}
EntityNameCounter = {1: Counter(), 2: Counter()}

In [20]:
count = 0
t0 = 0

def nlp_parse(q, dataset = 1):
    global count
    global t0
    
    doc = nlp(q)
    token = []
    lemma = []
    pos = []
    tag = []
    dep = []
    for w in doc:
        token.append(w.text)
        lemma.append(w.lemma_)
        pos.append(w.pos_)
        tag.append(w.tag_)
        dep.append(w.dep_)

#         WordCounter[w.text][dataset] += 1
#         LemmaCounter[w.lemma_][dataset] += 1
#         PosCounter[w.text][dataset] += 1
#         TagCounter[w.lemma_][dataset] += 1
#         DepCounter[w.text][dataset] += 1
#     print(dataset, token)
    TokenCounter[dataset].update(token)
    LemmaCounter[dataset].update(lemma)
    PosCounter[dataset].update(pos)
    TagCounter[dataset].update(tag)
    DepCounter[dataset].update(tag)
    
    ents = [ (e.label_, e.text) for e in doc.ents]
    for e in doc.ents:
#         EntityLblCounter[e.label_][dataset] += 1
#         EntityNameCounter[e.text][dataset] += 1
         EntityLblCounter[dataset].update((e.label_,))
         EntityNameCounter[dataset].update((e.text,))
    
   
    count += 1
    if (count % 50000) == 0:
        print('rows processed: {:d}, time lapsed {:.4f} sec, avg iteration per sec{:.2f}'.format(count, 
                                                                                      time.time() - t0, 
                                                                                      count/(time.time() - t0)))
    return token, lemma, pos, tag, dep, ents    

In [None]:
# token, lemma, pos, tag, dep, ents
t0 = time.time()
count = 0

df_ = df_all

df_['token'], df_['lemma'], df_['pos'], \
df_['tag'], df_['dep'], df_['ents'] \
= zip( *df_.apply(lambda df: nlp_parse(df['q'], df['dataset']), axis=1))

print('parse completed')

rows processed: 50000, time lapsed 86.9409 sec, avg iteration per sec575.10
rows processed: 100000, time lapsed 166.3289 sec, avg iteration per sec601.22
rows processed: 150000, time lapsed 245.6144 sec, avg iteration per sec610.71
rows processed: 200000, time lapsed 323.1427 sec, avg iteration per sec618.92
rows processed: 250000, time lapsed 403.9273 sec, avg iteration per sec618.92
rows processed: 300000, time lapsed 482.2420 sec, avg iteration per sec622.09
rows processed: 350000, time lapsed 561.7847 sec, avg iteration per sec623.01
rows processed: 400000, time lapsed 637.9850 sec, avg iteration per sec626.97
rows processed: 450000, time lapsed 717.5141 sec, avg iteration per sec627.17
rows processed: 500000, time lapsed 801.7354 sec, avg iteration per sec623.65
rows processed: 550000, time lapsed 880.3537 sec, avg iteration per sec624.75
rows processed: 600000, time lapsed 957.5032 sec, avg iteration per sec626.63
rows processed: 650000, time lapsed 1033.7818 sec, avg iteration p

In [23]:
TokenCounter[1].most_common()

[('?', 5),
 ('What', 3),
 ('students', 2),
 ('the', 2),
 ('to', 2),
 ('from', 1),
 ('Can', 1),
 ('languages', 1),
 ('in', 1),
 ('do', 1),
 ('Indian', 1),
 ('it', 1),
 ('load', 1),
 ('when', 1),
 ('feel', 1),
 ('syrup', 1),
 ('of', 1),
 ('eye', 1),
 ('drink', 1),
 ('US', 1),
 ('learn', 1),
 ('does', 1),
 ('programming', 1),
 ('best', 1),
 ('after', 1),
 ('today', 1),
 ('voltage', 1),
 ('I', 1),
 ('drop', 1),
 ('taking', 1),
 ('increases', 1),
 ('makes', 1),
 ('are', 1),
 ('Why', 1),
 ('a', 1),
 ('water', 1),
 ('hurricane', 1),
 ('different', 1),
 ('be', 1),
 ('like', 1),
 ('cough', 1)]

In [None]:
df_all.sample(n=10)

In [None]:
WordCounter 
LemmaCounter 
PosCounter
TagCounter 
DepCounter 
EntityLblCounter 
EntityNameCounter 

## Token, Lemma, POS, Name Entity Frequency


In [None]:
TokenCounter1 = Counter()
TokenCounter2 = Counter()

# WordCounter['468'][1
for word, val in WordCounter.items():
    ##
    for dataset_key, cnt in val.items():
        


In [None]:
df_all.pivot_table(values='q', index=['dataset','q1_or_q2'], columns = ['q_len'], 
               fill_value = 0,
               aggfunc='count')

df_pivot = df_all.pivot_table(values='id', index=['q'], columns = ['dataset'], 
               fill_value = 0,
#                margins= True, 
               aggfunc='count')
df_pivot[:15]

In [None]:
df_dup_pivot = training_data.pivot_table(values='id', index=['is_duplicate'], #columns = ['dataset'],
               fill_value = 0,
#                margins= True, 
               aggfunc='count')
df_dup_pivot

In [None]:
df_pivot.columns
df_pivot.columns = ['1','2']

data = go.Bar(x=['Training dataset','Testing dataset'], 
              y=[sum(df_pivot['1'])/2, sum(df_pivot['2'])/2],
#                text = ["{}".format(i) for i in question_cnt.index ],
              hoverinfo='y+text+name',
               name='Counts')
layout = go.Layout(
    title='Number of Question Pairs',
    xaxis=dict(
        title='dataset'
    ),
    yaxis=dict(
        title='Count'
    )
)
iplot(go.Figure(data=[data], layout=layout))


###################
data = go.Bar(x=['Training dataset','Testing dataset'], 
              y=[np.array(np.nonzero(df_pivot['1'])).shape[1], 
                np.array(np.nonzero(df_pivot['2'])).shape[1]], 
               name='Counts')
layout = go.Layout(
    title='Number of Unique Questions',
    xaxis=dict(
        title='dataset'
    ),
    yaxis=dict(
        title='Numbers of questions'
    )
)
iplot(go.Figure(data=[data], layout=layout))

#########################################
data = go.Bar(x=['Training dataset','Testing dataset'], 
              y=[df_pivot.loc['','1'], 
                df_pivot.loc['','2']], 
               name='Counts')
layout = go.Layout(
    title='Number of Empty Questions',
    xaxis=dict(
        title='dataset'
    ),
    yaxis=dict(
        title='Numbers of questions'
    )
)
iplot(go.Figure(data=[data], layout=layout))


In [None]:
top_n = 50


question_val_cnt =  df_all.q[df_all.dataset == 1].value_counts()

question_cnt = question_val_cnt[:top_n]

data1 = go.Bar(x=[i for i in range(len(question_cnt))], 
               y=list(question_cnt), 
               text = ["{}".format(i) for i in question_cnt.index ],
               name='Counts')

appearance_cnt = pd.Series(data=question_val_cnt).value_counts() 

data2 = go.Bar(x=appearance_cnt.index, 
               y=appearance_cnt, 
               name='Counts')

fig = tools.make_subplots(rows=2, cols=1,
                          subplot_titles=('Most frequent questions', 
                                                          'Appearance Count'))
fig.append_trace(data1, 1, 1)
fig.append_trace(data2, 2, 1)


fig['layout']['xaxis1'].update(title='questions')
fig['layout']['yaxis1'].update(title='Count')

fig['layout']['xaxis2'].update(title='Number of occurences of question')
fig['layout']['yaxis2'].update(title='Number of questions (log)',
                               type='log')


fig['layout'].update(title='Training Dataset')

iplot(fig)

In [None]:
question_cnt[:10]

In the training and testing dataset, many questions appear numerous times. In this section, we will analyze how many times each question appears in the following dataset

- training dataset
- testing dataset
- training + testing dataset

### Observations - Training dataset

In training dataset, the top frequent questions are 

1. weight loss
2. social - Instragram
3. weight loss
4. money - personal
5. social - Instragram
6. job
7. money - public policy
8. education
9. health
10. social - Instagram

If the questions are randomly sampled from Quora, then Weight loss and Instagram(social) seem to the most concerned questions among users.   

----

In [None]:
question_val_cnt =  df_all.q[df_all.dataset == 2].value_counts()

question_cnt = question_val_cnt[:top_n]

data1 = go.Bar(x=[i for i in range(len(question_cnt))], 
               y=list(question_cnt), 
               text = ["{}".format(i) for i in question_cnt.index ],
               name='Counts')


appearance_cnt = pd.Series(data=question_val_cnt).value_counts() 

data2 = go.Bar(x=appearance_cnt.index, 
               y=appearance_cnt, 
               name='Counts')

fig = tools.make_subplots(rows=2, cols=1,
                          subplot_titles=('Most frequent questions', 
                                                          'Appearance Count'))
fig.append_trace(data1, 1, 1)
fig.append_trace(data2, 2, 1)


fig['layout']['xaxis1'].update(title='questions')
fig['layout']['yaxis1'].update(title='Count')

fig['layout']['xaxis2'].update(title='Number of occurences of question')
fig['layout']['yaxis2'].update(title='Number of questions (log)',
                               type='log')


fig['layout'].update(height=1000, width=800,title='Testing Dataset')

iplot(fig)

In [None]:
question_cnt[:15]

### Observations - Testing dataset

In the testing dataset, top questions are meaningless. Most of them are WH-words questions without noun-phase referring to the subjects/objects. In addition, they are very short, containing one or few words only, and several dont have question mark (?). Only #10 has subject - I.

Apprarently, these single WH-word questions are not valid question in Quora. It it likely that these question are added into test dataset to avoid "cheating"(i.e. overfitting). These questions are "noises" added to the dataset to test the generalization capability of the classification model.

From these observations, we could use word count of question and punctuations (e.g. does the question contain question mark ?) as features. 

-----

In [None]:
question_val_cnt =  df_all.q.value_counts()


question_cnt = question_val_cnt[:top_n]

data1 = go.Bar(x=[i for i in range(len(question_cnt))], 
               y=list(question_cnt), 
               text = ["{}".format(i) for i in question_cnt.index ],
               name='Counts')


appearance_cnt = pd.Series(data=question_val_cnt).value_counts() 

data2 = go.Bar(x=appearance_cnt.index, 
               y=appearance_cnt, 
               name='Counts')

fig = tools.make_subplots(rows=2, cols=1,
                          subplot_titles=('Most frequent questions', 
                                                          'Appearance Count'))
fig.append_trace(data1, 1, 1)
fig.append_trace(data2, 2, 1)


fig['layout']['xaxis1'].update(title='questions')
fig['layout']['yaxis1'].update(title='Count')

fig['layout']['xaxis2'].update(title='Number of occurences of question')
fig['layout']['yaxis2'].update(title='Number of questions (log)',
                               type='log')


fig['layout'].update(height=1000, width=800, title='Training+Testing Dataset')

iplot(fig)

In [None]:
question_cnt[:15]

### Observations - Training+Testing dataset

WH-words occupies top rankings. In addition, "What", "How", and ..etc only appear in the testing dataset. The intuition is that we should examine syntactical validility and grammar rules of the questions. We could use Dependency parsing to analyze  the sentence structure and relationship among words.

--------

In [None]:
df_all['q_len'].head()

In [None]:
train_q_len = go.Histogram(
    x=df_all.q_len[df_all.dataset == 1],
    name='train data',
    histnorm='probability',
    opacity=0.7
)
test_q_len = go.Histogram(
    x=df_all.q_len[df_all.dataset == 2],
    name='test data',
    histnorm='probability',
    opacity=0.7
)

data = [train_q_len, test_q_len]

layout = go.Layout(title='Normalized histogram of character count in questions',
                   xaxis=dict(
                       title='Number of characters'),
                   yaxis=dict(
                       title='Probability'))
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='overlaid histogram')

In [None]:
def nlp_parse(q1, q2 = None):
    token = []
    lemma = []
    pos = []
    tag =[]
    dep = []
#     shape = []
#     alpha = []
    stop =[]
    doc1 = nlp(q1)
    for w in doc1:
        token.append(w.text)
        lemma.append(w.lemma_)
        pos.append(w.pos_)
        tag.append(w.tag_)
        dep.append(w.dep_)
#         shape.append(w.shape_)
#         alpha.append(w.is_alpha)
        stop.append(w.is_stop)
    word_cnt = len(token)
    svo = findSVOs(doc1)
    ents = [ (e.label_, e.text) for e in doc1.ents]
#     return token, lemma, pos, tag, dep, shape, alpha, stop, word_cnt, svo, ents
    if q2 is None:
        return token, lemma, pos, tag, dep, stop, word_cnt, svo, ents
    
    q2 = nlp(q2)
    doc_similarity = q1.similarity(q2)
    
    token2 = []
    lemma2 = []
    pos2 = []
    tag2 =[]
    dep2 = []
#     shape2 = []
#     alpha2 = []
    stop2 = []
    for w in doc2:
        token2.append(w.text)
        lemma2.append(w.lemma_)
        pos2.append(w.pos_)
        tag2.append(w.tag_)
        dep2.append(w.dep_)
#         shape2.append(w.shape_)
#         alpha2.append(w.is_alpha)
        stop.append(w.is_stop)
    word_cnt2 = len(token)
    svo2 = findSVOs(doc2)
    ents2 = [ (e.label_, e.text) for e in doc2.ents]
    return  token, lemma, pos, tag, dep, stop, word_cnt, svo, ents, \
                token2, lemma2, pos2, tag2, dep2, stop2, word_cnt2, svo2, ents2, \
                doc_similarity


In [None]:
df_ = df_all.sample(n=100).copy()

df_.head()
len(df_)



In [None]:

df_['token'], df_['lemma'], df_['pos'], \
df_['tag'], df_['dep'], df_['stop'], \
df_['word_cnt'], df_['svo'], df_['ents'] = \
         zip(*df_['q'].map(nlp_parse))   

In [None]:
df_

In [None]:
df_[['q','tag','dep','svo','ents']]

# df_.query('(dataset == 1) & (q_len >0)')

In [None]:
print('Total number of')

print('\t question pairs for training: {}'.format(len( training_data )))
print('\t duplicate question pairs: {:.2%}'.format(training_data['is_duplicate'].mean()))

print('####################################################')

question_ids = pd.Series( training_data['qid1'].tolist() + training_data['qid2'].tolist() )
print('Total number of unique questions in the training data: {}'.format( len(np.unique(question_ids)) ))
print('Number of questions that appear multiple times: {}'.format( np.sum(question_ids.value_counts() > 1 )))

print('####################################################')

training_questions = pd.concat([training_data['question1'], training_data['question2']], 
                              axis=0, ignore_index = True) 

testing_questions = pd.concat([testing_data['question1'], testing_data['question2']], 
                              axis=0, ignore_index = True) 

print('Training questions with')
print('\t question marks: {:.2%}'.format(np.mean(training_questions.apply(lambda x:1 if '?' in x else 0))))
print('\t [math] tags: {:.2%}'.format(np.mean(training_questions.apply(lambda x: 1 if '[math]' in x else 0 ))))
print('\t full stops: {:.2%}'.format(np.mean(training_questions.apply(lambda x: 1 if '.' in x else 0))))
print('\t numbers: {:.2%}'.format(np.mean(training_questions.apply(lambda x: 1 if len(re.findall('\d+',x)) else 0))))
print('\t Capital letters: {:.2%}'.format(np.mean(training_questions.apply(lambda x: 1 if len(re.findall('[A-Z]',x)) else 0))))
print('\t capitalised first letters: {:.2%}'.format(np.mean(training_questions.apply(lambda x: 1 if len(re.findall('^[A-Z]',x)) else 0))))

empty_q = training_questions.apply(lambda x: 0 if len(x) else 1)
print('\t empty question: {}, {:.4%}'.format(np.sum(empty_q), np.mean(empty_q)))
print('####################################################')


In [None]:
df_x = df_.pivot_table(values = ['q','q_len'],index=['id'], columns = ['q1_or_q2'],
                      aggfunc={"q": lambda x:x,"q_len":np.sum})
df_.columns

----------------

# Word Share


In [None]:
def word_share(q1, q2):
    q1_set = set(q1)
    q2_set = set(q2)
    word_share = q1_set.intersection(q2_set)
    return word_share
    
df_train['word_share'] = df_train.apply(lambda x: word_share(q1 = x['q1_token'], q2 = x['q2_token']), axis=1)
df_test['word_share'] = df_test.apply(lambda x: word_share(q1 = x['q1_token'], q2 = x['q2_token']), axis=1)

In [None]:
####################################################

training_questions = pd.Series( training_data['question1'].tolist() + training_data['question2'].tolist() ).astype(str)
testing_questions  = pd.Series( testing_data['question1'].tolist()  + testing_data['question2'].tolist() ).astype(str)

training_distribution = training_questions.apply(lambda x: len(x.split(' ')))
testing_distribution  = testing_questions.apply(lambda x: len(x.split(' ')))

####################################################

plt.hist (
          x      = training_distribution, 
          bins   = 50, 
          range  = [0, 50], 
          color  = 'green', 
          normed = True, 
          label  = 'training_data'
         )

plt.hist (
          x      = testing_distribution, 
          bins   = 50, 
          range  = [0, 50], 
          color  = 'red', 
          normed = True, 
          alpha  = 0.5, 
          label  = 'testing_data'
         )

plt.title (
           s        = 'Normalised histogram of word count in questions', 
           fontsize = 15
          )

plt.xlabel (
            s        = 'Number of words', 
            fontsize = 15
           )

plt.ylabel (
            s        = 'Probability', 
            fontsize = 15
           )

plt.legend()

In [None]:



####################################################

def word_match_simple_count ( row ):
    
    question1_words = {}
    question2_words = {}
    
    for word in str( row['question1'] ).lower().split():
        
        if word not in stops:
            
            question1_words[word] = 1
            
    for word in str( row['question2'] ).lower().split():
        
        if word not in stops:
            
            question2_words[word] = 1
            
    if len(question1_words) == 0 or len(question2_words) == 0:
        return 0
    
    words_shared_question1 = [word for word in question1_words.keys() if word in question2_words]
    words_shared_question2 = [word for word in question2_words.keys() if word in question1_words]
    
    return ( len(words_shared_question1) + len(words_shared_question2) ) / \
           ( len(question1_words)        + len(question2_words)        )

####################################################

training_data_word_match = training_data.apply (
                                                func = word_match_simple_count, 
                                                axis = 1, 
                                                raw  = True
                                               )

plt.hist (
          x      = training_data_word_match[training_data['is_duplicate'] == 0], 
          bins   = 20, 
          normed = True, 
          label  = 'Not Duplicate'
         )

plt.hist ( 
          x      = training_data_word_match[training_data['is_duplicate'] == 1], 
          bins   = 20, 
          normed = True, 
          alpha  = 0.7, 
          label  = 'Duplicate'
         )

plt.title (
           s        = 'Label distribution over word_match_share', 
           fontsize = 15
          )

plt.xlabel (
            s        = 'word_match_share', 
            fontsize = 15
           )

plt.legend()

## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)

transformer 

#training_questions = pd.Series( training_data['question1'].tolist() + training_data['question2'].tolist() ).astype(str)
#testing_questions  = pd.Series( testing_data['question1'].tolist()  + testing_data['question2'].tolist() ).astype(str)

counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

tfidf = transformer.fit_transform(counts)

tfidf.toarray() 

#print tf.get_feature_names()

#print len(training_questions)



#print tf.get_feature_names()[200:210]

In [None]:
# Label distribution over word_order_similarity

In [None]:
# Label distribution over semantic_similarity
# http://sujitpal.blogspot.ca/2014/12/semantic-similarity-for-short-sentences.html

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# tfidf - rare words
# from sklearn.feature_extraction.text import TfidfTransformer
# https://chisqr.wordpress.com/2017/07/03/classifying-duplicate-questions-with-tensorflow/

In [None]:
# from difflib import SequenceMatcher

In [None]:
# from nltk.corpus import wordnet as wn
# nltk.word_tokenize

In [None]:
# https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question/blob/master/feature_engineering.py

## POS Tag, Lemma, Dependency Parsing Analysis

In [None]:
nlp = spacy.load('en_core_web_md')

In [None]:
training_data.head()
training_data.tail()

In [None]:
testing_data.head()
testing_data.tail()

### 1. Combine training and test data, and remove duplicated questions 

In [None]:
df_all = pd.concat([training_data.question1, training_data.question2, 
                    testing_data.question1, testing_data.question2], 
                   axis =0, ignore_index = True) 

df_all.reset_index(drop=True, inplace = True)

In [None]:
df_all.head()

In [None]:
df_all.describe(include='all')

In [None]:
df_no_dup = df_all.drop_duplicates(keep='first') 
df_no_dup.reset_index(drop=True, inplace = True)
df_no_dup.describe(include='all')

##### Extract Name Entity information



In [None]:
# df[361520:361530]

# for row in tqdm(range(361557,361530)):
#     doc = nlp(unicode(df[row], errors='ignore')) 

##  Warning, the following code block takes 3 hours to run

In [None]:
from tqdm import tqdm
from collections import defaultdict
ents_dict = defaultdict(lambda : defaultdict(int))

df = df_no_dup
iter_len = len(df)
for row in tqdm(range(0,iter_len)):
    try:
        if len(df[row]) > 0:
            doc = nlp(df[row]) 
            for ent in doc.ents:
                ents_dict[ent.label_][ent.text] += 1
    except TypeError:
        print(row, df[row])
        

In [None]:
ents_dict.keys()
ents_set = set()
for label in ents_dict.keys():
    for text in ents_dict[label].keys():  
        if not set('[]~!@#$%^&*()_+{}":;\'+-<>?').intersection(text):
            ents_set.add(text)
            
# ents_dict
len(ents_set) 

# remove 'US'
ents_set.remove('US')
ents_dict

In [None]:
'india' in ents_set

In [None]:
def preprocess_ent(sent): 
#     print(sent)
    sent_new = sent
    for ent in ents_set:
#         print('\\b'+re.escape(ent)+'\\b')
#        print(ent)
        sent_new = re.sub('\\b'+ent+'\\b', ent, sent_new, flags=re.IGNORECASE|re.MULTILINE|re.X)
    return sent_new


In [None]:
df = training_data.copy()[:2]

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
# tqdm.pandas(desc="my bar!")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# df.progress_apply(lambda x: x**2)

df['sent1'] = df.question1.progress_apply(preprocess_ent)
# df['sent2'] = df.question2.apply(preprocess_ent)

In [None]:
df

In [None]:
for i, row in df.iterrows():
    print row[6]

In [None]:
sent = df.question1[0]
sent

In [None]:
re.sub('\\b'+'India'+'\\b', 'India', sent, flags=re.IGNORECASE|re.MULTILINE|re.X)