In [66]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from nltk.corpus import wordnet
from textblob import Word
import random

from nltk.corpus import stopwords
stops = stopwords.words('english')
pd.set_option('max_colwidth',80)

In [128]:
df_10k = pd.read_table('samples/training_10k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
df_50k = pd.read_table('samples/training_50k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
df_5k = pd.read_table('samples/training_5k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
df_15k = pd.read_table('samples/training_15k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
df_20k = pd.read_table('samples/training_20k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
df_25k = pd.read_table('samples/training_25k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
df_30k = pd.read_table('samples/training_30k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])

In [154]:
ask_5k = pd.read_table('samples/ask_5k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
ask_10k = pd.read_table('samples/ask_10k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
ask_15k = pd.read_table('samples/ask_15k.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'id'])
ask_full = pd.read_table('new_train.tsv', sep='\t', encoding='utf-8',
                     names=['is_duplicate','question1', 'question2', 'qid'])

In [158]:
def preprocess(df):
    df["question1"] = df['question1'].str.replace('[^\w\s]','')
    df["question2"] = df['question2'].str.replace('[^\w\s]','')
    print("Preprocess done!")
    return df

def features(df):
    df['q1_lemma'] = df['question1'].apply(lemmatize_text)
    df['q2_lemma'] = df['question2'].apply(lemmatize_text)
    df['word_overlap'] = df.apply( word_match, axis=1, raw=True)
    df['word_overlap_score'] = df.apply(word_match_share, axis=1, raw=True)
    df['q1_n_words'] = df['question1'].apply(lambda row: len(row.split(" ")))
    df['q2_n_words'] = df['question2'].apply(lambda row: len(row.split(" ")))
    print("Features done!")
    return df

def heuristic(df):
    overlap = df[(df['word_overlap_score'] > 0.6)]
    overlap = overlap[['is_duplicate','question2', 'question1', 'qid']]
    overlap.rename(columns={'question2':'question1',
                                'question1':'question2'}, inplace=True)

    low_overlap = df[(df['word_overlap_score'] < 0.4) & (df_10k['is_duplicate'] == 1)]
    low_overlap = low_overlap[['is_duplicate','question2', 'question1', 'qid']]
    low_overlap.rename(columns={'question2':'question1',
                                'question1':'question2'}, inplace=True)
    print(overlap.count()[0])
    print(low_overlap.count()[0])
    heuristic = pd.concat([df[['is_duplicate','question1', 'question2', 'qid']], overlap, low_overlap])
    heuristic.reset_index(inplace=True, drop=True)
    print("Heuristic done!")
    return heuristic

def replace_sysnonyms(text):
    line = text.strip()
    words = line.split(" ")
    output = list()
    for word_str in words:
        word_obj = Word(word_str)
        if len(word_str) > 3 and len(word_obj.synsets) > 0:
            synset = word_obj.synsets[0]
            lemma = random.choice(synset.lemma_names())
            output.append(lemma.replace('_', ' '))
        else:
            output.append(word_str)
    new_text =  " ".join(output)
    return new_text

def synonyms(df):
    df['q1_synonym'] = df['question1'].apply(lambda x: replace_sysnonyms(x))
    df['q2_synonym'] = df['question2'].apply(lambda x: replace_sysnonyms(x))

    synonyms = df[['is_duplicate','q1_synonym', 'q2_synonym', 'qid']]
    synonyms.rename(columns={'q1_synonym':'question1',
                                'q2_synonym':'question2'}, inplace=True)
    df_syn = pd.concat([df[['is_duplicate','question1', 'question2', 'qid']], synonyms])
    df_syn.reset_index(inplace=True, drop=True)
    print("Synonyms done!")
    return df_syn

def reverse_order(df):
    reverse = df[['is_duplicate','question2', 'question1', 'qid']]
    reverse.rename(columns={'question2':'question1',
                                'question1':'question2'}, inplace=True)
    df = df[['is_duplicate','question1', 'question2', 'qid']]
    df_reverse = pd.concat([df, reverse])
    df_reverse.reset_index(inplace=True, drop=True)
    print("Reverse done!")
    return df_reverse

def write_to_file(reverse, heuristic, synonyms, size):
    h_file = 'samples/a_heuristic_%sk.tsv' % size
    s_file = 'samples/a_synonyms_%sk.tsv' % size
    r_file = 'samples/a_reverse_%sk.tsv' % size
    heuristic.to_csv(h_file,sep='\t', header=False, index=False, encoding='utf-8')
    synonyms.to_csv(s_file,sep='\t', header=False, index=False, encoding='utf-8')
    reverse.to_csv(r_file,sep='\t', header=False, index=False, encoding='utf-8')
    
def augment_data(df, size):
    df = preprocess(df)
    df = features(df)
    h = heuristic(df)
#     print(h.groupby('is_duplicate').count())
    syn = synonyms(df)
#     print(syn.groupby('is_duplicate').count())
    r = reverse_order(df)
#     print(r.groupby('is_duplicate').count())
    write_to_file(r,h,syn, size)
    print('aici')
    return r,h,syn

In [141]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

def word_match(row):
    q1words = {}
    q2words = {}
    for word in str(row['q1_lemma']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['q2_lemma']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    R = len(shared_words_in_q1)
    return R

def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['q1_lemma']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['q2_lemma']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

In [71]:
df_20k.isnull().sum()
df_20k.dropna(inplace=True)

In [None]:
# augment_data(df_5k, '5')
# augment_data(df_15k, '15')
# augment_data(df_20k, '20')
# augment_data(df_25k, '25')
# augment_data(df_20k, '30')
augment_data(df_10k, '10')

In [159]:
augment_data(ask_full, 'full')

Preprocess done!
Features done!
52
3545
Heuristic done!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Synonyms done!
Reverse done!
aici


(       is_duplicate  \
 0                 0   
 1                 0   
 2                 1   
 3                 0   
 4                 1   
 5                 0   
 6                 0   
 7                 0   
 8                 1   
 9                 1   
 10                0   
 11                1   
 12                1   
 13                1   
 14                0   
 15                0   
 16                1   
 17                0   
 18                0   
 19                1   
 20                1   
 21                1   
 22                1   
 23                1   
 24                1   
 25                0   
 26                0   
 27                0   
 28                0   
 29                0   
 ...             ...   
 37970             0   
 37971             1   
 37972             0   
 37973             0   
 37974             1   
 37975             1   
 37976             1   
 37977             0   
 37978             0   
 37979          

In [148]:
preprocess(ask_15k)
features(ask_15k)
h = heuristic(ask_15k)
h.groupby('is_duplicate').count()

Preprocess done!
Features done!
40
3535
Heuristic done!


Unnamed: 0_level_0,question1,question2,id
is_duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11739,11739,11739
1,6836,6836,6836


In [127]:
preprocess(ask_5k)
features(ask_5k)
test1 = reverse_order(ask_5k)

preprocess(ask_10k)
features(ask_10k)
test2 = heuristic(ask_10k)

preprocess(ask_15k)
features(ask_15k)
test3 = heuristic(ask_15k)

Preprocess done!
Features done!
Reverse done!
Preprocess done!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Features done!
Heuristic done!
Preprocess done!
Features done!
Heuristic done!


In [126]:
print(test1.count()[0])
print(test2.count()[0])
print(test3.count()[0])

6782
13569
18575


In [80]:
def percentage(x,y):
    print(round((y*100)/x,2))

In [151]:
percentage(20912, 4541)
percentage(30387, 8010)
percentage(37589, 11367)
percentage(42539, 13709)
np.mean([21.71, 26.36, 30.24, 32.23])

21.71
26.36
30.24
32.23


27.634999999999998

In [153]:
percentage(20912, 5074)
percentage(30387, 9082)
percentage(37589, 12730)
percentage(42539, 15322)
np.mean([24.26, 29.89, 33.87, 36.02])

24.26
29.89
33.87
36.02


31.010000000000005

In [83]:
percentage(5000, 2295)
percentage(10000, 3622)
percentage(15000, 6066)
np.mean([45.9, 36.22, 40.44])

45.9
36.22
40.44


40.85333333333333

In [99]:
print("Number of duplicates {}%".format(round(df_5k[df_5k.columns[0]].mean()*100, 2)))

Number of duplicates 37.84%


In [101]:
print("Number of duplicates {}%".format(round(df_10k[df_10k.columns[0]].mean()*100, 2)))

Number of duplicates 36.29%


In [100]:
print("Number of duplicates {}%".format(round(df_15k[df_15k.columns[0]].mean()*100, 2)))

Number of duplicates 36.09%


In [107]:
print(df_5k['word_overlap_score'].mean())
test1['q1_lemma'] = test1['question1'].apply(lemmatize_text)
test1['q2_lemma'] = test1['question2'].apply(lemmatize_text)
test1['word_overlap_score'] = test1.apply(word_match_share, axis=1, raw=True)
print(test1['word_overlap_score'].mean())

print(df_10k['word_overlap_score'].mean())
test2['q1_lemma'] = test2['question1'].apply(lemmatize_text)
test2['q2_lemma'] = test2['question2'].apply(lemmatize_text)
test2['word_overlap_score'] = test2.apply(word_match_share, axis=1, raw=True)
print(test2['word_overlap_score'].mean())

print(df_15k['word_overlap_score'].mean())
test3['q1_lemma'] = test3['question1'].apply(lemmatize_text)
test3['q2_lemma'] = test3['question2'].apply(lemmatize_text)
test3['word_overlap_score'] = test3.apply(word_match_share, axis=1, raw=True)
print(test3['word_overlap_score'].mean())

0.45284124103218676
0.49022946651117455
0.4520219727890094
0.5154047055249136
0.45360792319090837
0.5027675857457312


In [109]:
augment_data(ask_5k, '5')
augment_data(ask_10k, '10')
augment_data(ask_15k, '15')

Preprocess done!
Features done!




Heuristic done!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Synonyms done!
Reverse done!
aici
Preprocess done!
Features done!
Heuristic done!
Synonyms done!
Reverse done!
aici
Preprocess done!
Features done!
Heuristic done!
Synonyms done!
Reverse done!
aici


(       is_duplicate  \
 0                 1   
 1                 1   
 2                 1   
 3                 1   
 4                 1   
 5                 0   
 6                 0   
 7                 0   
 8                 0   
 9                 0   
 10                0   
 11                1   
 12                0   
 13                1   
 14                1   
 15                1   
 16                0   
 17                0   
 18                0   
 19                0   
 20                1   
 21                0   
 22                1   
 23                0   
 24                0   
 25                1   
 26                0   
 27                1   
 28                0   
 29                1   
 ...             ...   
 29970             0   
 29971             0   
 29972             0   
 29973             0   
 29974             1   
 29975             0   
 29976             0   
 29977             0   
 29978             0   
 29979          

In [164]:
df_10k["question1"] = df_10k['question1'].str.replace('[^\w\s]','')
df_10k["question2"] = df_10k['question2'].str.replace('[^\w\s]','')

df_50k["question1"] = df_50k['question1'].str.replace('[^\w\s]','')
df_50k["question2"] = df_50k['question2'].str.replace('[^\w\s]','')


# df1['question1'] = df1['question1'].apply(lambda x: [item for item in x if item not in stop])
# df1['question2'] = df1['question2'].apply(lambda x: [item for item in x if item not in stop])

# df2['question1'] = df2['question1'].apply(lambda x: [item for item in x if item not in stop])
# df2['question2'] = df2['question2'].apply(lambda x: [item for item in x if item not in stop])

qs10k = df1['question1'].str.lower().tolist() + df1['question2'].str.lower().tolist()
qs50k = df2['question1'].str.lower().tolist() + df2['question2'].str.lower().tolist()

In [270]:
df_50k[df_50k['question1'] == df_50k['question2']]

Unnamed: 0,is_duplicate,question1,question2,id,q1_lemma,q2_lemma,word_overlap,word_overlap_score,q1_n_words,q2_n_words,q1_synonym,q2_synonym
974,1,How do they measure mean sea level ?,How do they measure mean sea level ?,259230,"[How, do, they, measure, mean, sea, level, ?]","[How, do, they, measure, mean, sea, level, ?]",8,1.0,8,8,How do they step mean value sea degree ?,How do they measure mean value sea degree ?
3267,1,Why are n't most Chinese good at English ?,Why are n't most Chinese good at English ?,356338,"[Why, are, n't, most, Chinese, good, at, English, ?]","[Why, are, n't, most, Chinese, good, at, English, ?]",9,1.0,9,9,Why are n't most Chinese good at English language ?,Why are n't most Chinese good at English ?
4905,1,What are some of the most lucrative insurances to sell ?,What are some of the most lucrative insurances to sell ?,323638,"[What, are, some, of, the, most, lucrative, insurance, t...","[What, are, some, of, the, most, lucrative, insurance, t...",11,1.0,11,11,What are some of the most moneymaking insurance to sell ?,What are some of the most lucrative insurance to sell ?
6057,1,How do programming contest problem setters make test cas...,How do programming contest problem setters make test cas...,237589,"[How, do, programming, contest, problem, setter, make, t...","[How, do, programming, contest, problem, setter, make, t...",10,1.0,10,10,How do programming contest problem setter brand tryout e...,How do programing contest problem compositor brand trial...
9785,1,Why ca n't matter be created or destroyed ?,Why ca n't matter be created or destroyed ?,12675,"[Why, ca, n't, matter, be, created, or, destroyed, ?]","[Why, ca, n't, matter, be, created, or, destroyed, ?]",9,1.0,9,9,Why ca n't thing be create or destroy ?,Why ca n't thing be create or destruct ?
10320,0,Who is this ?,Who is this ?,299692,"[Who, is, this, ?]","[Who, is, this, ?]",4,1.0,4,4,Who is this ?,Who is this ?
10521,1,What 's your favorite Chinese food ?,What 's your favorite Chinese food ?,163465,"[What, 's, your, favorite, Chinese, food, ?]","[What, 's, your, favorite, Chinese, food, ?]",7,1.0,7,7,What 's your favourite Chinese food ?,What 's your favorite Chinese food ?
10892,1,What are the benefits of reading novels ?,What are the benefits of reading novels ?,87351,"[What, are, the, benefit, of, reading, novel, ?]","[What, are, the, benefit, of, reading, novel, ?]",8,1.0,8,8,What are the benefit of reading novel ?,What are the benefit of reading novel ?
11082,1,Why did Bhishma alienate Karna from the War ?,Why did Bhishma alienate Karna from the War ?,254786,"[Why, did, Bhishma, alienate, Karna, from, the, War, ?]","[Why, did, Bhishma, alienate, Karna, from, the, War, ?]",9,1.0,9,9,Why did Bhishma alienate Karna from the War ?,Why did Bhishma alienate Karna from the War ?
11782,1,How do small business owners use CRM systems ?,How do small business owners use CRM systems ?,257139,"[How, do, small, business, owner, use, CRM, system, ?]","[How, do, small, business, owner, use, CRM, system, ?]",9,1.0,9,9,How do small business concern owner use CRM system ?,How do small concern proprietor use CRM system ?


In [165]:
Counter(" ".join(qs10k).split()).most_common(10)

[('the', 9353),
 ('what', 7915),
 ('is', 6617),
 ('how', 5407),
 ('i', 5385),
 ('a', 5209),
 ('to', 5119),
 ('in', 4912),
 ('do', 4110),
 ('of', 3965)]

In [208]:
df_10k['q1_lemma'] = df_10k['question1'].apply(lemmatize_text)
df_10k['q2_lemma'] = df_10k['question2'].apply(lemmatize_text)
df_10k['word_overlap'] = df_10k.apply( word_match, axis=1, raw=True)
df_10k['word_overlap_score'] = df_10k.apply(word_match_share, axis=1, raw=True)
df_10k['q1_n_words'] = df_10k['question1'].apply(lambda row: len(row.split(" ")))
df_10k['q2_n_words'] = df_10k['question2'].apply(lambda row: len(row.split(" ")))

df_50k['q1_lemma'] = df_50k['question1'].apply(lemmatize_text)
df_50k['q2_lemma'] = df_50k['question2'].apply(lemmatize_text)
df_50k['word_overlap'] = df_50k.apply( word_match, axis=1, raw=True)
df_50k['word_overlap_score'] = df_50k.apply(word_match_share, axis=1, raw=True)
df_50k['q1_n_words'] = df_50k['question1'].apply(lambda row: len(row.split(" ")))
df_50k['q2_n_words'] = df_50k['question2'].apply(lambda row: len(row.split(" ")))

In [168]:
df_10k.head()

Unnamed: 0,is_duplicate,question1,question2,id,q1_lemma,q2_lemma,word_overlap,word_overlap_score,q1_n_words,q2_n_words
0,0,Should Mandarin course be introduced in the sc...,How do I get a Digital marketingSales job in H...,45324,"[Should, Mandarin, course, be, introduced, in,...","[How, do, I, get, a, Digital, marketingSales, ...",5,0.192308,25,33
1,0,How many people could become millionaires this...,Starting with nothing at all how many years d...,263521,"[How, many, people, could, become, millionaire...","[Starting, with, nothing, at, all, how, many, ...",3,0.230769,16,17
2,0,What are good gifts for a foreign visitor to b...,What are good gifts for a foreign visitor to b...,246571,"[What, are, good, gift, for, a, foreign, visit...","[What, are, good, gift, for, a, foreign, visit...",21,0.954545,25,25
3,1,How do you properly tie ice skates,How can I tie my ice skate shoelaces properly,313078,"[How, do, you, properly, tie, ice, skate]","[How, can, I, tie, my, ice, skate, shoelace, p...",3,0.375,8,10
4,1,Getting Started on Quora What is Quora,Who and what is Quora,239251,"[Getting, Started, on, Quora, What, is, Quora]","[Who, and, what, is, Quora]",3,0.5,9,6


In [260]:
score_dup = df_10k[(df_10k['word_overlap_score'] > 0.5) & (df_10k['is_duplicate'] == 0)].count()[0]
score_nondup = df_10k[(df_10k['word_overlap_score'] < 0.5) & (df_10k['is_duplicate'] == 1)].count()[0]
print(score_dup)
print(score_nondup)

1272
752


In [281]:
df_10k[(df_10k['word_overlap_score'] < 0.3) & (df_10k['is_duplicate'] == 1)]

Unnamed: 0,is_duplicate,question1,question2,id,q1_lemma,q2_lemma,word_overlap,word_overlap_score,q1_n_words,q2_n_words,q1_synonym,q2_synonym
380,1,Is there any evidence that can prove the existence of God ?,"Do you believe in a god ? If so , what sort of testable evidence do you have ?",392179,"[Is, there, any, evidence, that, can, prove, the, existence, of, God, ?]","[Do, you, believe, in, a, god, ?, If, so, ,, what, sort, of, testable, evide...",4,0.266667,12,19,Is there any evidence that can turn out the beingness of God ?,"Do you believe in a god ? If so , what variety of testable evidence do you h..."
701,1,What is the best way to increase traffic for a new blog ?,How do I increase traffic on my site ?,58143,"[What, is, the, best, way, to, increase, traffic, for, a, new, blog, ?]","[How, do, I, increase, traffic, on, my, site, ?]",3,0.272727,13,9,What is the best way to addition traffic for a new blog ?,How do I increase traffic on my site ?
1562,1,What is the best way to learn chess ?,How do I play chess ?,312035,"[What, is, the, best, way, to, learn, chess, ?]","[How, do, I, play, chess, ?]",2,0.266667,9,6,What is the best way to acquire cheat ?,How do I play Bromus secalinus ?
1697,1,Why do I get hiccups when I eat rice ?,What causes hiccups after you eat and how can you rid yourself of them ?,289881,"[Why, do, I, get, hiccup, when, I, eat, rice, ?]","[What, cause, hiccup, after, you, eat, and, how, can, you, rid, yourself, of...",3,0.26087,10,15,Why do I get hiccup when I eat rice ?,What cause hiccup after you eat and how can you rid yourself of them ?
1930,1,How can you cope with loneliness ?,What are the ways to end loneliness ?,39598,"[How, can, you, cope, with, loneliness, ?]","[What, are, the, way, to, end, loneliness, ?]",2,0.266667,7,8,How can you header with solitariness ?,What are the ways to end solitariness ?
1978,1,What works to make a man 's penis thicker ?,Can you make your penis larger at the age of 27 ?,95639,"[What, work, to, make, a, man, 's, penis, thicker, ?]","[Can, you, make, your, penis, larger, at, the, age, of, 27, ?]",3,0.272727,10,12,What works to brand a man 's penis thick ?,Can you brand your penis larger at the age of 27 ?
2075,1,How do I get rid of my belly fat ?,What is the best way to reduce belly and arm fat ?,25022,"[How, do, I, get, rid, of, my, belly, fat, ?]","[What, is, the, best, way, to, reduce, belly, and, arm, fat, ?]",3,0.272727,10,12,How do I get rid of my stomach fat ?,What is the best way to cut back venter and arm fat ?
2280,1,Why Do You Think World War 3 Will Happen ? I Personally think it wo n't .,What could potentially spark World War III ?,217632,"[Why, Do, You, Think, World, War, 3, Will, Happen, ?, I, Personally, think, ...","[What, could, potentially, spark, World, War, III, ?]",2,0.166667,17,8,Why Do You think world War 3 will pass off ? I personally think it wo n't .,What could potentially flicker existence War III ?
2368,1,Who is currently winning the presidential election ?,"All biases aside , at this point in time , who do you think will win the pre...",131951,"[Who, is, currently, winning, the, presidential, election, ?]","[All, bias, aside, ,, at, this, point, in, time, ,, who, do, you, think, wil...",4,0.296296,8,20,Who is presently winning the presidential election ?,"All bias aside , at this point in clip , who do you think volition win the p..."
2595,1,Did Mahabharata happen for real ?,"Is Mahabharata real ? If it is , where are the present generations of Pandav...",12525,"[Did, Mahabharata, happen, for, real, ?]","[Is, Mahabharata, real, ?, If, it, is, ,, where, are, the, present, generati...",3,0.24,6,19,Did Mahabharata happen for real ?,"Is Mahabharata real number ? If it is , where are the nowadays contemporarie..."


In [226]:
score_dup = df_50k[(df_50k['word_overlap_score'] > 0.5) & (df_50k['is_duplicate'] == 0)].count()[0]
score_nondup = df_50k[(df_50k['word_overlap_score'] < 0.5) & (df_50k['is_duplicate'] == 1)].count()[0]
print(score_dup)
print(score_nondup)

12263
3841


In [278]:
df_50k[(df_50k['word_overlap_score'] > 0.6) & (df_50k['is_duplicate'] == 0)].count()[0]

9172

In [279]:
df_50k[(df_10k['word_overlap_score'] < 0.4)& (df_50k['is_duplicate'] == 1)].count()[0]

1150

In [271]:
overlap_10k = df_10k[(df_10k['word_overlap_score'] > 0.6)]
overlap_10k = overlap_10k[['is_duplicate','question2', 'question1', 'id']]
overlap_10k.rename(columns={'question2':'question1',
                            'question1':'question2'}, inplace=True)

low_overlap_10k = df_10k[(df_10k['word_overlap_score'] < 0.4) & (df_10k['is_duplicate'] == 1)]
low_overlap_10k = low_overlap_10k[['is_duplicate','question2', 'question1', 'id']]
low_overlap_10k.rename(columns={'question2':'question1',
                            'question1':'question2'}, inplace=True)

heuristic_10k = pd.concat([df_10k[['is_duplicate','question1', 'question2', 'id']], overlap_10k, low_overlap_10k])
heuristic_10k.groupby('is_duplicate').count()

Unnamed: 0_level_0,question1,question2,id
is_duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8172,8172,8172
1,6021,6021,6021


In [272]:
overlap_50k = df_50k[(df_50k['word_overlap_score'] > 0.6)]
overlap_50k = overlap_50k[['is_duplicate','question2', 'question1', 'id']]
overlap_50k.rename(columns={'question2':'question1',
                            'question1':'question2'}, inplace=True)

low_overlap_50k = df_50k[(df_50k['word_overlap_score'] < 0.4) & (df_50k['is_duplicate'] == 1)]
low_overlap_50k = low_overlap_50k[['is_duplicate','question2', 'question1', 'id']]
low_overlap_50k.rename(columns={'question2':'question1',
                            'question1':'question2'}, inplace=True)

heuristic_50k = pd.concat([df_50k[['is_duplicate','question1', 'question2', 'id']], overlap_50k, low_overlap_50k])
heuristic_50k.groupby('is_duplicate').count()

Unnamed: 0_level_0,question1,question2,id
is_duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,41114,41114,41114
1,29930,29930,29930


In [243]:
print(heuristic_10k.count()[0])
print(heuristic_50k.count()[0])

14678
73416


In [244]:
heuristic_10k.to_csv('heuristic_10k.tsv',sep='\t', header=False, index=False, encoding='utf-8')
heuristic_50k.to_csv('heuristic_50k.tsv',sep='\t', header=False, index=False, encoding='utf-8')

# REPLACE SYNONYMS - THESAURUS BASED AUGMENTATION

In [175]:
def replace_sysnonyms(text):
    line = text.strip()
    words = line.split(" ")
    output = list()
    for word_str in words:
        word_obj = Word(word_str)
        if len(word_str) > 3 and len(word_obj.synsets) > 0:
            synset = word_obj.synsets[0]
            lemma = random.choice(synset.lemma_names())
            output.append(lemma.replace('_', ' '))
        else:
            output.append(word_str)
    new_text =  " ".join(output)
    return new_text

In [214]:
df_10k['q1_synonym'] = df_10k['question1'].apply(lambda x: replace_sysnonyms(x))
df_10k['q2_synonym'] = df_10k['question2'].apply(lambda x: replace_sysnonyms(x))

df_50k['q1_synonym'] = df_50k['question1'].apply(lambda x: replace_sysnonyms(x))
df_50k['q2_synonym'] = df_50k['question2'].apply(lambda x: replace_sysnonyms(x))

In [178]:
# df_10k[['question1', 'q1_synonym','question2', 'q2_synonym']]
synonyms_10k = df_10k[['is_duplicate','q1_synonym', 'q2_synonym', 'id']]
synonyms_10k.rename(columns={'q1_synonym':'question1',
                            'q2_synonym':'question2'}, inplace=True)
df_10k_syn = pd.concat([df_10k[['is_duplicate','question1', 'question2', 'id']], synonyms_10k])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [255]:
synonyms_10k

Unnamed: 0,is_duplicate,question1,question2,id
0,0,Should mandarin course of study be present in the school...,How do I get a digital marketingSales job in Hong Kong ...,45324
1,0,How many people could get millionaire this twelvemonth ...,start with nil at all how many old age Energy Departmen...,263521
2,0,What are good gift for a foreign visitant to take when t...,What are good gift for a foreign visitant to convey when...,246571
3,1,How do you in good order tie ice skate,How can I tie my ice skate shoelace decent,313078
4,1,acquiring start out on Quora What is Quora,Who and what is Quora,239251
5,0,What is the name of this song,What s the name of this song,158495
6,0,What are the dealing charge of BHIM,Are the state bank associate s proceedings available on ...,93832
7,0,What will be the futurity of hereafter,What is my future,364295
8,0,Can hummingbird fly backward,How do hummingbird fly rearward,346488
9,0,Do diet coke and coke naught really have nix kilocalorie,volition coke cypher or coke visible radiation make me p...,387336


In [184]:
synonyms_50k = df_50k[['is_duplicate','q1_synonym', 'q2_synonym', 'id']]
synonyms_50k.rename(columns={'q1_synonym':'question1',
                            'q2_synonym':'question2'}, inplace=True)
df_50k_syn = pd.concat([df_50k[['is_duplicate','question1', 'question2', 'id']], synonyms_50k])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [187]:
df_10k_syn.to_csv('synonyms_10k.tsv',sep='\t', header=False, index=False, encoding='utf-8')
df_50k_syn.to_csv('synonyms_50k.tsv',sep='\t', header=False, index=False, encoding='utf-8')

# REVERSE QUESTION ORDER

In [204]:
reverse_10k = df_10k[['is_duplicate','question2', 'question1', 'id']]
reverse_10k.rename(columns={'question2':'question1',
                            'question1':'question2'}, inplace=True)
reverse_10k.head()
df_reverse_10k = pd.concat([df_10k[['is_duplicate','question1', 'question2', 'id']], reverse_10k])
df_reverse_10k.head()

Unnamed: 0,is_duplicate,question1,question2,id
0,0,Should Mandarin course be introduced in the sc...,How do I get a Digital marketing/Sales job in ...,45324
1,0,How many people could become millionaires this...,"Starting with nothing at all , how many years ...",263521
2,0,What are good gifts for a foreign visitor to b...,What are good gifts for a foreign visitor to b...,246571
3,1,How do you properly tie ice skates ?,How can I tie my ice skate shoelaces properly ?,313078
4,1,Getting Started on Quora : What is Quora ?,Who and what is Quora ?,239251


In [205]:
reverse_50k = df_50k[['is_duplicate','question2', 'question1', 'id']]
reverse_50k.rename(columns={'question2':'question1',
                            'question1':'question2'}, inplace=True)
reverse_50k.head()
df_reverse_50k = pd.concat([df_50k[['is_duplicate','question1', 'question2', 'id']], reverse_50k])
df_reverse_50k.count()

is_duplicate    100000
question1       100000
question2       100000
id              100000
dtype: int64

In [206]:
df_reverse_10k.to_csv('reverse_10k.tsv',sep='\t', header=False, index=False, encoding='utf-8')
df_reverse_50k.to_csv('reverse_50k.tsv',sep='\t', header=False, index=False, encoding='utf-8')

In [253]:
df_10k[['is_duplicate','question1', 'question2', 'id']].head(15)

Unnamed: 0,is_duplicate,question1,question2,id
0,0,Should Mandarin course be introduced in the schools ? Is...,How do I get a Digital marketing/Sales job in Hong Kong ...,45324
1,0,"How many people could become millionaires this year , th...","Starting with nothing at all , how many years does it ta...",263521
2,0,What are good gifts for a foreign visitor to bring when ...,What are good gifts for a foreign visitor to bring when ...,246571
3,1,How do you properly tie ice skates ?,How can I tie my ice skate shoelaces properly ?,313078
4,1,Getting Started on Quora : What is Quora ?,Who and what is Quora ?,239251
5,0,What is the name of this song ?,What 's the name of this song ?,158495
6,0,What are the transaction charges of BHIM ?,Are the State Bank associates 's transactions available ...,93832
7,0,What will be the future of future ?,What is my future ?,364295
8,0,Can hummingbirds fly backwards ?,How do hummingbirds fly backwards ?,346488
9,0,Do Diet Coke and Coke Zero really have zero calories ?,Will Coke Zero or Coke Light make me put on weight ?,387336
