In [40]:
import pandas as pd
import numpy as np
from collections import *
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from pymagnitude import *
import time
from scipy.spatial.distance import cosine
import warnings
warnings.filterwarnings('ignore')

In [41]:
data = pd.read_csv('data/questions.csv')
data = data
path = 'data/GoogleNews-vectors-negative300.magnitude'
vectors = Magnitude(path)
tok = RegexpTokenizer('\w+')
data.question1 = pd.Series([tok.tokenize(str(s).lower()) for s in data.question1])
data.question2 = pd.Series([tok.tokenize(str(s).lower()) for s in data.question2])

In [42]:
# helper functions and constants
def sum_vec(words):
    res = np.sum([vectors.query(w) for w in words if w in vectors], axis=0)
    if type(res) is np.float64:
        return np.array([1e-32] * vectors.dim)
    else:
        return res
    
def rm_stop(words):
    stop_words = nltk.corpus.stopwords.words('english')
    return [w for w in words if not w in stop_words]

def train_test(col1 ,col2):
    sep = int(len(data)*.75)
    ret = {}
    ret['train_pnts'] = data.as_matrix([col1, col2])[:sep]
    ret['train_pnts'] = [[cosine(x[0], x[1])] for x in ret['train_pnts']]
    ret['train_tgts'] = data.as_matrix(["is_duplicate"])[:sep]
    ret['test_pnts']  = data.as_matrix([col1, col2])[sep:]
    ret['test_pnts']  = [[cosine(x[0], x[1])] for x in ret['test_pnts']]
    ret['test_tgts']  = data.as_matrix(["is_duplicate"])[sep:]
    return ret

i_tup_num = 0
i_question1 = 3
i_question2 = 4
i_is_duplicate = 5
i_vec1 = 6
i_vec2 = 7
i_stop1 = 8
i_stop_vec1 = 9
i_stop2 = 10
i_stop_vec2 = 11

In [43]:
t = time.time()
vec1 = pd.Series(sum_vec(words) for words in data.question1)
data.insert(len(data.columns), "vec1", vec1)
print("Finished vec1")
vec2 = pd.Series(sum_vec(words) for words in data.question2)
data.insert(len(data.columns), "vec2", vec2)
print("Finished vec2")
print(time.time() - t)
t = time.time()
stop1 = pd.Series(rm_stop(words) for words in data.question1)
data.insert(len(data.columns), "stop1", stop1)
stop_vec1 = pd.Series(sum_vec(words) for words in data.stop1)
data.insert(len(data.columns), "stop_vec1", stop_vec1)
print("Finished stop1")
stop2 = pd.Series(rm_stop(words) for words in data.question2)
data.insert(len(data.columns), "stop2", stop2)
stop_vec2 = pd.Series(sum_vec(words) for words in data.stop2)
data.insert(len(data.columns), "stop_vec2", stop_vec1)
print("Finished stop2")
print(time.time() - t)

Finished vec1
Finished vec2
1811.9964082241058
Finished stop1
Finished stop2
736.51695728302


In [45]:
#initial data information
print("# of data points:", len(data))
print("# of word vectors:", len(vectors))
print('vector dimensions:', vectors.dim)
cnt = Counter(data.is_duplicate)
print("random baseline:", 1/len(cnt.keys()))
print("most common baseline:", max([cnt[k]/sum(cnt.values()) for k in cnt.keys()]))

# of data points: 404351
# of word vectors: 3000000
vector dimensions: 300
random baseline: 0.5
most common baseline: 0.6307515005527375


In [46]:
accuracy_score(
    [0 if nltk.translate.bleu_score.sentence_bleu(
    [data.question1[i]], 
    data.question2[i]) < .5 else 1
    for i in range(len(data))],
    data.is_duplicate)

0.6185492307425974

In [47]:
x = train_test("vec1", "vec2")
lr = LogisticRegression().fit(x['train_pnts'], x['train_tgts'])
print("normal embeddings LR accuracy:", accuracy_score(lr.predict(x['test_pnts']), x['test_tgts']))
print("normal embeddings LR coefficients:", lr.coef_)

normal embeddings LR accuracy: 0.6544693732193733
normal embeddings LR coefficients: [[-7.12968694]]


In [48]:
x = train_test("stop_vec1", "stop_vec2")
lr = LogisticRegression().fit(x['train_pnts'], x['train_tgts'])
print("no stop words LR accuracy:", accuracy_score(lr.predict(x['test_pnts']), x['test_tgts']))
print("no stop words LR coefficients:", lr.coef_)

no stop words LR accuracy: 0.6383744855967078
no stop words LR coefficients: [[0.]]


In [22]:
i = 0
for x in data.values:
    if x[i_question1] == x[i_question2]:
        print(x[i_is_duplicate], x[i_tup_num])
        print(x[i_question1])
        i += 1
print(i)

1 1209
['what', 'are', 'the', 'minimum', 'gpa', 'and', 'gre', 'requirements', 'of', 'florida', 'state', 'university?']
1 3099
['which', 'book', 'should', 'i', 'use', 'for', 'jee', 'organic', 'chemistry?']
1 21862
['how', 'troll', 'characters', 'got', 'famous?']
1 40775
['i', 'want', 'to', 'export', 'my', 'products', 'worldwide,', 'is', 'there', 'any', 'site', 'to', 'post', 'my', 'products', 'for', 'free?']
1 40787
['how', 'did', 'playing', 'sports', 'benefit', 'you', 'in', 'real', 'life?']
1 46609
['should', 'i', 'stop', 'masturbating?']
1 70317
['how', 'much', 'computer', 'science', 'does', 'an', '8', 'year', 'old', 'us', 'kid', 'typically', 'know', 'or', 'is', 'taught', 'in', 'school', 'curricula?']
1 72670
['what', 'are', 'the', 'skills', 'required', 'for', 'big', 'data', 'jobs?']
1 79795
['when', 'will', 'the', 'market', 'price', 'of', 'mac', 'pro', 'late', '2013', 'edition', 'decrease?']
1 87351
['what', 'are', 'the', 'benefits', 'of', 'reading', 'novels?']
1 112462
['will', 'wear

In [23]:
hello = data.as_matrix(["vec1", "vec2"])
hello = [[cosine(x[0], x[1])] for x in hello]

In [27]:
x = data.values
k = 0
for i in range(len(data.values)):
    if hello[i] == [0.0]:
        print(x[i][i_is_duplicate], x[i][i_tup_num])
        print(x[i][i_question1])
        print(x[i][i_question2])
        k += 1
print(k)

0 8
['when', 'do', 'you', 'use', 'シ', 'instead', 'of', 'し?']
['when', 'do', 'you', 'use', '"&"', 'instead', 'of', '"and"?']
0 14
['what', 'are', 'the', 'laws', 'to', 'change', 'your', 'status', 'from', 'a', 'student', 'visa', 'to', 'a', 'green', 'card', 'in', 'the', 'us,', 'how', 'do', 'they', 'compare', 'to', 'the', 'immigration', 'laws', 'in', 'canada?']
['what', 'are', 'the', 'laws', 'to', 'change', 'your', 'status', 'from', 'a', 'student', 'visa', 'to', 'a', 'green', 'card', 'in', 'the', 'us?', 'how', 'do', 'they', 'compare', 'to', 'the', 'immigration', 'laws', 'in', 'japan?']
1 16
['what', 'does', 'manipulation', 'mean?']
['what', 'does', 'manipulation', 'means?']
0 42
['can', 'i', 'make', '50,000', 'a', 'month', 'by', 'day', 'trading?']
['can', 'i', 'make', '30,000', 'a', 'month', 'by', 'day', 'trading?']
1 71
['what', 'is', 'a', 'narcissistic', 'personality', 'disorder?']
['what', 'is', 'narcissistic', 'personality', 'disorder?']
0 89
['what', 'is', 'your', 'review', 'of', 'the'

0 23252
['where', 'is', 'lord', 'krishna?']
['where', 'is', 'lord', "krishna's", 'diamond?']
0 23303
['how', 'is', 'the', 'isro', 'interview', 'for', 'the', 'post', 'of', 'scientist/engineer', "'sc'", '(instrumentation)?']
['how', 'is', 'the', 'isro', 'interview', 'for', 'the', 'post', 'of', 'scientist/engineer', "'sc'", '(computer)?']
0 23352
['is', 'travellingtrips.com', 'legit?']
['is', 'buildmylikes.com', 'legit?']
0 23370
['which', 'is', 'the', 'best', 'vpn', '(virtual', 'private', 'network)', 'service', 'in', 'houston,', 'usa?']
['which', 'is', 'the', 'best', 'vpn', '(virtual', 'private', 'network)', 'service', 'in', 'pakistan?']
0 23376
['what', 'strikes', 'first', 'time', 'visitors', 'as', 'special', 'or', 'unusual', 'when', 'they', 'arrive', 'in', 'puning,', 'china?']
['what', 'strikes', 'first', 'time', 'visitors', 'as', 'special', 'or', 'unusual', 'when', 'they', 'arrive', 'in', 'chongqing,', 'china?']
0 23389
['what', 'are', 'the', 'most', 'common', 'traffic', 'convictions'

['what', 'individuals', 'and', 'events', 'in', 'history', 'are', 'a', 'source', 'of', 'pride', 'for', 'germany?']
0 45038
['how', 'much', 'salary', 'does', 'a', 'couple', 'need', 'to', 'earn', 'for', 'an', 'average/good', 'lifestyle', 'in', 'pune?']
['how', 'much', 'salary', 'does', 'a', 'couple', 'need', 'to', 'earn', 'for', 'an', 'average/good', 'lifestyle', 'in', 'ncr?']
0 45049
['who', 'are', 'the', 'biggest', 'influencers', 'in', 'the', "'qrops'", 'niche?']
['who', 'are', 'the', 'biggest', 'influencers', 'in', 'the', "'qnups'", 'niche?']
0 45053
['which', 'is', 'the', 'best', 'engineering', 'college', 'in', 'gujarat?']
['which', 'is', 'the', 'best', 'engineering', 'college', 'in', 'orissa?']
1 45073
['which', 'laptop', 'is', 'best', 'for', 'accounting?']
['which', 'laptop', 'is', 'best', 'for', 'accountants?']
0 45087
['what', 'is', 'the', 'future', 'of', 'hadoop', 'and', 'bigdata?']
['what', 'is', 'the', 'future', 'of', 'bigdata?']
1 45097
['how', 'can', 'i', 'see', 'who', 'viewe

['what', 'is', 'the', 'noun', 'form', 'of', '“complex”?']
['what', 'is', 'the', 'noun', 'form', 'of', 'far?']
1 68233
['what', 'is', 'the', 'best', 'gift', 'for', 'mom', 'and', 'dad?']
['what', 'is', 'the', 'best', 'gift', 'for', 'a', 'mom', 'and/or', 'dad?']
0 68280
['why', 'does', 'ezekiel', 'bread', 'have', 'to', 'be', 'refrigerated?']
['why', 'does', 'ezekiel', 'bread', 'have', 'to', 'be', 'frozen?']
1 68293
['is', 'diet', 'coke', 'really', 'a', '"diet"', 'coke?']
['is', 'diet', 'coke', 'really', 'diet?']
0 68355
['how', 'do', 'i', 'translate', '"骂人"', 'into', 'english?']
['how', 'do', 'i', 'translate', '"长夜难明赤县天，百年魔怪舞翩跹"', 'into', 'english?']
0 68388
['where', 'does', 'the', 'word', "'jitterbug'", 'come', 'from?']
['where', 'does', 'the', 'word', '"milk"', 'come', 'from?']
1 68466
['what', 'is', 'the', 'difference', 'between', 'dual', 'core', 'and', 'quad', 'core', 'processors?']
['what', 'is', 'the', 'difference', 'between', 'a', 'dual', 'core', 'and', 'a', 'quad', 'core', 'proce

['what', 'is', 'the', 'best', 'way', 'to', 'transfer', 'money', 'from', 'brazil', 'to', 'the', 'uk?']
['what', 'is', 'the', 'best', 'way', 'to', 'transfer', 'money', 'from', 'brazil', 'to', 'the', 'us?']
0 92648
['what', 'are', 'the', 'pros', 'and', 'cons', 'of', 'learning', 'c++', 'as', 'a', 'first', 'programming', 'language?']
['what', 'are', 'the', 'pros', 'and', 'cons', 'of', 'learning', 'c#', 'as', 'a', 'first', 'programming', 'language?']
0 92658
['what', 'hotel', 'in', 'bishramganj', 'would', 'be', 'safe', 'for', 'unmarried', 'couples,', 'without', 'the', 'harassment', 'of', 'police,', 'hotel', 'staff,', 'and', 'moral', 'police?']
['what', 'hotel', 'in', 'fatehgarh', 'would', 'be', 'safe', 'for', 'unmarried', 'couples,', 'without', 'the', 'harassment', 'of', 'police,', 'hotel', 'staff,', 'and', 'moral', 'police?']
1 92677
['which', 'city', 'is', 'the', 'best', 'in', 'india?', 'why?']
['which', 'is', 'the', 'best', 'city', 'in', 'india?']
0 92685
['what', 'are', 'some', 'mind-blo

0 116724
['what', 'is', 'the', 'history', 'of', 'the', 'glassboro', 'train', 'station,', 'and', 'how', 'does', 'it', 'compare', 'to', 'bassendean?']
['what', 'is', 'the', 'history', 'of', 'the', 'glassboro', 'train', 'station,', 'and', 'how', 'does', 'it', 'compare', 'to', 'kwinana?']
0 116732
['how', 'is', 'the', 'word', "'audible'", 'used', 'in', 'a', 'sentence?']
['how', 'is', 'the', 'word', "'quibble'", 'used', 'in', 'a', 'sentence?']
0 116742
['what', 'are', 'some', 'tips', 'on', 'making', 'it', 'through', 'the', 'job', 'interview', 'process', 'at', 'citizens?']
['what', 'are', 'some', 'tips', 'on', 'making', 'it', 'through', 'the', 'job', 'interview', 'process', 'at', 'zendesk?']
0 116743
['what', 'is', 'cisco', '2.0?']
['what', 'is', 'cisco', 'ios?']
0 116759
['who', 'is', 'the', 'founder', 'of', 'patnabeats.com?']
['who', 'is', 'the', 'founder', 'of', 'findingclue.com?']
0 116763
['what', 'strikes', 'first', 'time', 'visitors', 'as', 'special', 'or', 'unusual', 'when', 'they', 

0 144125
['where', 'can', 'i', 'download', 'sherlock', 'holmes', 'season', '4?']
['where', 'can', 'i', 'download', 'sherlock', 'holmes', 'season', '3?']
0 144153
['what', 'hotel', 'in', 'anjaw', 'district', 'would', 'be', 'safe', 'for', 'unmarried', 'couples,', 'without', 'the', 'harassment', 'of', 'police,', 'hotel', 'staff,', 'and', 'moral', 'police?']
['what', 'hotel', 'in', 'panchkula', 'district', 'would', 'be', 'safe', 'for', 'unmarried', 'couples,', 'without', 'the', 'harassment', 'of', 'police,', 'hotel', 'staff,', 'and', 'moral', 'police?']
1 144184
['is', 'there', 'life', 'after', 'death?']
['is', 'there', 'life', 'after', 'life?']
0 144195
['where', 'can', 'i', 'find', 'a', 'list', 'of', "yahoo's", 'entire', 'product', 'offering?']
['where', 'can', 'i', 'find', 'a', 'list', 'of', "pinterest's", 'entire', 'product', 'offering?']
0 144196
['where', 'can', 'i', 'buy', 'quinoa', 'in', 'bangalore?']
['where', 'can', 'i', 'buy', 'quinoa', 'in', 'india?']
0 144199
['what', 'are', '

0 167450
['what', 'things', 'can', 'i', 'do', 'in', 'my', '50s', 'to', 'become', 'a', 'millionaire', 'by', '65?']
['what', 'things', 'can', 'i', 'do', 'in', 'my', '20s', 'to', 'become', 'a', 'millionaire', 'by', '30?']
0 167470
['how', 'has', 'quora', 'changed', 'your', 'views/beliefs', 'of', 'christians?']
['how', 'has', 'quora', 'changed', 'your', 'view/belief', 'of', 'men?']
1 167554
['what', 'are', 'sharpies', 'made', 'of,', 'and', 'how', 'are', 'they', 'made?']
['what', 'are', 'sharpies', 'made', 'of', 'and', 'how', 'are', 'they', 'made?']
1 167638
['is', 'this', 'sentence', 'grammatically', 'correct?', 'i', 'am', 'used', 'to', 'hard', 'work.']
['is', 'this', 'sentence', 'grammatically', 'correct?', 'i', 'am', 'used', 'to', 'hard', 'work.?']
0 167674
['what', 'is', 'your', 'definition', 'of', 'an', 'ideal', 'indian', 'boy?']
['what', 'is', 'your', 'definition', 'of', 'an', 'ideal', 'indian', 'girl?']
0 167695
['how', 'is', 'the', 'word', "'fragrant'", 'used', 'in', 'a', 'sentence?

0 191722
['what', 'are', 'some', 'show', 'and', 'tell', 'ideas', 'that', 'begin', 'with', 'the', 'letter', '"j"?']
['what', 'are', 'some', 'show', 'and', 'tell', 'ideas', 'that', 'begin', 'with', 'the', 'letter', '"z"?']
1 191729
['what', 'are', 'the', 'safety', 'precautions', 'on', 'handling', 'shotguns', 'proposed', 'by', 'the', 'nra', 'in', 'wisconsin?']
['what', 'are', 'the', 'safety', 'precautions', 'on', 'handling', 'shotguns', 'proposed', 'by', 'the', 'nra', 'in', 'pennsylvania?']
0 191730
['what', 'does', 'salesforce', 'want?']
['what', 'does', 'salesforce', 'mean?']
1 191760
['what', 'were', 'the', 'major', 'effects', 'of', 'the', 'cambodia', 'earthquake,', 'and', 'how', 'do', 'these', 'effects', 'compare', 'to', 'the', 'sanriku', 'earthquake', 'in', '1896?']
['what', 'were', 'the', 'major', 'effects', 'of', 'the', 'cambodia', 'earthquake,', 'and', 'how', 'do', 'these', 'effects', 'compare', 'to', 'the', 'iquique', 'earthquake', 'in', '1877?']
0 191787
['how', 'many', 'bollywo

['what', 'is', 'the', 'hardest', 'thing(s)', 'about', 'raising', 'children', 'in', 'japan?']
0 216549
['what', 'is', 'the', 'precise', 'meaning', 'of', 'the', 'sanskrit', 'word', 'समवेताः?']
['what', 'is', 'the', 'precise', 'meaning', 'of', 'the', 'sanskrit', 'word', 'अनुशासन?']
0 216626
['what', 'are', 'some', 'tips', 'on', 'making', 'it', 'through', 'the', 'job', 'interview', 'process', 'at', 'premier?']
['what', 'are', 'some', 'tips', 'on', 'making', 'it', 'through', 'the', 'job', 'interview', 'process', 'at', 'etsy?']
0 216630
['what', 'individuals', 'and', 'events', 'in', 'history', 'are', 'a', 'source', 'of', 'pride', 'for', 'qatar?']
['what', 'individuals', 'and', 'events', 'in', 'history', 'are', 'a', 'source', 'of', 'pride', 'for', 'israel?']
0 216650
['can', 'i', 'learn', 'karate', 'at', 'the', 'age', 'of', '25?']
['can', 'i', 'learn', 'karate', 'at', 'the', 'age', 'of', '30?']
1 216725
['what', 'is', 'epsom', 'salt?']
['what', 'is', 'epsom', 'salts?']
0 216768
['what', 'is',

['what', 'are', 'the', 'safety', 'precautions', 'on', 'handling', 'shotguns', 'proposed', 'by', 'the', 'nra', 'in', 'georgia?']
0 244601
['how', 'is', 'school', 'changing', 'in', 'the', '21st', 'century', 'in', 'palestine?']
['how', 'is', 'school', 'changing', 'in', 'the', '21st', 'century', 'in', 'japan?']
1 244632
['does', 'this', 'sentence', 'seem', 'weird?', '"the', 'alert', 'policeman', 'crossed', 'the', 'road."?']
['does', 'this', 'sentence', 'seem', 'weird?', '"the', 'alert', 'policeman', 'crossed', 'the', 'road."']
1 244650
['how', 'do', 'you', 'clean', 'your', 'ears?']
['how', 'do', 'you', 'clean', 'your', 'ear?']
0 244654
['what', 'are', 'the', 'safety', 'precautions', 'on', 'handling', 'shotguns', 'proposed', 'by', 'the', 'nra', 'in', 'colorado?']
['what', 'are', 'the', 'safety', 'precautions', 'on', 'handling', 'shotguns', 'proposed', 'by', 'the', 'nra', 'in', 'vermont?']
0 244655
['what', 'are', 'some', 'venture', 'capital', 'firms', 'that', 'focus', 'on', 'early', 'stage'

['does', 'fan', 'have', 'post', 'credits', 'scenes?']
['does', 'fan', '(2016', 'film)', 'have', 'post', 'credits', 'scenes?']
0 270133
['how', 'is', 'the', 'word', "'penance'", 'used', 'in', 'a', 'sentence?']
['how', 'is', 'the', 'word', "'perfidious'", 'used', 'in', 'a', 'sentence?']
0 270177
['how', 'does', 'the', 'hp', 'officejet', '4620', 'airprint', 'compare', 'to', 'the', 'hp', 'laserjet', 'enterprise', 'm605x?']
['how', 'does', 'the', 'hp', 'officejet', '4620', 'airprint', 'compare', 'to', 'the', 'hp', 'laserjet', 'enterprise', 'm605n?']
0 270181
['what', 'are', 'some', 'things', 'to', 'look', 'for', 'when', 'buying', 'a', 'house', 'or', 'condo', 'built', 'before', '1980?']
['what', 'are', 'some', 'things', 'to', 'look', 'for', 'when', 'buying', 'a', 'house', 'or', 'condo', 'built', 'before', '1950?']
0 270188
['what', 'is', 'the', 'latest', 'trend', 'of', 'ethnic', 'dress', 'for', 'girls?']
['what', 'is', 'the', 'latest', 'trend', 'of', 'ethnic', 'dress', 'for', 'men?']
0 27019

['what', 'is', 'a', 'suitable', 'solar', 'panel', 'installation', 'provider', 'in', 'glendale,', 'arizona', 'az?']
['what', 'is', 'a', 'suitable', 'solar', 'panel', 'installation', 'provider', 'in', 'florence,', 'arizona', 'az?']
0 297784
['what', 'is', 'the', 'lewis', 'dot', 'structure', 'of', 'ch3nh2?']
['what', 'is', 'the', 'lewis', 'dot', 'structure', 'of', 'c2h3cl?']
0 297832
['how', 'is', 'school', 'changing', 'in', 'the', '21st', 'century', 'in', 'afghanistan?']
['how', 'is', 'school', 'changing', 'in', 'the', '21st', 'century', 'in', 'japan?']
1 297848
['where', 'can', 'i', 'get', 'the', 'second', 'edition', 'of', 'alan', 'v.', "oppenheim's", '"signals', '&', 'systems?']
['where', 'can', 'i', 'get', 'the', 'second', 'edition', 'of', 'alan', 'v.', "oppenheim's", '"signals', '&', 'systems"', 'pdf?']
0 297906
['what', 'are', 'some', 'examples', 'of', 'sentences', 'using', 'the', 'word', '"brigand"?']
['what', 'are', 'some', 'examples', 'of', 'sentences', 'using', 'the', 'word', '"

0 321957
['do', 'you', 'consider', 'yourself', 'ugly?']
['do', 'you', 'consider', 'yourself', 'special?']
0 321980
['how', 'is', 'the', 'word', "'courtier'", 'used', 'in', 'a', 'sentence?']
['how', 'is', 'the', 'word', "'irascible'", 'used', 'in', 'a', 'sentence?']
1 321993
['what', 'is', 'pokémon?']
['what', 'is', 'a', 'pokémon?']
1 322029
['what', 'is', 'the', 'reason', 'behind', 'appointing', 'vijay', 'rupani', 'as', 'a', 'gujarat', 'cm?']
['what', 'is', 'the', 'reason', 'behind', 'appointing', 'vijay', 'rupani', 'as', 'gujarat', 'cm?']
0 322095
['what', 'language', 'is', 'used', 'in', 'visual', 'basic?', 'how', 'does', 'it', 'compare', 'to', 'python?']
['what', 'language', 'is', 'used', 'in', 'visual', 'basic?', 'how', 'does', 'it', 'compare', 'to', 'ruby?']
1 322102
['what', 'is', 'the', 'electoral', 'college?']
['what', 'is', 'the', 'electoral', 'college(usa)?']
0 322113
['what', 'individuals', 'and', 'events', 'in', 'history', 'are', 'a', 'source', 'of', 'pride', 'for', 'monaco?

0 348356
['what', 'is', 'unusual', 'or', 'different', 'about', 'the', 'food', 'and', 'cuisine', 'in', 'jamaica?']
['what', 'is', 'unusual', 'or', 'different', 'about', 'the', 'food', 'and', 'cuisine', 'in', 'germany?']
1 348361
['should', 'i', 'listen', 'to', 'my', 'heart', 'or', 'to', 'my', 'parents?']
['should', 'i', 'listen', 'to', 'my', 'heart', 'or', 'my', 'parents?']
0 348372
['what', 'is', 'a', 'cumulative', 'frequency?']
['what', 'is', 'cumulative', 'percentage?']
0 348382
['how', 'dangerous', 'is', 'tecate,', 'mexico?']
['how', 'dangerous', 'is', 'mexico?']
0 348425
['what', 'are', 'the', 'best', 'job', 'fairs', 'near', 'sacramento?', 'when', 'and', 'where', 'are', 'they', 'held?', 'what', 'were', 'your', 'experiences', 'at', 'them?']
['what', 'are', 'the', 'best', 'job', 'fairs', 'near', 'toronto?', 'when', 'and', 'where', 'are', 'they', 'held?', 'what', 'were', 'your', 'experiences', 'at', 'them?']
0 348471
['what', 'is', 'a', 'suitable', 'inpatient', 'drug', 'and', 'alcohol

0 373465
['what', 'individuals', 'and', 'events', 'in', 'history', 'are', 'a', 'source', 'of', 'pride', 'for', 'venezuela?']
['what', 'individuals', 'and', 'events', 'in', 'history', 'are', 'a', 'source', 'of', 'pride', 'for', 'singapore?']
0 373475
['what', 'are', 'the', 'main', 'imports', 'and', 'exports', 'of', 'venezuela,', 'and', 'how', 'does', "venezuela's", 'industry', 'compare', 'to', "aruba's?"]
['what', 'are', 'the', 'main', 'imports', 'and', 'exports', 'of', 'venezuela,', 'and', 'how', 'does', "venezuela's", 'industry', 'compare', 'to', "anguilla's?"]
0 373486
['what', 'are', 'the', 'best', 'restaurants', 'to', 'try', 'when', 'visiting', 'kindia,', 'guinea?', 'what', 'should', 'you', 'try', 'while', "you're", 'there?']
['what', 'are', 'the', 'best', 'restaurants', 'to', 'try', 'when', 'visiting', 'labé,', 'guinea?', 'what', 'should', 'you', 'try', 'while', "you're", 'there?']
1 373491
['what', 'was', 'the', 'significance', 'of', 'the', 'battle', 'of', 'somme,', 'and', 'how',

0 396136
['does', 'masturbation', 'affect', 'fertility?']
['does', 'masturbation', 'affect', 'immunity?']
1 396146
['how', 'can', 'i', 'become', 'radio', 'jockey?']
['how', 'can', 'i', 'become', 'a', 'radio', 'jockey?']
1 396223
['what', 'are', 'some', 'examples', 'of', 'vertebrated', 'animals?']
['what', 'are', 'some', 'examples', 'of', 'vertebrates?']
0 396271
['what', 'is', 'shopmebook.com?']
['what', 'is', 'zombo.com?']
0 396327
['what', 'type', 'of', 'government', 'does', 'guatemala', 'have?', 'how', 'does', 'it', 'compare', 'to', 'the', 'one', 'in', 'canada?']
['what', 'type', 'of', 'government', 'does', 'guatemala', 'have?', 'how', 'does', 'it', 'compare', 'to', 'the', 'one', 'in', 'poland?']
0 396383
['what', 'is', 'the', 'relation', 'between', 'electrical', 'engineering', 'and', 'the', 'environment?']
['what', 'is', 'the', 'relation', 'between', 'electrical', 'engineering', 'and', 'the', 'ecology?']
1 396390
['what', 'are', 'the', 'last', 'known', '30', 'digits', 'of', 'pi?']


In [None]:
nltk.corpus.stopwords.words('english')

In [59]:
cosine(vectors.query("PhD"), vectors.query("Ph.D."))

0.09376221895217896

In [49]:
"WW1" in vectors

True

In [65]:
"Chick-fil-A" in vectors

False

In [44]:
data[299692:299692+1]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,vec1,vec2,stop1,stop_vec1,stop2,stop_vec2
299692,299692,588624,588625,"[who, is, this]","[who, is, this]",0,"[0.12313239, 0.1113666, 0.0644162, 0.1328491, ...","[0.12313239, 0.1113666, 0.0644162, 0.1328491, ...",[],"[1e-32, 1e-32, 1e-32, 1e-32, 1e-32, 1e-32, 1e-...",[],"[1e-32, 1e-32, 1e-32, 1e-32, 1e-32, 1e-32, 1e-..."
