In [1]:
import re, os
from multiprocessing.dummy import Pool

import numpy as np
import pandas as pd
import swifter
from tqdm import tqdm
tqdm.pandas()

from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords


from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

# download('punkt') #tokenizer, run once
# download('stopwords') #stopwords dictionary, run once

np.random.seed = 0

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 1000)
pd.set_option('display.expand_frame_repr', False)

os.chdir(r'C:\Users\pbhavsa\OneDrive - MORNINGSTAR INC\git\msai')

In [2]:
df = pd.read_csv('data/data.tsv', sep='\t', header=None, nrows = 1000, names=['query_id', 'query', 'passage_text', 'label', 'passage_id'])
df.head()

Unnamed: 0,query_id,query,passage_text,label,passage_id
0,0,. what is a corporation?,"A company is incorporated in a specific nation, often within the bounds of a smaller subset of that nation, such as a state or province. The corporation is then governed by the laws of incorporation in that state. A corporation may issue stock, either private or public, or may be classified as a non-stock corporation. If stock is issued, the corporation will usually be governed by its shareholders, either directly or indirectly.",0,0
1,0,. what is a corporation?,"Today, there is a growing community of more than 2,100 Certified B Corps from 50 countries and over 130 industries working together toward 1 unifying goal: to redefine success in business. Join the Movement",0,1
2,0,. what is a corporation?,"Corporation definition, an association of individuals, created by law or under authority of law, having a continuous existence independent of the existences of its members, and powers and liabilities distinct from those of its members. See more.",0,2
3,0,. what is a corporation?,Examples of corporation in a Sentence. 1 He works as a consultant for several large corporations. 2 a substantial corporation that showed that he was a sucker for all-you-can-eat buffets.,0,3
4,0,. what is a corporation?,"1: a government-owned corporation (as a utility or railroad) engaged in a profit-making enterprise that may require the exercise of powers unique to government (as eminent domain) — called also government corporation, publicly held corporation",0,4


In [3]:
len(df)

1000

In [4]:
#Initialize Global variables
GloveEmbeddings = {}
max_query_words = 12
max_passage_words = 50
emb_dim = 300
# embeddingFileName = "glove.6B/glove.6B.%sd.txt"%emb_dim
embeddingFileName = "embeddings/paragram_300_sl999.txt"

stop_words = stopwords.words('english')

# stop_words

In [5]:
def preprocess1(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
#     if doc == []: doc = word_tokenize(text.lower())
    return doc

def loadEmbeddings(embeddingfile):
    global GloveEmbeddings,emb_dim

    fe = open(embeddingfile,"r",encoding="utf-8",errors="ignore")
    for line in fe:
        tokens= line.strip().split()
        word = tokens[0]
        vec = tokens[1:]
        vec = " ".join(vec)
        GloveEmbeddings[word]=vec
    #Add Zerovec, this will be useful to pad zeros, it is better to experiment with padding any non-zero constant values also.
    GloveEmbeddings["zerovec"] = "0.0 "*emb_dim
    fe.close()
    
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    if file == '../embeddings/crawl-300d-2M.vec':
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='utf-8') if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index
    
def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")
    
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab
    
def get_average_wv(words):
    feature_vector = []
    for word in words:
        if(word in GloveEmbeddings):
            try:
                vec = GloveEmbeddings[word]
            except:
                print(word)
                continue
            feature_vector.append(vec)
#         else:
#             vec = [float(v) for v in GloveEmbeddings["zerovec"].strip().split()]
#             feature_vector.append(vec)
    return np.average(feature_vector, axis=0)

In [6]:
df_eval = pd.read_csv('data/eval1_unlabelled.tsv', sep='\t', header=None, names=['query_id', 'query', 'passage_text', 'passage_id'])
df_eval.head()

Unnamed: 0,query_id,query,passage_text,passage_id
0,1135787,distance between erie in buffalo new york,"Erie Canal Distance Tables The Erie Canal is the longest canal section in the New York State Canal System. Beginning at lock #E-2 in Waterford, NY, the Erie Canal runs 293 NM (337 SM or 543 KM) to the Niagara River at Tonawanda, NY and from there onto Buffalo, NY and Lake Erie via the Black Rock Canal.",0
1,1135787,distance between erie in buffalo new york,"What is the distance between Erie AND Buffalo? The distance between Erie and Buffalo in a straight line is 87 miles or 139.98 Kilometers . Driving Directions & Drive Times from Erie to Buffalo can be found further down the page. Driving distances, maps and journey times are currently provided by Google mapping systems.",1
2,1135787,distance between erie in buffalo new york,"The distance between Erie and Buffalo in a straight line is 87 miles or 139.98 Kilometers. Driving Directions & Drive Times from Erie to Buffalo can be found further down the page. Driving distances, maps and journey times are currently provided by Google mapping systems.",2
3,1135787,distance between erie in buffalo new york,"Erie Canal Distances. Erie Canal Distance Tables. The Erie Canal is the longest canal section in the New York State Canal System. Beginning at lock #E-2 in Waterford, NY, the Erie Canal runs 293 NM (337 SM or 543 KM) to the Niagara River at Tonawanda, NY and from there onto Buffalo, NY and Lake Erie via the Black Rock Canal. The Erie Canal Distance Table (shown below) provides distances between some of the major points along the length of the canal.",3
4,1135787,distance between erie in buffalo new york,"Erie's Metropolitan Area consists of approximately 280,000 residents and an Urbanized Area population of approximately 195,000. The city is the seat of government for Erie County. Erie is near Cleveland, Ohio; Buffalo, New York; and Pittsburgh, Pennsylvania. Once teeming with heavy industry, Erie's manufacturing sector remains prominent in the local economy, though healthcare, higher education, and tourism are emerging as greater economic drivers.",4


In [7]:
df_vocab = pd.DataFrame(np.reshape(df_eval[['query', 'passage_text']].values, (-1,1)), columns=['text'])
print(df_vocab.shape)
df_vocab.head()

(208340, 1)


Unnamed: 0,text
0,distance between erie in buffalo new york
1,"Erie Canal Distance Tables The Erie Canal is the longest canal section in the New York State Canal System. Beginning at lock #E-2 in Waterford, NY, the Erie Canal runs 293 NM (337 SM or 543 KM) to the Niagara River at Tonawanda, NY and from there onto Buffalo, NY and Lake Erie via the Black Rock Canal."
2,distance between erie in buffalo new york
3,"What is the distance between Erie AND Buffalo? The distance between Erie and Buffalo in a straight line is 87 miles or 139.98 Kilometers . Driving Directions & Drive Times from Erie to Buffalo can be found further down the page. Driving distances, maps and journey times are currently provided by Google mapping systems."
4,distance between erie in buffalo new york


In [9]:
GloveEmbeddings = load_embed(embeddingFileName)
vocab = build_vocab(df_vocab['text'])
add_lower(GloveEmbeddings, vocab)
# get_average_wv(['how', 'are', 'you'])

Added 0 words to embedding


In [9]:
# query_vectors = np.load('data/query_vectors.npy')
# passage_vectors = np.load('data/passage_vectors.npy')

In [10]:
a = [[1,0], [8,9]]
b = [[1,1], [8,9]]
num = np.sum(np.multiply(a, b), axis=1)
denom = np.multiply(norm(a, axis=1), norm(b, axis=1))
np.divide(num, denom)

array([0.70710678, 1.        ])

### Pre-processing funcs

In [11]:
text_cols = ['query', 'passage_text']
    
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

mispell_dict = {'efinition': 'definition', 'onfidence':'confidence', 'irections': 'directions', 
                 'itamin':'vitamin', 'epending':'depending', 'auses':'causes', 
                 'nstructions':'instructions', 'ypically': 'typically', 'ummary':'summary',
                 'acronymfinder':'acronym finder', 'wikianswers®':'wiki answers', 
                 'ackground':'background', 'omments': 'comments', 'uscle': 'muscle',
                 'microsoft®': 'microsoft', 'iktionary': 'dictionary', 'nswered': 'answered',
                 'rigin': 'origin', 'ypical': 'typical', 'botox®': 'botox', 'elevance':'relevance',
                 'epatitis': 'hepatitis', 'alcium': 'calcium'
                 }

def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

def preprocess2(df, text_cols):
    for col in text_cols:
        df[col] = df[col].apply(lambda x: x.lower())
        df[col] = df[col].apply(lambda x: clean_contractions(x, contraction_mapping))
        df[col] = df[col].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
        df[col] = df[col].apply(lambda x: correct_spelling(x, mispell_dict))
    return df

In [12]:
df = preprocess2(df, text_cols)
df.head()

Unnamed: 0,query_id,query,passage_text,label,passage_id
0,0,. what is a corporation ?,"a company is incorporated in a specific nation , often within the bounds of a smaller subset of that nation , such as a state or province . the corporation is then governed by the laws of incorporation in that state . a corporation may issue stock , either private or public , or may be classified as a non - stock corporation . if stock is issued , the corporation will usually be governed by its shareholders , either directly or indirectly .",0,0
1,0,. what is a corporation ?,"today , there is a growing community of more than 2 , 100 certified b corps from 50 countries and over 130 industries working together toward 1 unifying goal : to redefine success in business . join the movement",0,1
2,0,. what is a corporation ?,"corporation ddefinition , an association of individuals , created by law or under authority of law , having a continuous existence independent of the existences of its members , and powers and liabilities distinct from those of its members . see more .",0,2
3,0,. what is a corporation ?,examples of corporation in a sentence . 1 he works as a consultant for several large corporations . 2 a substantial corporation that showed that he was a sucker for all - you - can - eat buffets .,0,3
4,0,. what is a corporation ?,"1 : a government - owned corporation ( as a utility or railroad ) engaged in a profit - making enterprise that may require the exercise of powers unique to government ( as eminent domain ) - called also government corporation , publicly held corporation",0,4


In [13]:
df_eval = preprocess2(df_eval, text_cols)
df_eval.head()

Unnamed: 0,query_id,query,passage_text,passage_id
0,1135787,distance between erie in buffalo new york,"erie canal distance tables the erie canal is the longest canal section in the new york state canal system . beginning at lock # e - 2 in waterford , ny , the erie canal runs 293 nm ( 337 sm or 543 km ) to the niagara river at tonawanda , ny and from there onto buffalo , ny and lake erie via the black rock canal .",0
1,1135787,distance between erie in buffalo new york,"what is the distance between erie and buffalo ? the distance between erie and buffalo in a straight line is 87 miles or 139 . 98 kilometers . driving ddirections & drive times from erie to buffalo can be found further down the page . driving distances , maps and journey times are currently provided by google mapping systems .",1
2,1135787,distance between erie in buffalo new york,"the distance between erie and buffalo in a straight line is 87 miles or 139 . 98 kilometers . driving ddirections & drive times from erie to buffalo can be found further down the page . driving distances , maps and journey times are currently provided by google mapping systems .",2
3,1135787,distance between erie in buffalo new york,"erie canal distances . erie canal distance tables . the erie canal is the longest canal section in the new york state canal system . beginning at lock # e - 2 in waterford , ny , the erie canal runs 293 nm ( 337 sm or 543 km ) to the niagara river at tonawanda , ny and from there onto buffalo , ny and lake erie via the black rock canal . the erie canal distance table ( shown below ) provides distances between some of the major points along the length of the canal .",3
4,1135787,distance between erie in buffalo new york,"erie ' s metropolitan area consists of approximately 280 , 000 residents and an urbanized area population of approximately 195 , 000 . the city is the seat of government for erie county . erie is near cleveland , ohio ; buffalo , new york ; and pittsburgh , pennsylvania . once teeming with heavy industry , erie ' s manufacturing sector remains prominent in the local economy , though healthcare , higher education , and tourism are emerging as greater economic drivers .",4


In [14]:
def cosine_similarity_list(a,b):
    num = np.sum(np.multiply(a, b), axis=1)
    denom = np.multiply(norm(a, axis=1), norm(b, axis=1))
    return np.divide(num, denom)

In [15]:
def cosine_similarity_pandas(row):
    q = get_average_wv(preprocess1(row['query']))
    a = get_average_wv(preprocess1(row['passage_text']))
    return 1 - spatial.distance.cosine(q,a)

# def cosine_similarity_pandas(row):
#     q = get_average_wv(row['query'].str().split())
#     a = get_average_wv(row['passage_text'].str().split())
#     return 1 - spatial.distance.cosine(q,a)

In [16]:
df['cs'] = df.swifter.apply(cosine_similarity_pandas, axis=1)
df.head(n=10)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 850.98it/s]


Unnamed: 0,query_id,query,passage_text,label,passage_id,cs
0,0,. what is a corporation ?,"a company is incorporated in a specific nation , often within the bounds of a smaller subset of that nation , such as a state or province . the corporation is then governed by the laws of incorporation in that state . a corporation may issue stock , either private or public , or may be classified as a non - stock corporation . if stock is issued , the corporation will usually be governed by its shareholders , either directly or indirectly .",0,0,0.584834
1,0,. what is a corporation ?,"today , there is a growing community of more than 2 , 100 certified b corps from 50 countries and over 130 industries working together toward 1 unifying goal : to redefine success in business . join the movement",0,1,0.398805
2,0,. what is a corporation ?,"corporation ddefinition , an association of individuals , created by law or under authority of law , having a continuous existence independent of the existences of its members , and powers and liabilities distinct from those of its members . see more .",0,2,0.356746
3,0,. what is a corporation ?,examples of corporation in a sentence . 1 he works as a consultant for several large corporations . 2 a substantial corporation that showed that he was a sucker for all - you - can - eat buffets .,0,3,0.627065
4,0,. what is a corporation ?,"1 : a government - owned corporation ( as a utility or railroad ) engaged in a profit - making enterprise that may require the exercise of powers unique to government ( as eminent domain ) - called also government corporation , publicly held corporation",0,4,0.626559
5,0,. what is a corporation ?,mcdonald ' s corporation is one of the most recognizable corporations in the world . a corporation is a company or group of people authorized to act as a single entity ( legally a person ) and recognized as such in law . early incorporated entities were established by charter ( i . e . by an ad hoc act granted by a monarch or passed by a parliament or legislature ) .,1,5,0.551896
6,0,. what is a corporation ?,"corporations are owned by their stockholders ( shareholders ) who share in profits and losses generated through the firm ' s operations , and have three distinct characteristics ( 1 ) legal existence : a firm can ( like a person ) buy , sell , own , enter into a contract , and sue other persons and firms , and be sued by them .",0,6,0.536037
7,0,. what is a corporation ?,"an association is an organized group of people who share in a common interest , activity , or purpose . 1 start a business plan your business . create your business structure types of business structures . 2 change or update your business add a new location to your existing business . add an endorsement to your existing business .",0,7,0.48999
8,0,. what is a corporation ?,b corp certification shines a light on the companies leading the global movement . . .,0,8,0.462866
9,0,. what is a corporation ?,llcs offer greater flexibility when it comes to income taxes . 1 the owner or member of an llc can have their income taxed in three ways : 2 a single owner llc is treated as a schedule c ( sole proprietor ) for tax purposes .,0,9,0.176873


In [21]:
df['cs'] = df.swifter.apply(cosine_similarity_pandas, axis=1)
df.head(n=10)

Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 902.02it/s]


Unnamed: 0,query_id,query,passage_text,label,passage_id,cs
0,0,. what is a corporation ?,"a company is incorporated in a specific nation , often within the bounds of a smaller subset of that nation , such as a state or province . the corporation is then governed by the laws of incorporation in that state . a corporation may issue stock , either private or public , or may be classified as a non - stock corporation . if stock is issued , the corporation will usually be governed by its shareholders , either directly or indirectly .",0,0,0.584834
1,0,. what is a corporation ?,"today , there is a growing community of more than 2 , 100 certified b corps from 50 countries and over 130 industries working together toward 1 unifying goal : to redefine success in business . join the movement",0,1,0.398805
2,0,. what is a corporation ?,"corporation ddefinition , an association of individuals , created by law or under authority of law , having a continuous existence independent of the existences of its members , and powers and liabilities distinct from those of its members . see more .",0,2,0.356746
3,0,. what is a corporation ?,examples of corporation in a sentence . 1 he works as a consultant for several large corporations . 2 a substantial corporation that showed that he was a sucker for all - you - can - eat buffets .,0,3,0.627065
4,0,. what is a corporation ?,"1 : a government - owned corporation ( as a utility or railroad ) engaged in a profit - making enterprise that may require the exercise of powers unique to government ( as eminent domain ) - called also government corporation , publicly held corporation",0,4,0.626559
5,0,. what is a corporation ?,mcdonald ' s corporation is one of the most recognizable corporations in the world . a corporation is a company or group of people authorized to act as a single entity ( legally a person ) and recognized as such in law . early incorporated entities were established by charter ( i . e . by an ad hoc act granted by a monarch or passed by a parliament or legislature ) .,1,5,0.551896
6,0,. what is a corporation ?,"corporations are owned by their stockholders ( shareholders ) who share in profits and losses generated through the firm ' s operations , and have three distinct characteristics ( 1 ) legal existence : a firm can ( like a person ) buy , sell , own , enter into a contract , and sue other persons and firms , and be sued by them .",0,6,0.536037
7,0,. what is a corporation ?,"an association is an organized group of people who share in a common interest , activity , or purpose . 1 start a business plan your business . create your business structure types of business structures . 2 change or update your business add a new location to your existing business . add an endorsement to your existing business .",0,7,0.48999
8,0,. what is a corporation ?,b corp certification shines a light on the companies leading the global movement . . .,0,8,0.462866
9,0,. what is a corporation ?,llcs offer greater flexibility when it comes to income taxes . 1 the owner or member of an llc can have their income taxed in three ways : 2 a single owner llc is treated as a schedule c ( sole proprietor ) for tax purposes .,0,9,0.176873


In [20]:
len(df_eval)

104170

In [18]:
df_eval['cs'] = df_eval.swifter.apply(cosine_similarity_pandas, axis=1)
df_eval.head()

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
Pandas Apply: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104170/104170 [01:53<00:00, 920.09it/s]


Unnamed: 0,query_id,query,passage_text,passage_id,cs
0,1135787,distance between erie in buffalo new york,"erie canal distance tables the erie canal is the longest canal section in the new york state canal system . beginning at lock # e - 2 in waterford , ny , the erie canal runs 293 nm ( 337 sm or 543 km ) to the niagara river at tonawanda , ny and from there onto buffalo , ny and lake erie via the black rock canal .",0,0.631949
1,1135787,distance between erie in buffalo new york,"what is the distance between erie and buffalo ? the distance between erie and buffalo in a straight line is 87 miles or 139 . 98 kilometers . driving ddirections & drive times from erie to buffalo can be found further down the page . driving distances , maps and journey times are currently provided by google mapping systems .",1,0.529501
2,1135787,distance between erie in buffalo new york,"the distance between erie and buffalo in a straight line is 87 miles or 139 . 98 kilometers . driving ddirections & drive times from erie to buffalo can be found further down the page . driving distances , maps and journey times are currently provided by google mapping systems .",2,0.442638
3,1135787,distance between erie in buffalo new york,"erie canal distances . erie canal distance tables . the erie canal is the longest canal section in the new york state canal system . beginning at lock # e - 2 in waterford , ny , the erie canal runs 293 nm ( 337 sm or 543 km ) to the niagara river at tonawanda , ny and from there onto buffalo , ny and lake erie via the black rock canal . the erie canal distance table ( shown below ) provides distances between some of the major points along the length of the canal .",3,0.601884
4,1135787,distance between erie in buffalo new york,"erie ' s metropolitan area consists of approximately 280 , 000 residents and an urbanized area population of approximately 195 , 000 . the city is the seat of government for erie county . erie is near cleveland , ohio ; buffalo , new york ; and pittsburgh , pennsylvania . once teeming with heavy industry , erie ' s manufacturing sector remains prominent in the local economy , though healthcare , higher education , and tourism are emerging as greater economic drivers .",4,0.512365


In [22]:
uniq, index = np.unique(df_eval['query_id'], return_index=True)
query_id = uniq[index.argsort()]
query_id[:5]

array([1135787,  281922,  120233,  319757,  193633], dtype=int64)

In [23]:
scores = df_eval['cs'].values.reshape(-1,10)
print(scores.shape)
scores

(10417, 10)


array([[0.63194883, 0.52950072, 0.44263837, ..., 0.44661158, 0.52314013,
        0.52950072],
       [0.62676936, 0.58504933, 0.42729929, ..., 0.50139046, 0.35140836,
        0.44546625],
       [0.56732315, 0.33952868, 0.27434143, ..., 0.44880232, 0.69334298,
        0.59132701],
       ...,
       [0.39129597, 0.57580882, 0.4602989 , ..., 0.40315902, 0.53362089,
        0.43069425],
       [0.4965497 , 0.49968326, 0.68661767, ..., 0.61596775, 0.62507588,
        0.55384755],
       [0.07156961, 0.35254332, 0.36625025, ..., 0.39824459, 0.30842263,
        0.16469923]])

In [24]:
print(query_id.shape)
scores.shape

(10417,)


(10417, 10)

In [25]:
answer = np.column_stack((query_id,scores))

In [26]:
answer = pd.DataFrame(answer)
answer.iloc[:,0] = answer.iloc[:,0].astype('int')
answer.to_csv('data/answer.tsv', sep='\t', header=None, index=False)
answer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1135787,0.631949,0.529501,0.442638,0.601884,0.512365,0.470188,0.193005,0.446612,0.52314,0.529501
1,281922,0.626769,0.585049,0.427299,0.590925,0.521186,0.42049,0.561364,0.50139,0.351408,0.445466
2,120233,0.567323,0.339529,0.274341,0.290288,0.638483,0.423648,0.374105,0.448802,0.693343,0.591327
3,319757,0.503534,0.451573,0.553113,0.405955,0.470845,0.513022,0.584795,0.414875,0.450322,0.505588
4,193633,0.487467,0.538764,0.800719,0.430609,0.442169,0.546619,0.420859,0.691221,0.534668,0.338755


In [27]:
df_b1 = pd.read_csv('data/answer-bl1.tsv', sep='\t', header=None)
df_b1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1135787,27.367275,36.495855,32.915477,30.555301,13.866893,26.627313,16.511408,8.397489,17.608676,36.68866
1,281922,11.744009,11.575771,8.146057,2.169596,1.03463,13.514372,2.190186,5.789057,10.278486,9.16484
2,120233,14.281651,0.0,23.911319,0.0,10.886682,10.71539,8.659164,20.936059,8.44772,14.982256
3,319757,-0.158872,-0.474097,6.921238,7.171402,11.052829,9.210453,19.166688,12.897403,7.666373,-0.472748
4,193633,14.82929,0.0,21.061208,9.554759,7.337612,11.054364,12.213071,21.539519,9.041871,0.0


In [28]:
problemid = 1124933
answer[answer.iloc[:,0]==problemid ]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
85,1124933,0.17693,0.069058,0.350504,0.250938,-0.153614,0.202595,0.245932,0.081394,0.050719,0.466091


In [31]:
# df_eval[df_eval.query_id==problemid]

In [32]:
submission = open(os.path.join("data/answer.tsv"), "r")
preds = dict()
truths = dict()
problemids = []
for sub in submission:
    try:
        sub = list(map(float, sub.strip("\n").split("\t")))
        preds[int(sub[0])] = sub[1:]
    except:
#         print(sub)
        if '\t' in sub:
            sub = sub.split('\t')[0]
        problemids.append(sub.strip())
problemids

['1123794',
 '1135284',
 '1118042',
 '1117419',
 '1115344',
 '1114993',
 '1114747',
 '1114743',
 '1134707',
 '1114533',
 '1113839',
 '1113834',
 '1113829',
 '1113815',
 '1113762',
 '1113688',
 '1113423',
 '1113353',
 '1110365',
 '1110345',
 '1108788',
 '1106838',
 '1103959',
 '1102617',
 '133607',
 '730985',
 '757680',
 '511300',
 '710310',
 '721005',
 '91913',
 '587074',
 '676274',
 '754092',
 '438011',
 '452640',
 '705429',
 '757348',
 '861500',
 '780848',
 '783948',
 '861865',
 '1127377',
 '1034290',
 '1127337',
 '761014',
 '862064',
 '1038909',
 '799201',
 '785528',
 '772199',
 '1126883',
 '765547',
 '1126351',
 '792952',
 '919355',
 '1057572',
 '857148',
 '113143',
 '1125241',
 '1125126',
 '1125095']

In [33]:
for problemid in problemids:
    problemid = int(problemid)
    answer[answer.iloc[:,0]==problemid] = df_b1[df_b1.iloc[:,0]==problemid]
answer.to_csv('data/answer.tsv', sep='\t', header=None, index=False)

In [34]:
answer[answer.iloc[:,0]==1115195]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
1330,1115195,0.557021,0.356912,0.72342,0.843406,0.64784,0.61097,0.17492,0.554027,0.665069,0.324322


In [35]:
submission = open(os.path.join("data/answer.tsv"), "r")
preds = dict()
truths = dict()
problemids = []
for sub in submission:
    try:
        sub = list(map(float, sub.strip("\n").split("\t")))
        preds[int(sub[0])] = sub[1:]
    except:
#         print(sub)
        if '\t' in sub:
            sub = sub.split('\t')[0]
        problemids.append(sub.strip())
problemids

[]