In [11]:
import re, os
from multiprocessing.dummy import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
from scipy import spatial

# download('punkt') #tokenizer, run once
# download('stopwords') #stopwords dictionary, run once

# Seed NumPy's legacy global RNG so any random steps are reproducible.
# NOTE: this must be a call; assigning `np.random.seed = 0` would replace the
# seeding function with an int and leave the generator unseeded.
np.random.seed(0)

# Raise pandas' display limits (rows, columns, total width, per-column width)
# and turn off wrapping of wide frames across multiple lines.
_DISPLAY_OPTIONS = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'max_colwidth': 1000,
    'display.expand_frame_repr': False,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

# All 'data/...' and 'glove.6B/...' paths in this notebook are relative to the
# working directory set here. Set the MSAI_PROJECT_DIR environment variable to
# run on another machine; without it the original checkout location is used.
os.chdir(os.environ.get('MSAI_PROJECT_DIR', r'C:\Users\pbhavsa\OneDrive - MORNINGSTAR INC\git\msai'))

In [14]:
# Read the raw tab-separated dataset. The file has no header row, so pandas
# labels the columns with the integers 0..4.
df = pd.read_csv('data/data_p.tsv', delimiter='\t', header=None)
df.head(5)

Unnamed: 0,0,1,2,3,4
0,100126,how do i achieve my sphr certificate,"Applications can be completed online at the Human Resources Certification Institutes website. The cost of the SPHR Certification exam itself is $425, but depending on your method of preparation, total costs can vary.",0,7
1,100075,how much fat is in a bubba burger,"Meat and meat products. Beef. Bubba Burgers, Gluten Free, Original (32 oz.) Bubba Burgers, Gluten Free, Original (32 oz.) FREE: You can’t exercise your way out of a bad diet. FoodFacts shows you how to. change the way you eat, by showing you what's in the food you're eating. Allergens & Ingredients 1 Allergic.",0,9
2,1004,do spiders eat other animals,"Spider - Description. 1 1: pedipalp 2: trichobothria 3: carapace of prosoma (cephalothorax) 2 15: sternum of prosoma 16: pedicel (also called pedicle) 3 Spiders are chelicerates and therefore arthropods. 4 Spiders and scorpions are members of one chelicerate group, the arachnids.",0,5
3,100490,definition of chiropractic,"chiropractic [(keye-ruh-prak-tik)] A system of treating disease and musculoskeletal disorders that involves manipulation of the backbone and other body parts. In chiropractic, disorders of the nerves are considered the cause of illness.",0,7
4,10283,the ________ is divided into the parasympathetic nervous system and the sympathetic nervous system.,"What is the sympathetic nervous system? The sympathetic nervous system, also part of the autonomic nervous system, originates in the spinal cord; specifically in the thoracic and lumbar regions. It controls the body's fight or flight responses, or how the body reacts to perceived danger. Sympathetic vs Parasympathetic Responses",0,3


In [171]:
# ---- Notebook-wide settings ----

# Data file locations (relative to the working directory).
inputfile = 'data/data_p.tsv'
trainFileName = "data/traindata.tsv"
validationFileName = "data/validationdata.tsv"
EvaluationFileName = "data/eval1_unlabelled.tsv"

# Embedding settings: vector size and the GloVe file matching that size.
emb_dim = 300
embeddingFileName = "glove.6B/glove.6B.%sd.txt" % (emb_dim,)
GloveEmbeddings = {}  # filled by loadEmbeddings: word -> space-separated vector string

# Number of tokens each query / passage is padded or trimmed to in TextDataToWV.
max_query_words = 12
max_passage_words = 50

# English stop words removed by preprocess().
stop_words = stopwords.words('english')

In [220]:
def preprocess(text):
    """Lower-case ``text``, tokenize it and drop stop words and non-alphabetic tokens.

    Returns the surviving tokens as a list, in their original order.
    """
    tokens = word_tokenize(text.lower())
    return [
        token
        for token in tokens
        # keep a token only if it is not a stop word and is purely alphabetic
        if token not in stop_words and token.isalpha()
    ]

def loadEmbeddings(embeddingfile):
    """Fill the global ``GloveEmbeddings`` dict from a whitespace-separated embedding file.

    Every non-blank line is read as ``word v1 v2 ... vN``. The vector is stored
    as a single space-joined string keyed by the word (the format that
    ``get_average_wv`` parses). After loading, a ``"zerovec"`` entry holding
    ``emb_dim`` zeros is added.

    Args:
        embeddingfile: path of the embedding text file.

    Raises:
        OSError: if the file cannot be opened.
    """
    global GloveEmbeddings,emb_dim

    # `with` closes the handle even if an error occurs while parsing.
    with open(embeddingfile,"r",encoding="utf-8",errors="ignore") as fe:
        for line in fe:
            tokens = line.split()
            if not tokens:
                # A blank line has no word; skip it instead of failing on tokens[0].
                continue
            GloveEmbeddings[tokens[0]] = " ".join(tokens[1:])
    #Add Zerovec, this will be useful to pad zeros, it is better to experiment with padding any non-zero constant values also.
    GloveEmbeddings["zerovec"] = "0.0 "*emb_dim
    
def get_average_wv(words):
    """Return the element-wise mean of the embedding vectors of ``words``.

    Words that are not keys of ``GloveEmbeddings`` contribute the ``"zerovec"``
    entry, so they still count toward the denominator of the mean.

    Args:
        words: sequence of tokens.

    Returns:
        1-D numpy array holding the mean vector. For an empty ``words`` a zero
        vector with the same length as ``"zerovec"`` is returned, so callers
        always get a vector rather than the NaN scalar that averaging an
        empty list produces.

    Raises:
        KeyError: if ``"zerovec"`` is needed (empty input or an unknown word)
            but is missing from ``GloveEmbeddings``.
    """
    def _parse(key):
        # Embeddings are stored as space-separated strings; convert to floats.
        return [float(v) for v in GloveEmbeddings[key].split()]

    if len(words) == 0:
        return np.zeros(len(_parse("zerovec")))

    zero_vec = None  # parsed lazily, at most once per call
    feature_vector = []
    for word in words:
        if word in GloveEmbeddings:
            feature_vector.append(_parse(word))
        else:
            if zero_vec is None:
                zero_vec = _parse("zerovec")
            feature_vector.append(zero_vec)
    return np.average(feature_vector, axis=0)

In [173]:
# Sanity check: run preprocess() on the value in the second row, column 2.
text = df.iat[1, 2]
preprocess(text)

['meat',
 'meat',
 'products',
 'beef',
 'bubba',
 'burgers',
 'gluten',
 'free',
 'original',
 'oz',
 'bubba',
 'burgers',
 'gluten',
 'free',
 'original',
 'oz',
 'free',
 'exercise',
 'way',
 'bad',
 'diet',
 'foodfacts',
 'shows',
 'change',
 'way',
 'eat',
 'showing',
 'food',
 'eating',
 'allergens',
 'ingredients',
 'allergic']

In [174]:
# Fill GloveEmbeddings from the configured embedding file.
loadEmbeddings(embeddingfile=embeddingFileName)
# get_average_wv(['how', 'are', 'you'])

In [175]:
def TextDataToWV(inputfile, query_out='data/query_vectors', passage_out='data/passage_vectors'):
    """Turn every (query, passage) row of a TSV file into averaged word vectors and save them.

    Expected line format: query_id \\t query \\t passage \\t label \\t passage_id.
    Only the query and passage fields are read. Each text is lower-cased,
    passed through ``preprocess``, trimmed or padded with ``"zerovec"`` to
    ``max_query_words`` / ``max_passage_words`` tokens, and averaged with
    ``get_average_wv``.

    Args:
        inputfile: path of the tab-separated input file.
        query_out: target path for the query matrix (``np.save`` appends
            ``.npy`` when missing). Defaults to the original hard-coded path.
        passage_out: target path for the passage matrix, same convention.

    Side effects:
        Prints the number of input lines and writes two float32 arrays of
        shape (n_lines, emb_dim) to ``query_out`` and ``passage_out``.

    Raises:
        IndexError: if a line has fewer than three tab-separated fields.
    """
    def _fixed_length_average(text, max_words):
        # Trim extra tokens, then pad with "zerovec" so exactly max_words are averaged.
        words = preprocess(text)[:max_words]
        words += ["zerovec"] * (max_words - len(words))
        return get_average_wv(words)

    # First pass only counts lines so the output arrays can be pre-allocated.
    # Each pass uses its own `with` block so no file handle is left open.
    with open(inputfile, "r", encoding="utf-8", errors="ignore") as f:
        n_lines = sum(1 for _ in f)
    print(n_lines)
    query_vectors = np.zeros((n_lines, emb_dim), dtype=np.float32)
    passage_vectors = np.zeros((n_lines, emb_dim), dtype=np.float32)

    with open(inputfile, "r", encoding="utf-8", errors="ignore") as f:
        for i, line in enumerate(tqdm(f, total=n_lines)):
            tokens = line.strip().lower().split("\t")
            query, passage = tokens[1], tokens[2]
            query_vectors[i, :] = _fixed_length_average(query, max_query_words)
            passage_vectors[i, :] = _fixed_length_average(passage, max_passage_words)

    np.save(query_out, query_vectors)
    np.save(passage_out, passage_vectors)

In [176]:
# One-off step: uncomment to (re)build data/query_vectors.npy and data/passage_vectors.npy.
# TextDataToWV(inputfile)

In [177]:
# Load the saved query vectors and preview the first row.
# (Display the array that was just loaded; `a` is not defined anywhere above this cell.)
query_vectors = np.load('data/query_vectors.npy')
query_vectors[:1]

array([[-7.72671700e-02,  1.33255005e-01,  9.02333297e-03,
        -7.70941675e-02,  7.58008361e-02,  1.72386672e-02,
         3.05008329e-02, -5.26166670e-02,  1.49550006e-01,
         8.28561634e-02,  1.40221670e-01,  5.81533350e-02,
         3.75550017e-02, -6.95841685e-02, -1.55074997e-02,
         4.92458344e-02, -1.14083337e-02,  1.57451667e-02,
         8.12449977e-02, -4.87955846e-02,  4.62933332e-02,
         4.21416685e-02, -4.47116680e-02, -9.65033323e-02,
         4.80137505e-02, -1.34985000e-01, -8.72633308e-02,
        -1.46935835e-01, -6.41416665e-03,  2.81416662e-02,
         3.87683332e-01,  7.53149986e-02, -2.05016673e-01,
        -5.38025014e-02,  1.44632503e-01, -3.01900003e-02,
         1.09588332e-01,  7.19183311e-02, -3.43233347e-02,
        -1.04147501e-01,  3.09166673e-04, -1.88608333e-01,
         2.36416664e-02,  3.53191681e-02, -1.49660841e-01,
        -1.05861664e-01,  6.53624982e-02,  1.40525833e-01,
         7.68377036e-02,  6.98000006e-03]], dtype=float32)

In [178]:
# Load the saved passage vectors and preview the first row.
# (Display the array that was just loaded; `a` is not defined anywhere above this cell.)
passage_vectors = np.load('data/passage_vectors.npy')
passage_vectors[:1]

array([[ 5.2754842e-02,  2.4805017e-02,  6.0253602e-02, -1.6258260e-02,
        -3.8344037e-02,  1.2528920e-02, -5.3626139e-02, -1.7626020e-01,
         2.8071639e-01,  3.2800280e-02,  7.9087563e-02, -5.4271478e-02,
         1.1345578e-01, -8.1654683e-02,  1.0957960e-01, -2.1444101e-02,
        -4.9717382e-02,  2.1745481e-02,  2.7835939e-02, -1.4682667e-01,
         1.5809610e-01, -7.6202601e-02,  3.8391132e-02, -4.5927040e-02,
        -6.9674239e-03, -2.1974353e-01, -9.0616062e-02, -1.7701054e-01,
        -5.2986581e-02,  5.3307880e-02,  9.4379401e-01,  4.0093000e-04,
        -1.2065554e-01, -1.9842941e-01,  6.9153823e-02,  5.6132529e-02,
         1.1889410e-01,  1.8512060e-01,  4.5860078e-02, -5.5571720e-02,
         1.1778220e-02, -1.5297364e-01,  9.6865021e-02,  2.0596980e-01,
        -1.3228372e-01, -6.8142161e-02,  9.1977268e-02,  2.3333991e-01,
         4.2583983e-02,  6.6842481e-02]], dtype=float32)

In [186]:
# Peek at the first 50 values of column 0 (the query ids).
df.iloc[:50, 0].to_numpy()

array([100126, 100075,   1004, 100490,  10283,  10726, 100798, 100744,
       100724, 100164, 100819, 100341,  10682,  10220,  10161,  10432,
       100107, 100941, 100694, 100762, 100825, 100555,  10691, 100855,
        10603,  10027,  10349, 100870, 100734, 101075, 101073, 101006,
       100024,  10372, 100172,  10651,  10652,  10391, 100849,  10143,
       100390, 100977,  10709,  10052, 100869, 100163, 100542,  10335,
       100636,  10159], dtype=int64)

In [269]:
# wrong 10161 100490 10283
# NOTE(review): the ids above were flagged "wrong" by the author; what that means is not documented here.
id = 100798
# Keep only the rows whose first column equals `id`, name the columns, and
# add a placeholder `cs` (cosine similarity) column initialised to 0.
df_t = (
    df.loc[df.iloc[:, 0] == id]
    .reset_index()
    .rename(columns={0:'query_id',1:'query', 2:'passage_text', 3:'label', 4:'passage_id'})
    .assign(cs=0)
)
df_t

Unnamed: 0,index,query_id,query,passage_text,label,passage_id,cs
0,6,100798,how long before a bearded dragon starts to grow,"The average is 15 to 18 inches - however, there are exceptions.",0,9,0
1,921,100798,how long before a bearded dragon starts to grow,"Bearded dragons reach sexual maturity between the ages of 8 months and 18 months. Determining the gender of your bearded dragon before maturity isn't easy. When grown, males are larger than females, with notably bigger heads.",1,4,0
2,294742,100798,cost of access card,"HID 1326 ProxCard II is the value-priced industry choice for prox card access control. Designed for 125 kHz access control, HID 1326 ProxCard II offers 26 bits standard, more than 137 million possible codes, up to 24” read range, and a vertical clamshell card format.",0,9,0
3,295345,100798,cost of access card,"Card Access Control Average Costs. Expect to pay an average of $1,500 to $2,500 per door for a high quality system for up to 150 people. If you’re looking for a card access service that will service 150+ employees and have two to three access doors it will cost upwards of $2,500 to $3,500.",1,0,0
4,295608,100798,cost of access card,"HID 1326 ProxCard II is the value-priced industry choice for prox card access control. Designed for 125 kHz access control, HID 1326 ProxCard II offers 26 bits standard, more than 137 million possible codes, up to 24” read range, and a vertical clamshell card format. Standard 26-Bit/H10301 Format.",0,8,0


In [270]:
def cosine_similarity(row):
    """Cosine similarity between the averaged word vectors of a row's query and passage.

    Args:
        row: mapping (e.g. a DataFrame row) with ``'query'`` and
            ``'passage_text'`` string entries.

    Returns:
        ``1 - cosine distance`` of the two averaged vectors, or NaN when
        either text has no tokens left after ``preprocess`` (the similarity
        is undefined in that case and there is nothing to average).
    """
    query_words = preprocess(row['query'])
    passage_words = preprocess(row['passage_text'])
    # E.g. a query made only of stop words: avoid averaging an empty list,
    # which would not yield a vector that can be compared.
    if not query_words or not passage_words:
        return float('nan')
    q = get_average_wv(query_words)
    p = get_average_wv(passage_words)
    return 1 - spatial.distance.cosine(q, p)

In [271]:
# Score each row's passage against its query and store the result in `cs`.
df_t['cs'] = df_t.apply(cosine_similarity, axis=1)
df_t

Unnamed: 0,index,query_id,query,passage_text,label,passage_id,cs
0,6,100798,how long before a bearded dragon starts to grow,"The average is 15 to 18 inches - however, there are exceptions.",0,9,0.392703
1,921,100798,how long before a bearded dragon starts to grow,"Bearded dragons reach sexual maturity between the ages of 8 months and 18 months. Determining the gender of your bearded dragon before maturity isn't easy. When grown, males are larger than females, with notably bigger heads.",1,4,0.691843
2,294742,100798,cost of access card,"HID 1326 ProxCard II is the value-priced industry choice for prox card access control. Designed for 125 kHz access control, HID 1326 ProxCard II offers 26 bits standard, more than 137 million possible codes, up to 24” read range, and a vertical clamshell card format.",0,9,0.787717
3,295345,100798,cost of access card,"Card Access Control Average Costs. Expect to pay an average of $1,500 to $2,500 per door for a high quality system for up to 150 people. If you’re looking for a card access service that will service 150+ employees and have two to three access doors it will cost upwards of $2,500 to $3,500.",1,0,0.855234
4,295608,100798,cost of access card,"HID 1326 ProxCard II is the value-priced industry choice for prox card access control. Designed for 125 kHz access control, HID 1326 ProxCard II offers 26 bits standard, more than 137 million possible codes, up to 24” read range, and a vertical clamshell card format. Standard 26-Bit/H10301 Format.",0,8,0.772008


In [272]:
# def num_replacer(x):
#     return re.sub(r'[0-9\,]+', 'number', x)
#     return re.sub(r'[\n()]+', ' ', x).lower()

In [274]:
# t = 'There are 340- -calories in a 1 burger serving of Bubba Burger Sweet Onion Burger. Calorie breakdown: 70% fat, 2% carbs, 28% protein.	'
# num_replacer(t)