In [1]:
import sys
from elasticsearch import Elasticsearch
import json
import time
import sys
# from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from bs4 import BeautifulSoup
import csv
import re
import numpy as np
from tqdm import tqdm
import pandas as pd

### Connecting to Elastic Search

In [43]:
# connect to ES on localhost on port 9200
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}])
# es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}], timeout=30, retry_on_timeout=True)
if es.ping():
	print('Connected to ES!')
else:
	print('Could not connect!')

print("*********************************************************************************");




Connected to ES!
*********************************************************************************


### Creating a Index inside Elastic Search

In [23]:
#Refer: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html
# Mapping: Structure of the index
  
b = {"mappings": {
  	"properties": {
    		"title": {
      			"type": "text"
    		},
    		"title_vector": {
      			"type": "dense_vector",
      			"dims": 512
		}  
	}
     }
   }


ret = es.indices.create(index='questions-index', ignore=400, body=b) #400 caused by IndexAlreadyExistsException, 
print(json.dumps(ret,indent=4))

# TRY this in browser: http://localhost:9200/questions-index

print("*********************************************************************************");


{
    "error": {
        "root_cause": [
            {
                "type": "resource_already_exists_exception",
                "reason": "index [questions-index/mvA4EETgTruME4-IZlU3aA] already exists",
                "index_uuid": "mvA4EETgTruME4-IZlU3aA",
                "index": "questions-index"
            }
        ],
        "type": "resource_already_exists_exception",
        "reason": "index [questions-index/mvA4EETgTruME4-IZlU3aA] already exists",
        "index_uuid": "mvA4EETgTruME4-IZlU3aA",
        "index": "questions-index"
    },
    "status": 400
}
*********************************************************************************


### Text Cleaning

In [5]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

def clean_text(sentance):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    return sentance

In [32]:
clean_text("hey! </>")

'hey'

### GloVE model

In [36]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
def get_glove_vectors_from_sentence(sentence, glove_model):
    cleaned_text=clean_text(sentence)
    sent_vec = np.zeros(300)
    sent=cleaned_text.split()
    sent_vec = np.array([0]*300)
    cnt_words=0
    for word in sent:
        
        try:
            vec = glove_model[word]
            sent_vec = sent_vec + vec
            cnt_words += 1
        except:
            pass
    if len(sent) != 0:
        sent_vec /= cnt_words
    return sent_vec
glove_model=load_glove_model("/Users/amansawarn/Documents/tech/iitd/Building-Search-and-Recommendations-Together/glove_6B/glove.6B.300d.txt")



Loading Glove Model
400000 words loaded!


In [37]:
glove_model['pip']

array([-4.1543e-01,  2.6432e-02, -2.9833e-01,  3.8520e-02, -1.6885e-01,
        1.4777e-01,  7.4666e-02, -9.9253e-02,  1.2404e-01,  3.3905e-01,
        3.2387e-01, -2.2475e-01,  2.9743e-01, -3.4243e-01,  3.3233e-01,
        5.4088e-01,  7.1009e-02, -5.2102e-02,  2.9350e-01, -2.2977e-01,
        2.6803e-01,  2.7157e-01, -3.0135e-02,  3.1491e-01,  5.3907e-01,
       -2.1553e-01, -4.1116e-01,  7.3269e-01,  6.2361e-02,  1.0754e-01,
       -8.8195e-02, -3.0234e-01,  4.7027e-01,  2.3538e-01,  7.3184e-02,
       -3.2322e-01,  1.4991e-01, -4.0437e-03, -6.9021e-02,  1.8356e-01,
       -4.1822e-02, -2.0838e-01, -4.5536e-01, -4.5171e-01, -3.3383e-01,
        5.4319e-02, -1.3049e-02, -3.3091e-01, -3.2058e-01, -3.5089e-02,
        2.5962e-01,  2.3639e-01,  2.8112e-01,  1.4933e-02,  9.7111e-02,
        3.6954e-01, -1.3614e-01, -7.0864e-02,  3.3607e-01, -5.2663e-02,
       -7.4458e-03, -3.8942e-01,  2.3830e-01, -2.1575e-01,  8.5522e-01,
        7.2343e-01,  3.4432e-01, -1.7570e-01,  3.3140e-01, -2.00

In [39]:
get_glove_vectors_from_sentence("how to install pip?", glove_model)


array([-0.105775  , -0.180179  , -0.292815  , -0.185675  , -0.16133   ,
       -0.00518   ,  0.383673  ,  0.0551535 ,  0.39287   , -0.30341   ,
        0.1852705 ,  0.143085  ,  0.40492   , -0.282405  ,  0.21715   ,
        0.387395  ,  0.0784145 , -0.232821  ,  0.113358  , -0.25917   ,
        0.237805  , -0.0208    ,  0.0405425 ,  0.263875  ,  0.141095  ,
       -0.37024   , -0.2848    ,  0.77685   , -0.0662145 ,  0.334875  ,
        0.0880625 ,  0.03691   ,  0.42693   ,  0.330575  ,  0.053031  ,
        0.090245  , -0.0582    , -0.17283685, -0.050465  ,  0.0376    ,
       -0.127841  ,  0.234615  , -0.2683885 , -0.089345  , -0.22611   ,
       -0.0242905 ,  0.2137205 , -0.326455  , -0.2298    , -0.3963745 ,
        0.19596   ,  0.11845613,  0.313205  , -0.2409635 ,  0.2518005 ,
        0.35633   , -0.004845  , -0.040631  , -0.05523   ,  0.2140835 ,
       -0.3028579 , -0.1510835 , -0.09157   , -0.16451   ,  0.54078   ,
        0.677395  ,  0.124162  ,  0.114485  ,  0.37663   ,  0.11

## Dumping Indices to Elastic Search

In [13]:
# df=pd.read_json("/Users/amansawarn/Documents/tech/iitd/Building-Search-and-Recommendations-Together/jupyter_notebooks/questions_titles90k.json", orient='records')
# list_of_questions_dict=df.to_dict('records')

with open(r"/Users/amansawarn/Documents/tech/iitd/Building-Search-and-Recommendations-Together/jupyter_notebooks/questions_titles90k.json", "r") as read_file:
    list_of_questions_dict = json.load(read_file)


In [20]:
list_of_questions_dict=list_of_questions_dict[:20000]

In [21]:
len(list_of_questions_dict)

20000

In [44]:
b = {"mappings": {
      "properties": {
        "title": {
            "type": "text"
            },
        
        "title_vector": {
            "type": "dense_vector",
            "dims": 512
                        }
                    }
     }
   }


ret = es.indices.create(index='titles_questions_only_vectors-index', ignore=400, body=b) #400 caused by IndexAlreadyExistsException, 
print(json.dumps(ret,indent=4))

{
    "acknowledged": true,
    "shards_acknowledged": true,
    "index": "titles_questions_only_vectors-index"
}


In [47]:
df=pd.read_json("/Users/amansawarn/Downloads/questions_w_embeddings (1).json")
df=df.to_dict('records')

In [48]:
df[0]

{'Id': 80,
 'OwnerUserId': 26.0,
 'CreationDate': '2008-08-01T13:57:07Z',
 'ClosedDate': None,
 'Score': 26,
 'Title': 'SQLStatement.execute() - multiple queries in one statement',
 'Body': '<p>I\'ve written a database generation script in <a href="http://en.wikipedia.org/wiki/SQL">SQL</a> and want to execute it in my <a href="http://en.wikipedia.org/wiki/Adobe_Integrated_Runtime">Adobe AIR</a> application:</p>\n\n<pre><code>Create Table tRole (\n      roleID integer Primary Key\n      ,roleName varchar(40)\n);\nCreate Table tFile (\n    fileID integer Primary Key\n    ,fileName varchar(50)\n    ,fileDescription varchar(500)\n    ,thumbnailID integer\n    ,fileFormatID integer\n    ,categoryID integer\n    ,isFavorite boolean\n    ,dateAdded date\n    ,globalAccessCount integer\n    ,lastAccessTime date\n    ,downloadComplete boolean\n    ,isNew boolean\n    ,isSpotlight boolean\n    ,duration varchar(30)\n);\nCreate Table tCategory (\n    categoryID integer Primary Key\n    ,categoryN

In [50]:
for question in tqdm(df):
    
    doc_id = question['Id']
    title = question['Title']
    
    title_vector = question['Title_embeddings']
#     title_vec = list(get_glove_vectors_from_sentence(title,glove_model))
#     body_vec = list(get_glove_vectors_from_sentence(ques,glove_model))
    
    data_to_dump_in_elastic_search = {"title":title,
            
                                      "title_vector":title_vector }
    res = es.index(index="titles_questions_only_vectors-index", id=doc_id, body=data_to_dump_in_elastic_search)
    
    

100%|████████████████████████████████████| 20000/20000 [01:42<00:00, 195.08it/s]


In [57]:
b = {"mappings": {
      "properties": {
        "title": {
            "type": "text"
            },
        
        "title_question_vector": {
            "type": "dense_vector",
            "dims": 300
                        }
                    }
     }
   }


ret = es.indices.create(index='titles_questions_vectors-index', ignore=400, body=b) #400 caused by IndexAlreadyExistsException, 
print(json.dumps(ret,indent=4))


{
    "acknowledged": true,
    "shards_acknowledged": true,
    "index": "titles_questions_vectors-index"
}


In [58]:
for question in tqdm(list_of_questions_dict):
    
    doc_id = question['Id']
    title = question['Title']
    ques = question['Body']
    title_body = question['title_body']
#     title_vec = list(get_glove_vectors_from_sentence(title,glove_model))
#     body_vec = list(get_glove_vectors_from_sentence(ques,glove_model))
    title_body_vec = list(get_glove_vectors_from_sentence(title_body,glove_model))
#     title_body_vec = [float(i) for i in title_body_vec]
    data_to_dump_in_elastic_search = {"title":title_body,
            
                                      "title_question_vector":title_body_vec }
    res = es.index(index="titles_questions_vectors-index", id=doc_id, body=data_to_dump_in_elastic_search)
    
    

100%|████████████████████████████████████| 20000/20000 [02:02<00:00, 162.73it/s]


In [None]:
# curl -X GET "localhost:9200/titles_questions_vectors-index/_stats?pretty"