## Loading Imports

In [1]:
import os
import pandas as pd
import numpy as np
import regex as re
from tqdm import tqdm
import pprint as pp

In [2]:
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch import helpers

In [3]:
# to remove warnings for unverified requests
import urllib3
urllib3.disable_warnings()

# to remove warnings in Jupyter Notebook 
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
# mainly for checking the language and removing non-english language
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

In [5]:
# record audio imports

import wave
import time
import threading

import tkinter as tk
import pyaudio


In [6]:
# open AI init

import openai
openai.api_key = "<your-open-ai-key>"

## page crawl/get data

In [7]:
def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)

from urllib.parse import urlsplit
def get_domain(text):
    return (urlsplit(text).netloc)

In [8]:
df = pd.read_csv("./data/articles.csv", dtype=str)
df.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [9]:
df.drop_duplicates(["title"], inplace=True)

In [10]:
# filtering by language
df["language"] = df["title"].apply(lambda x: nlp(x)._.language["language"])
df = df[df["language"] == 'en']
print(df.shape)

(216, 7)


In [11]:
# checking domain names
df.link.apply(get_domain).value_counts()[:4]

medium.com                 129
towardsdatascience.com      30
hackernoon.com               6
medium.freecodecamp.org      6
Name: link, dtype: int64

In [12]:
# taking only 50 rows (for test)
df = df.iloc[:50,:]

In [13]:
df.reset_index(drop=True, inplace=True)
df.shape

(50, 7)

## create embeddings

In [14]:
from sentence_transformers import SentenceTransformer

In [15]:
model = SentenceTransformer("sentence-t5-base")

In [16]:
def get_tokens(documents): # since we are using dense vectors
#     documents = title + ": " + text
    sentences  = [documents]
    sentence_embeddings = model.encode(sentences)
    sentence_embeddings = (sentence_embeddings.flatten())
    return sentence_embeddings

In [17]:
df["combined_text"] = df["title"] + ": " + df["text"]

In [18]:
tqdm.pandas()
df["embedding_vectors"] = df["combined_text"].progress_apply(get_tokens)

100%|██████████| 50/50 [00:10<00:00,  4.81it/s]


In [19]:
df_json = df.to_dict("records")

## elasticSearch (local)

### connect to elasticSearch server

In [20]:
# For local
# to run elasticsearch server in local:
# > cd /<path-to-elasticsearch-folder>/elasticsearch-8.9.2/
# > ./bin/elasticsearch  

HOST_IP = "localhost"
PORT = 9200
CA_CERT_PATH = "/Users/arskayal/elasticsearch-8.9.2/config/certs/http_ca.crt"
USERNAME="elastic" # by default
PASSWORD="e5FmVlxHa568xrO33guk" # copied from the terminal

# es_conn = None

In [21]:

def connect_elastic():

    es_conn = Elasticsearch(
        [{
            "host": HOST_IP,
            "port": PORT
        }],
        http_auth=(USERNAME, PASSWORD),
        verify_certs=True,
        use_ssl=True,
        ca_certs=CA_CERT_PATH
    )
    if es_conn.ping():
        print("Connected to elasticsearch ... ")
    else:
        print("Elasticsearch connection error ...")
    return es_conn


In [22]:
es_conn = connect_elastic()

Connected to elasticsearch ... 


In [23]:
# list all existing indices
es_conn.indices.get_alias(index="*")

{'.security-7': {'aliases': {'.security': {'is_hidden': True}}}}

### create Index and upload data to index

In [24]:
index_name = "medium-article"

In [25]:
index_schema = {
    "mappings": {
        "properties": {
            "embedding_vectors": {  # column name
                "type": "dense_vector",
                "dims": 768, # based on the dimension of the model
                "index": True,
                "similarity": "dot_product"
            },
            "title": {  # column name
                "type": "text"
            },
            'text': {  # column name
                "type": "text"
            },
            'link': {  # column name
                "type": "text"
            },
            'author': {  # column name
                "type": "text"
            }
        }
    }
}

In [26]:
def create_index(es_client, index_name):
    # for dense vectors: https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html

    index_body = index_schema
    try:
        if not es_client.indices.exists(index_name):
            es_client.indices.create(index=index_name, body=index_body)
            print(f"Created Index -> {index_name}")
        else:
            print(f"Index {index_name} exists ...")

    except Exception as ex:
        print(str(ex))


def _insert(es_client, index_name, body):
#     index_name = "database300"
    if not es_client.indices.exists(index_name):
        create_index(es_client, index_name)

    es_client.index(index=index_name, body=body)


In [27]:
# create and add index
for job in tqdm(df_json):
    _insert(es_conn, index_name, job)

  0%|          | 0/50 [00:00<?, ?it/s]

Created Index -> medium-article


100%|██████████| 50/50 [00:03<00:00, 13.15it/s]


In [28]:
# get count of data in server
result = es_conn.count(index=index_name)

#print the total number of documents in the index
print(result)

{'count': 6, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


### search using text query

In [29]:
def semantic_search(es_client, index_name, query_vec, thresh=1.6, top_n=5):
    count = es_client.cat.count(index=index_name, params={"format": "json"})
    print('count', count)
    if not es_client.indices.exists(index=index_name):
        return "No records found"

    query2 = {
        "size": top_n,
        "query": {
            "bool": {
                "must": []
            }
        },
        "knn": {
            "field": "embedding_vectors",
            "query_vector": query_vec,
            "k": 10,
            "num_candidates": 50
        }
    }
    result = es_client.search(index=index_name, body=query2)
    total_match = len(result['hits']['hits'])
    print("Total Matches: ", str(total_match))

    output_list = []
    counter = 0
    if total_match > 0:
        for hit in result['hits']['hits']:
            counter += 1
            output_json = {
                           "title": hit["_source"]["title"],
                           "score": hit["_score"]}
            output_list.append(output_json)

    pp.pprint(output_list)
    return output_list, count


In [30]:
query_text = "renfrcement learning"
token_vector = get_tokens(query_text)
output, count = semantic_search(es_conn, index_name, token_vector)

count [{'epoch': '1696252256', 'timestamp': '13:10:56', 'count': '38'}]
Total Matches:  5
[{'score': 1.9171492,
  'title': 'Reinforcement Learning from scratch – Insight Data'},
 {'score': 1.9073708,
  'title': 'Machine Learning: how to go from Zero to Hero – freeCodeCamp'},
 {'score': 1.9070585,
  'title': 'Reinventing Social Sciences in the Era of Big Data – I love '
           'experiments – Medium'},
 {'score': 1.9066023,
  'title': 'Deep Learning Is Going to Teach Us All the Lesson of Our Lives: '
           'Jobs Are for Machines'},
 {'score': 1.9063506,
  'title': 'Every single Machine Learning course on the internet, ranked by '
           'your reviews'}]


## Voice Search

In [31]:
class VoiceSearch():
    def __init__(self, client):
        self.client = client
        self.filename = ""
        self.root = tk.Tk()
        self.recording = None
        self.root.resizable(False, False)
        self.button = tk.Button(text="rec", font=("Ariel", 50, "bold"),
                                command=self.click_handler)
        self.button.pack()
        
        self.label = tk.Label(text="00:00:00")
        self.label.pack()
        
        self.root.mainloop()
        print("exiting!")
        
    
    def click_handler(self):
        if self.recording:
            self.recording = False
            self.button.config(fg="black")
        else:
            self.recording = True
            self.button.config(fg="red")
            threading.Thread(target=self.record).start()
            
    def record(self):
        print("Starting session ...")
        audio = pyaudio.PyAudio()
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=44100,
                           input=True, frames_per_buffer=1023)
    
        frames = []
        start = time.time()
        
        while self.recording:
            data = stream.read(1024)
            frames.append(data)
            
            passed = time.time() - start
            secs = passed % 60
            mins = passed // 60
            hours = mins // 60
            self.label.config(text=f"{int(hours):02d}:{int(mins):02d}:{int(secs):02d}")
            
        stream.stop_stream()
        stream.close()
        audio.terminate()
        
        exists = True
        i = 1
        while exists:
            if os.path.exists(f"recordings/recording{i}.wav"):
                i+= 1
            else:
                exists = False
                
        self.filename = f"recordings/recording{i}.wav"
        sound_file = wave.open(self.filename, "wb")
        sound_file.setnchannels(1)
        sound_file.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
        sound_file.setframerate(44100)
        sound_file.writeframes(b"".join(frames))
        sound_file.close()
        print(f"audio recorded and saved at {self.filename} \n")
        
        # transcript
        audio_file= open(self.filename, "rb")
        self.transcript = openai.Audio.transcribe("whisper-1", audio_file)
        print(f"Transript: {self.transcript['text']}\n")
        
        # search
        token_vector = get_tokens(self.transcript["text"])
        semantic_search(self.client, index_name, token_vector)
        
        print("\n\n---------------end of session--------------------\n\n")
               


In [32]:
# Once you run this cell, close the newly window to continue to the next step
voice_obj = VoiceSearch(es_conn)

Starting session ...
audio recorded and saved at recordings/recording1.wav 

Transript: reinforcement learning.

count [{'epoch': '1696252274', 'timestamp': '13:11:14', 'count': '50'}]
Total Matches:  5
[{'score': 1.9167885,
  'title': 'Reinforcement Learning from scratch – Insight Data'},
 {'score': 1.9092228,
  'title': 'Machine Learning: how to go from Zero to Hero – freeCodeCamp'},
 {'score': 1.9084986,
  'title': 'From word2vec to doc2vec: an approach driven by Chinese restaurant '
           'process'},
 {'score': 1.9083655,
  'title': '6 Tricks I Learned From The OTTO Kaggle Challenge – Christophe '
           'Bourguignat – Medium'},
 {'score': 1.9065192,
  'title': 'Machine Learning เรียนอะไร, รู้ไปทําไม – O v e r f i t t e d – '
           'Medium'}]


---------------end of session--------------------


exiting!


## Delete Index

In [33]:
es_conn.indices.delete(index_name)

{'acknowledged': True}