In [3]:
from elasticsearch import Elasticsearch

In [29]:
# Client for the Elasticsearch node at localhost:9200 (plain HTTP, no credentials passed).
es = Elasticsearch("http://localhost:9200")

In [136]:
# Index definition for the podcast transcripts: a single-shard, zero-replica index
# whose text fields all go through one custom analyzer.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "analyzer": {
                # Standard analyzer configured with the built-in English stopword list.
                "english_with_stop": {"type": "standard", "stopwords": "_english_"}
            }
        },
    },
    "mappings": {
        "properties": {
            # Exact-match identifier; not analyzed.
            "video_id": {"type": "keyword"},
            # `title` and `subtitles` share the same full-text configuration,
            # applied at both index time and search time.
            **{
                text_field: {
                    "type": "text",
                    "analyzer": "english_with_stop",
                    "search_analyzer": "english_with_stop",
                }
                for text_field in ("title", "subtitles")
            },
        }
    },
}

In [137]:
# Drop any existing "podcasts" index so the next cell can recreate it from scratch.
# The delete-index API takes no request body, so `index_settings` is not passed here.
# `ignore_unavailable=True` keeps this cell from raising when the index does not
# exist yet (first run / fresh cluster), so the notebook survives Restart & Run All.
es.indices.delete(index="podcasts", ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [138]:
# Create the "podcasts" index with the settings and mappings defined in `index_settings`.
es.indices.create(index="podcasts", body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'podcasts'})

In [114]:
from pathlib import Path

# Directory holding the subtitle transcripts, relative to the notebook's working directory.
data = Path('data/')

# Every .txt file directly inside `data`, in sorted path order.
files = list(data.glob('*.txt'))
files.sort()

In [115]:
# Use the first transcript as a sample; raises IndexError if no .txt files were found.
subtitle_file = files[0]

In [116]:
def read_doc(subtitle_file):
    """
    Parse one subtitle file into a document dict.

    The file is read as UTF-8 and split on newlines. The first line is used
    as the title, the second line is skipped, and all remaining lines are
    re-joined and stripped of surrounding whitespace to form the subtitles.

    Args:
        subtitle_file: a path object exposing `read_text()` and `stem`
            (e.g. `pathlib.Path`).

    Returns:
        dict with keys `video_id` (the file name without its extension),
        `title` and `subtitles`.
    """
    # str.split always yields at least one element, so the unpacking cannot fail.
    title_line, *remaining = subtitle_file.read_text(encoding='utf8').split('\n')

    # remaining[0] is the line right after the title; it is dropped.
    transcript = '\n'.join(remaining[1:]).strip()

    return {
        "video_id": subtitle_file.stem,
        "title": title_line,
        "subtitles": transcript,
    }

In [117]:
# Parse the sample transcript into a dict with video_id / title / subtitles.
doc = read_doc(subtitle_file)

In [118]:
# Preview the first 500 characters of the transcript; print() renders the newlines.
print(doc['subtitles'][:500])

0:00 everyone Welcome to our event this event
0:02 is brought to you by datadox club which
0:04 is a community of people who love data
0:05 we have weekly events and today is one
0:07 of such events if you want to find out
0:09 more about the events we have there is a
0:11 link in the description go there check
0:13 it out and see what you like do not
0:16 forget to subscribe to our YouTube
0:17 channel this way you will get notified
0:20 about amazing live streams like we have
0:23 today and we


In [119]:
# Index the single parsed sample document, keyed by its video id.
# The id is read from `doc`: `video_id` exists only as a local inside read_doc(),
# so referencing it directly here raises NameError on a fresh kernel.
es.index(index="podcasts", id=doc['video_id'], document=doc)

ObjectApiResponse({'_index': 'podcasts', '_id': '-Gj7SaI-QW4', '_version': 1, 'result': 'created', '_shards': {'total': 1, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [120]:
from tqdm.auto import tqdm

In [121]:
# Parse and index every transcript, using the video id as the document id.
# `files` still includes files[0], which the earlier single-document cell already
# indexed; sending it again under the same id presumably replaces that document
# rather than duplicating it -- confirm against the index API docs.
# Note: this loop rebinds the notebook-level `subtitle_file` and `doc` names.
for subtitle_file in tqdm(files):
    doc = read_doc(subtitle_file)
    es.index(index="podcasts", id=doc['video_id'], document=doc)

  0%|          | 0/78 [00:00<?, ?it/s]

In [130]:
def search_videos(query: str, size: int = 10):
    """
    Full-text search over the `podcasts` index.

    The query is matched against `title` and `subtitles` with a
    `best_fields` multi_match, where `title` carries a 3x boost. Matched
    terms are highlighted by wrapping them in asterisks.

    Args:
        query: free-text search string.
        size: maximum number of hits requested.

    Returns:
        The response object returned by `es.search`.
    """
    # Relevance clause: same analyzer as the one configured on the index fields.
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["title^3", "subtitles"],
            "type": "best_fields",
            "analyzer": "english_with_stop",
        }
    }

    # Highlighting: one ~150-character snippet from the subtitles, plus the title.
    snippets = {
        "pre_tags": ["*"],
        "post_tags": ["*"],
        "fields": {
            "subtitles": {"fragment_size": 150, "number_of_fragments": 1},
            "title": {},
        },
    }

    return es.search(
        index='podcasts',
        body={"size": size, "query": text_match, "highlight": snippets},
    )

In [131]:
# Example query, using the default of 10 requested hits.
result = search_videos('how do I get rich')

In [132]:
# Take the first hit in the response; raises IndexError if the search returned no hits.
res = result.body['hits']['hits'][0]

In [133]:
# Show the highlighted snippets for that hit (matched terms are wrapped in `*`).
res['highlight']

{'subtitles': ["of videos.\n17:02 >> But that's *how* *I* *do* this."],
 'title': ['*How* to Build and Evaluate AI systems in the Age of LLMs']}

In [139]:
# Fetch the live mapping to check which analyzer each field actually uses.
print(es.indices.get_mapping(index="podcasts"))


{'podcasts': {'mappings': {'properties': {'subtitles': {'type': 'text', 'analyzer': 'english_with_stop'}, 'title': {'type': 'text', 'analyzer': 'english_with_stop'}, 'video_id': {'type': 'keyword'}}}}}


In [140]:
# Run the sample query through the index's custom analyzer to inspect the tokens it emits.
# NOTE(review): the built-in `_english_` stopword list appears to be short and not to
# include "how", "do" or "i", which would explain why they survive analysis and get
# highlighted -- confirm against the stop token filter documentation.
es.indices.analyze(
    index="podcasts",
    body={
        "text": "how do I get rich",
        "analyzer": "english_with_stop"
    }
)

ObjectApiResponse({'tokens': [{'token': 'how', 'start_offset': 0, 'end_offset': 3, 'type': '<ALPHANUM>', 'position': 0}, {'token': 'do', 'start_offset': 4, 'end_offset': 6, 'type': '<ALPHANUM>', 'position': 1}, {'token': 'i', 'start_offset': 7, 'end_offset': 8, 'type': '<ALPHANUM>', 'position': 2}, {'token': 'get', 'start_offset': 9, 'end_offset': 12, 'type': '<ALPHANUM>', 'position': 3}, {'token': 'rich', 'start_offset': 13, 'end_offset': 17, 'type': '<ALPHANUM>', 'position': 4}]})