In [1]:
from elasticsearch import Elasticsearch

In [2]:
# Client for the Elasticsearch server at localhost:9200; every later cell
# talks to the cluster through this `es` object.
es = Elasticsearch("http://localhost:9200")

In [78]:

# English stopword list fed to the `english_stop` token filter below.
# Each word appears exactly once; the final entry ("get") is a custom
# addition on top of the usual English function words.
stopwords = [
    "a","about","above","after","again","against","all","am","an","and","any",
    "are","aren","aren't","as","at","be","because","been","before","being",
    "below","between","both","but","by","can","can't","cannot","could",
    "couldn't","did","didn't","do","does","doesn't","doing","don't","down",
    "during","each","few","for","from","further","had","hadn't","has","hasn't",
    "have","haven't","having","he","he'd","he'll","he's","her","here","here's",
    "hers","herself","him","himself","his","how","how's","i","i'd","i'll",
    "i'm","i've","if","in","into","is","isn't","it","it's","its","itself",
    "let's","me","more","most","mustn't","my","myself","no","nor","not","of",
    "off","on","once","only","or","other","ought","our","ours","ourselves",
    "out","over","own","same","shan't","she","she'd","she'll","she's","should",
    "shouldn't","so","some","such","than","that","that's","the","their",
    "theirs","them","themselves","then","there","there's","these","they",
    "they'd","they'll","they're","they've","this","those","through","to",
    "too","under","until","up","very","was","wasn't","we","we'd","we'll",
    "we're","we've","were","weren't","what","what's","when","when's","where",
    "where's","which","while","who","who's","whom","why","why's","with",
    "won't","would","wouldn't","you","you'd","you'll","you're","you've",
    "your","yours","yourself","yourselves",
    "get"
]

# Index definition for `podcasts`: one custom analyzer plus the two text
# fields (`title`, `subtitles`) that use it for both indexing and searching.
index_settings = {
    "settings": {
        "analysis": {
            "filter": {
                # Stop filter driven by the `stopwords` list defined above.
                "english_stop": {"type": "stop", "stopwords": stopwords},
                "english_stemmer": {"type": "stemmer", "language": "english"},
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english",
                },
            },
            "analyzer": {
                "english_with_stop_and_stem": {
                    "type": "custom",
                    "tokenizer": "standard",
                    # Token filter chain, listed in the order it is configured.
                    "filter": [
                        "lowercase",
                        "english_possessive_stemmer",
                        "english_stop",
                        "english_stemmer",
                    ],
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Both fields share the exact same text mapping.
            field_name: {
                "type": "text",
                "analyzer": "english_with_stop_and_stem",
                "search_analyzer": "english_with_stop_and_stem",
            }
            for field_name in ("title", "subtitles")
        },
    },
}


In [79]:
# Drop any previous `podcasts` index so it can be recreated from scratch.
# The delete-index API takes no request body, and `ignore_unavailable=True`
# keeps this cell from failing when the index does not exist yet
# (e.g. the first run against a fresh cluster).
es.indices.delete(index="podcasts", ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [80]:
# Create the `podcasts` index, sending `index_settings` (analysis settings
# and field mappings) as the request body.
es.indices.create(index="podcasts", body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'podcasts'})

In [81]:
from pathlib import Path

# Directory that is scanned for transcript files.
data = Path('data/')

# All `*.txt` files directly inside `data`, in sorted path order.
files = list(data.glob('*.txt'))
files.sort()

In [82]:
def read_doc(subtitle_file):
    """
    Turn one transcript file into a document dict.

    The file is read as UTF-8 and split on newlines: the first line becomes
    the title, the second line is skipped, and everything from the third
    line on is re-joined and stripped to form the subtitles.

    Args:
        subtitle_file: Path-like object exposing `read_text()` and `stem`.

    Returns:
        dict with keys `video_id` (the file name without extension),
        `title` and `subtitles`.
    """
    content = subtitle_file.read_text(encoding='utf8')

    # `split` always yields at least one element, so the unpacking is safe
    # even for an empty file.
    title_line, *remaining = content.split('\n')

    # remaining[0] is the second line of the file, which is not part of the
    # subtitles.
    body_text = '\n'.join(remaining[1:]).strip()

    return dict(
        video_id=subtitle_file.stem,
        title=title_line,
        subtitles=body_text,
    )

In [83]:
# Parse the first transcript as a sample. `files` is sorted, so this picks
# the same file on every run and does not depend on a variable that is only
# bound by a later cell.
doc = read_doc(files[0])

In [84]:
# Preview the first 500 characters of the parsed subtitles.
print(doc['subtitles'][:500])

0:00 hi everyone Welcome to our event this
0:02 event is brought to you by data do club
0:04 which is a community of people who love
0:07 data we have weekly events and today is
0:10 one of such events if you want to find
0:12 out more about the events we have there
0:14 is a link in the description go there
0:16 click on that link and you'll see the
0:17 things we have in our pipeline well well
0:20 future events uh because things we have
0:24 in our pipeline if we think because the
0:26 topic 


In [85]:
from tqdm.auto import tqdm

In [86]:
# Parse each transcript and index it into `podcasts`, using the video id as
# the document id. tqdm renders a progress bar over `files`.
# Note: `subtitle_file` and `doc` stay bound at module level after the loop.
for subtitle_file in tqdm(files):
    doc = read_doc(subtitle_file)
    es.index(index="podcasts", id=doc['video_id'], document=doc)

  0%|          | 0/78 [00:00<?, ?it/s]

In [3]:
def search_videos(query: str, size: int=5):
    """
    Search the `podcasts` index over both `title` and `subtitles`,
    boosting `title` 3x for higher relevance.
    Uses stemming + stopword removal (the query is analyzed with
    `english_with_stop_and_stem`).

    Args:
        query: Free-text search string.
        size: Maximum number of hits to request.

    Returns:
        A list with one dict per hit. Each dict contains the highlight
        fragments returned for that hit, keyed by field name (`title`
        and/or `subtitles`) with matches wrapped in `*`, plus a
        `video_id` key holding the document `_id`. A hit that carries no
        highlight section yields a dict with only `video_id`.
    """
    body = {
        "size": size,
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^3", "subtitles"],
                "type": "best_fields",
                "analyzer": "english_with_stop_and_stem"
            }
        },
        "highlight": {
            "pre_tags": ["*"],      # highlight start
            "post_tags": ["*"],     # highlight end
            "fields": {
                "title": {
                    "fragment_size": 150,
                    "number_of_fragments": 1
                },
                "subtitles": {
                    "fragment_size": 150,
                    "number_of_fragments": 1
                }
            }
        }
    }

    response = es.search(index="podcasts", body=body)

    results = []
    for hit in response.body['hits']['hits']:
        # `highlight` may be missing from a hit; fall back to an empty dict
        # instead of raising KeyError. Copy it so the response object is
        # not mutated when `video_id` is added.
        entry = dict(hit.get('highlight', {}))
        entry['video_id'] = hit['_id']
        results.append(entry)

    return results
        

In [4]:
def get_subtitles_by_id(video_id):
    """
    Fetch one stored document from the `podcasts` index.

    Args:
        video_id: The document `_id` to look up.

    Returns:
        The `_source` of the matching document.
    """
    # Use the caller-supplied id rather than a fixed one.
    result = es.get(index="podcasts", id=video_id)
    return result['_source']

In [5]:
# Example query; `result` holds the list returned by `search_videos`.
result = search_videos('how do I get rich with ai')

In [6]:
# Display the search results (bare last expression -> cell output).
result

[{'subtitles': ['first but you still\n42:37 want to want to leverage *Ai* and you want\n42:39 to you know in introduce *AI* into\n42:43 services that your bank offers um of\n42'],
  'title': ['Trends in *AI* Infrastructure'],
  'video_id': '1aMuynlLM3o'},
 {'subtitles': ['and we have a special guest\n1:34 today BOS BOS is an *AI* and data engineer\n1:38 he specializes in moving *AI* projects\n1:40 from the good enough for demo'],
  'title': ['Data Intensive *AI*'],
  'video_id': 'BP6w_vKySN0'},
 {'subtitles': ["okay this\n1:15 week we'll talk about bringing together\n1:16 research and Industry and how\n1:18 explainable and interpretable machine\n1:20 learning and *AI*"],
  'title': ['Interpretable *AI* and ML'],
  'video_id': 'EQcY83VA0Us'},
 {'subtitles': ["0:00 this week we'll talk about *AI* for\n0:03 digital Healthcare and we have a special\n0:05 guest today Maria and by the way should\n0:07 I say Maria Lisa"],
  'title': ['*AI* for Digital Health'],
  'video_id': 'whpkDmVVGUE'},
 

In [7]:
# Fetch one stored document by id and preview the first 500 characters of
# its subtitles. Note: this rebinds `result` (previously the search hits).
result = get_subtitles_by_id('1aMuynlLM3o')
print(result['subtitles'][:500])

0:01 everyone Welcome to our event this is
0:03 our first event in
0:05 2025 happy New Year everyone and yeah
0:08 I'm really excited to start the new year
0:10 with this stream and uh but before like
0:14 I'll do the usual thing so we have a lot
0:17 of uh things a lot of events planned uh
0:21 so there is a link in the description
0:22 you can go there click on that link and
0:24 see all the event we have on our
0:25 pipeline uh there are a few like there
0:27 is one Workshop there is um a few
