In [125]:
import os.path
import datetime
from whoosh.qparser import QueryParser
from whoosh import scoring
from tqdm import tqdm
from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT, DATETIME, NUMERIC
from whoosh.index import create_in, open_dir
from functools import cached_property 
from whoosh import index


class FullTextSearch:
    """Whoosh-backed full-text index over channel messages.

    Each indexed document stores the channel name, the message text, the
    message date, the per-channel message ``id`` and a numeric ``true_id``
    derived from ``"<channel>_^_<id>"`` via ``self.counter``.

    NOTE(review): ``self.counter`` lives only in memory, while the index is
    reopened from disk. After a restart the numbering presumably begins at 0
    again and may overlap ``true_id`` values already stored in the index --
    confirm whether the mapping needs to be persisted.
    """

    def __init__(self, name_index):
        """Remember the index directory and build the schema.

        Args:
            name_index: Directory in which the Whoosh index lives.
        """
        self.name_index = name_index
        self.schema = Schema(
            channel=TEXT(stored=True),
            message=TEXT(stored=True),
            date=DATETIME(stored=True),
            id=NUMERIC(stored=True),
            true_id=NUMERIC(stored=True, unique=True)
        )
        # Highest message id indexed so far, per channel.
        self.channel2max = defaultdict(int)
        # Keys copied verbatim from an incoming message dict.
        self.fields = ['message', 'date', 'id']
        self.counter = Counter()

    @cached_property
    def ix(self):
        """Open the index in ``name_index``, creating it on first use."""
        # exist_ok makes this idempotent; missing parent directories are created too.
        os.makedirs(self.name_index, exist_ok=True)

        if index.exists_in(self.name_index):
            return open_dir(self.name_index)
        return create_in(self.name_index, self.schema)

    def search(self, query_text):
        """Search the ``message`` field and return the matching documents.

        Scoring uses TF-IDF and Whoosh's default result limit applies.

        Args:
            query_text: Query string in Whoosh query syntax.

        Returns:
            list of ``(channel, message, date)`` tuples, best match first.
            A field that was not stored for a document is ``None``.
        """
        parser = QueryParser("message", schema=self.schema)
        query = parser.parse(query_text)

        found = []
        with self.ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            for hit in searcher.search(query):
                # Read the stored fields straight from the hit. Looking the
                # document up again by `id` is wrong: message ids repeat
                # across channels, so another channel's document could match.
                stored = hit.fields()
                found.append(
                    (stored.get('channel'), stored.get('message'), stored.get('date'))
                )

        return found

    def update_docs(self, docs):
        """Add or replace messages in the index.

        Args:
            docs: Mapping ``channel -> iterable of message dicts``. Every
                message dict must contain ``'id'``; ``'message'`` and
                ``'date'`` are optional.

        Raises:
            KeyError: If a message dict has no ``'id'`` key.

        All messages are written through a single writer and committed once.
        ``self.channel2max`` is only advanced after the commit succeeded, so a
        failed write does not make later incremental updates skip messages.
        """
        pending = {}   # true_id -> document fields (last occurrence wins)
        new_max = {}   # channel -> highest message id seen in this batch

        for channel, messages in docs.items():
            for doc in messages:
                idx = doc['id']
                known_max = new_max.get(channel, self.channel2max.get(channel, 0))
                new_max[channel] = max(known_max, idx)

                record = {field: doc.get(field, None) for field in self.fields}
                record['channel'] = channel
                record['true_id'] = self.counter.get_index(channel + '_^_' + str(idx))
                # update_document() cannot replace a document that is still
                # uncommitted in the same writer, so duplicates within the
                # batch are collapsed here before anything is written.
                pending[record['true_id']] = record

        if not pending:
            return

        writer = self.ix.writer()
        try:
            for record in pending.values():
                writer.update_document(**record)
        except BaseException:
            # Release the write lock and drop the partial batch.
            writer.cancel()
            raise
        writer.commit()

        for channel, max_id in new_max.items():
            self.channel2max[channel] = max_id


In [126]:
class Counter:
    """Assigns consecutive integer indices (0, 1, 2, ...) to string keys."""

    def __init__(self):
        # key -> index, in order of first appearance
        self.true_id_str_to_int = dict()

    def get_index(self, true_id):
        """Return the index for ``true_id``, allocating the next free one if new.

        Args:
            true_id: Hashable key (the notebook passes ``"<channel>_^_<id>"``).

        Returns:
            int: The same value on every call with the same key.
        """
        known = self.true_id_str_to_int.get(true_id)
        # Compare against None explicitly: index 0 is a valid, falsy value,
        # and a truthiness test would renumber the very first key.
        if known is not None:
            return known

        idx = len(self.true_id_str_to_int)
        self.true_id_str_to_int[true_id] = idx

        return idx

In [132]:
import asyncio
import csv
import datetime
import os
import pickle
import time
from collections import defaultdict
from datetime import timedelta

from telethon.sync import TelegramClient, events
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.types import InputPeerEmpty
from tqdm import tqdm


class Downloader:
    """Downloads messages from a fixed list of Telegram channels via Telethon.

    SECURITY: the default ``api_id`` / ``api_hash`` / ``phone`` are real-looking
    credentials committed in source. They are kept only so that existing
    ``Downloader()`` calls keep working; they should be revoked and supplied
    from the environment (or another secret store) instead.
    """

    def __init__(
        self,
        api_id=28589545,
        api_hash='c06f3b98720ea55ba2bb076125fb2ed3',
        phone='89187548210',
        channels=('@mash', 'bbbreaking', 'ENews112'),
        time2wait=1
    ):
        """Store the connection settings and create the Telethon client.

        Args:
            api_id: Telegram API id.
            api_hash: Telegram API hash.
            phone: Used as the session name of ``self.client``.
            channels: Iterable of channel usernames to download from.
            time2wait: Pause in seconds between channels.
        """
        # NOTE(review): the download methods below open their own client under
        # the session name 'name' and never touch self.client. The notebook
        # output also shows a warning pointing at this start() call --
        # presumably it returns an un-awaited coroutine when an event loop is
        # already running. Confirm whether this client is still needed.
        self.client = TelegramClient(phone, api_id, api_hash)
        self.client.start()
        # Copy so that instances never share (or mutate) the default sequence.
        self.channels = list(channels)
        self.api_id = api_id
        self.phone = phone
        self.api_hash = api_hash
        self.time2wait = time2wait

    async def download_messages_by_last_n_days(self, n=5):
        """Download the messages of today and the ``n - 1`` preceding days.

        Args:
            n: Number of calendar days to fetch, counting back from today.

        Returns:
            defaultdict(list): ``channel -> list of message dicts``, with the
            most recent day first.
        """
        merged = defaultdict(list)
        today = datetime.datetime.today()
        for days_back in range(n):
            day = (today - timedelta(days=days_back)).strftime('%Y-%m-%d')
            per_channel = await self.download_messages_by_date(day)
            for channel, messages in per_channel.items():
                merged[channel].extend(messages)

        return merged

    async def download_messages_by_date(self, date):
        """Download every message posted on one calendar day.

        Args:
            date: Day to fetch, as a ``'%Y-%m-%d'`` string.

        Returns:
            defaultdict(list): ``channel -> list of message dicts`` (as
            produced by ``message.to_dict()``).

        Raises:
            ValueError: If ``date`` does not parse as ``'%Y-%m-%d'``.
        """
        channel2messages = defaultdict(list)
        start_date = datetime.datetime.strptime(date, '%Y-%m-%d')
        # Re-format so that e.g. '2024-1-5' still matches str(message.date).
        day_string = start_date.strftime('%Y-%m-%d')
        next_day = start_date + timedelta(days=1)

        async with TelegramClient('name', self.api_id, self.api_hash) as client:
            for channel in self.channels:
                entity = await client.get_entity(channel)

                # Iterate starting from offset_date and stop at the first
                # message whose date no longer falls on the requested day.
                async for message in client.iter_messages(entity.id, offset_date=next_day):
                    if day_string in str(message.date):
                        channel2messages[channel].append(message.to_dict())
                    else:
                        # Non-blocking pause; time.sleep would stall the event loop.
                        await asyncio.sleep(self.time2wait)
                        break
        return channel2messages

    async def update_messages(self, channel2max_message_id):
        """Download the messages newer than the last known id of each channel.

        Args:
            channel2max_message_id: Mapping ``channel -> message id`` passed
                as ``min_id`` to ``iter_messages``.

        Returns:
            defaultdict(list): ``channel -> list of new message dicts``.
        """
        channel2messages = defaultdict(list)
        async with TelegramClient('name', self.api_id, self.api_hash) as client:
            for channel, max_id in channel2max_message_id.items():
                entity = await client.get_entity(channel)
                count = 0
                async for message in client.iter_messages(entity.id, min_id=max_id):
                    count += 1
                    channel2messages[channel].append(message.to_dict())
                print(f"Channel {channel} has new {count} messages!")
                # Non-blocking pause between channels.
                await asyncio.sleep(self.time2wait)

        return channel2messages
        

In [133]:
# Build the Telegram downloader (default credentials and channels) and the
# search-index wrapper backed by the 'index19' directory.
download = Downloader()
fts = FullTextSearch(name_index='index19')

# Initial load: fetch the last 5 days of messages (the method's default n)
# for every channel and add them to the index.
channel2messages = await download.download_messages_by_last_n_days()
fts.update_docs(channel2messages)

  self.client.start()


In [None]:
while True:
    print("Start update")
    await download.update_messages(fts.channel2max)
    print("I am sleeping...")
    time.sleep(60)

Start update
Channel @mash has new 0 messages!
Channel bbbreaking has new 0 messages!
Channel ENews112 has new 0 messages!
I am sleeping...
