# Semantic search on telegram channels

Showcasing how to extract telegram messages for a specific channel and perform semantic search given an user query. 
I use pretrained models for semantic search from huggingface.

The fetched messages are first embbeded with the model and then uploaded to a pinecode index. Then the query is embedded and compared against the indexed data.

An option for future work is to fine tune a model specific for telegram messages of certain topic. 

In [21]:
import os
import json
import sys
import pandas as pd
import numpy as np
import openai
import time
import re
import math
from telethon.sync import TelegramClient
from IPython.display import display
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import pinecone

from telethon.tl.types import InputMessagesFilterEmpty

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv()) # read local .env file

openai.api_key = os.environ['OPENAI_API_KEY']
pd.options.display.max_columns = None

# do not truncate column width in pandas
pd.options.display.max_colwidth = 200

pd.options.mode.chained_assignment = None

In [22]:
api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ["TELEGRAM_API_HASH"]
pinecone_key = os.environ["PINECONE_APIKEY"]
phone = os.environ["TELEGRAM_PHONE"]
username = os.environ["TELEGRAM_USERNAME"]
messages = []

## Data fetching and cleaning

In [40]:
pd_data = []

columns = ["channel_name", "id", "peer_id", "date", "message", "out", "mentioned",
        "media_unread", "silent", "post", "from_scheduled", "legacy", 
        "edit_hide", "pinned","noforwards", "from_id", "fwd_from", "via_bot_id",
        "reply_to", "media", "reply_markup", "entities", "views",
        "forwards", "replies", "edit_date", "post_author", "grouped_id",
        "reactions", "restriction_reason", "ttl_period"]

client = TelegramClient(f"../sessions_data/{phone}", api_id, api_hash)
channel_id = "singularitynet"
n = 1000

async with client:        
    async for msg in client.iter_messages(channel_id, filter=InputMessagesFilterEmpty, limit=n):
        try:
            pd_data.append((channel_id, msg.id, msg.peer_id, msg.date, msg.message,
                    msg.out, msg.mentioned, msg.media_unread, msg.silent,msg.post,
                    msg.from_scheduled, msg.legacy, msg.edit_hide, msg.pinned, msg.noforwards,
                    msg.from_id.user_id if hasattr(msg.from_id, "user_id") else msg.from_id.channel_id, msg.fwd_from, msg.via_bot_id, msg.reply_to, msg.media, msg.reply_markup,
                    msg.entities, msg.views, msg.forwards, msg.replies, msg.edit_date, msg.post_author,
                    msg.grouped_id, msg.reactions, msg.restriction_reason, msg.ttl_period
            ))
        except Exception as e:
            print(msg.from_id)
            break

In [43]:
df = pd.DataFrame(pd_data, columns=columns)
df = df[df['message'] != ''] # remove empty messages
df = df[~df["message"].isna()] # remove nan text
df = df.set_index(["channel_name", "id"])
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,peer_id,date,message,out,mentioned,media_unread,silent,post,from_scheduled,legacy,edit_hide,pinned,noforwards,from_id,fwd_from,via_bot_id,reply_to,media,reply_markup,entities,views,forwards,replies,edit_date,post_author,grouped_id,reactions,restriction_reason,ttl_period
channel_name,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
singularitynet,1007866,PeerChannel(channel_id=1140090881),2024-04-28 19:47:12+00:00,"⚠️ Safety Warning ‼️ \n\n‼️ The ASI token is not available for sale, trading, airdrop, etc. \n\n🔺 There is no ASI contract (yet). \n\n🔹 Stay informed by reading pinned messages and announcements i...",False,False,False,False,False,False,False,False,False,False,210944655,,,,,,"[MessageEntityBold(offset=3, length=14), MessageEntityBold(offset=30, length=3), MessageEntityBold(offset=40, length=6), MessageEntityUnderline(offset=95, length=23)]",,,"MessageReplies(replies=0, replies_pts=1422986, comments=False, recent_repliers=[], channel_id=None, max_id=None, read_max_id=None)",NaT,,,,,
singularitynet,1007864,PeerChannel(channel_id=1140090881),2024-04-28 19:35:07+00:00,they have a $2.5 million marketcap. I would just be carefull and check contract address🤷‍♂️,False,False,False,False,False,False,False,False,False,False,578573938,,,"MessageReplyHeader(reply_to_scheduled=False, forum_topic=False, quote=False, reply_to_msg_id=1007855, reply_to_peer_id=None, reply_from=None, reply_media=None, reply_to_top_id=None, quote_text=Non...",,,,,,,NaT,,,,,


In [45]:
import re
import ast

def extract_reply_id(val):
    """ search for the matching id
    """
    if isinstance(val, str):
        match = re.search(r'reply_to_msg_id=(\d+)', val)
        if match:
            return int(match.group(1))
    else:
        return None

def compute_message_historical(df):
    # compute message history for each message if available

    df_extended = df.copy()
    df_extended["in_history"] = False
    df_extended["reply_to_msg_id"] = df_extended["reply_to"].apply(lambda x: int(x.reply_to_msg_id) if x is not None else None)

    # Only use if Class was parsed from text
    # df['reply_to_msg_id'] = df['reply_to'].apply(extract_reply_id)
    
    for (channel_name, message_id), row in df_extended.iterrows():
        history = [f"User_{row['from_id']}: {row['message']}"] # Initialize historical with current message
        reply_id = row["reply_to_msg_id"]
        
        try:
            while not np.isnan(reply_id) and (channel_name, reply_id) in df_extended.index:

                df_extended.loc[(channel_name, reply_id), "in_history"] = True

                # Get reply row
                reply_row = df_extended.loc[(channel_name, reply_id)]

                # Update history
                history.append(f"User_{reply_row['from_id']}: {reply_row['message']}")

                # Delete message appended to history
                df_extended = df_extended.drop((channel_name, reply_id))                

                # assign next reply id
                reply_id = reply_row["reply_to_msg_id"]


        except Exception as e:
            print(type(reply_id))
            print("something failed", e)

        df_extended.loc[(channel_name, message_id), "history"] = str(history[::-1])

        # Ignore already iterated replies

    df_extended["history"] = df_extended["history"].apply(ast.literal_eval)
    df_extended["history_str"] = df_extended["history"].apply(lambda x: "- " + "\n- ".join(x))
    df_extended["thread_length"] = df_extended["history"].str.len()
    return df_extended

In [46]:
df_plus = compute_message_historical(df)

In [49]:
print(df_plus[df_plus['history'].apply(lambda x: len(x) > 1)].iloc[11]["history_str"])

- User_630927899.0: We need to migrate?
- User_1723326597.0: Yes, for non-custodial wallets, there will be a migration tool
- User_630927899.0: Ok when how
- User_1723326597.0: Here is approximate timeline


In [50]:
# Data cleaning and filtering 

df_plus["token_count"] = df_plus["message"].apply(lambda x: len(x.split(" "))  if type(x) == str else 0)
print("Total message count", len(df_plus))

# What is the distribution of token counts?
print("token count distribution")
display(df_plus["token_count"].quantile([.1, .25, .5, .75, .95, 0.98]))

Total message count 972
token count distribution


0.10     0.0
0.25     0.0
0.50     2.0
0.75    10.0
0.95    54.0
0.98    54.0
Name: token_count, dtype: float64

In [51]:
channel_id

'singularitynet'

In [52]:
df_plus.to_csv(f"data/{channel_id}_replies.csv", index=False)

## Encoding with Pytorch

In [11]:
%load_ext autoreload
%autoreload 2
from semantic_search_generator import SemanticSearchGenerator

In [16]:
channel_id = "@runonflux"
df = pd.read_csv(f"notebooks/data/{channel_id}_replies.csv")
messages = df.sample(10, random_state=11)["clean_message"].to_list()
messages

FileNotFoundError: [Errno 2] No such file or directory: 'notebooks/data/@runonflux_replies.csv'

In [31]:
generator = SemanticSearchGenerator()

# Text to encode
query = "investment strategy"
doc_score_pairs = generator.search_batch(query, messages, device="mps")

#Output passages & scores
for doc, score in doc_score_pairs[:10]:
    print(score, doc)


0.2212686538696289 My buy in was 60c but I asked literally everyone big in the flux space if we could maintain the dollar and they were all bullish including Dan. I wanted to sell to buy back cheaper and am kicking myself I didn't lol
0.10311347246170044 For example, my stratus gives me 110 a month with current price and APR. Cost of the dedicated server is 70.If i wanted to run cumulus instead, it would be 40 cumulus, giving me about 220 total a month, but costing me about 210 in VPS cost.Plus now i have to monitor and maintain 40 nodes instead of one 
0.09914617985486984 Join our CBO Davy Wittock for a special mining episode for Around the Blockchain today at 5 PM EST Get ready for an exciting discussion about Proof of Useful Work and the future of mining Link  httpsyoutube.comaroundtheblockchainofficial
0.08931327611207962 I would suggest you to stop promoting the group BettyK0
0.06712281703948975 Anybody withdraw Flux  from CoinEX?Looks suspended
0.04982435703277588  many crypto pr

## Data indexing

In [5]:
channel_id = "@runonflux"
model_name = "multi-qa-MiniLM-L6-cos-v1"
df = pd.read_csv(f"notebooks/data/{channel_id}.csv")

In [6]:
# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key=os.environ["PINECONE_APIKEY"],
    environment="us-west1-gcp"
)

In [7]:
# embeddings model
multi_qa_encoder = SentenceTransformer(model_name)

query = "This coin will moon soon"
vec = multi_qa_encoder.encode(query, convert_to_tensor=False)

In [8]:
FORCE_DELETE_INDEX = False # Set True only for initializing the index
INDEX_NAME = "telegram-embeddings"

if FORCE_DELETE_INDEX:
    pinecone.delete_index(INDEX_NAME)

if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(INDEX_NAME, dimension=len(vec))
    
# connect to index
index = pinecone.Index(INDEX_NAME)

In [13]:
import torch

def get_device():
    has_gpu = torch.cuda.is_available()
    has_mps = torch.backends.mps.is_built()
    device = "mps" if has_mps else "gpu" if has_gpu else "cpu"
    return device

get_device()

'mps'

In [9]:
# Upload data to index
COMPUTE_EMBEDDINGS = True
if COMPUTE_EMBEDDINGS:
    # create embeddings
    df["embeddings_cpu"] = df["clean_message"].apply(lambda x: multi_qa_encoder.encode(x, device="cpu", convert_to_tensor=True, show_progress_bar=False))  


In [12]:
generator = SemanticSearchGenerator(model_name)

# Faster with mps
df["embeddings_mps"] = df["clean_message"].apply(lambda x: generator.encode_messages(x, device="mps"))

In [13]:
df[["embeddings_cpu", "embeddings_mps"]].head(3)

Unnamed: 0,embeddings_cpu,embeddings_mps
0,"[tensor(-0.0749), tensor(-0.1107), tensor(-0.0339), tensor(0.0374), tensor(-0.0197), tensor(0.0569), tensor(0.0786), tensor(-0.0522), tensor(-0.0159), tensor(0.0207), tensor(-0.0155), tensor(-0.01...","[[tensor(-0.0749, device='mps:0'), tensor(-0.1107, device='mps:0'), tensor(-0.0339, device='mps:0'), tensor(0.0374, device='mps:0'), tensor(-0.0197, device='mps:0'), tensor(0.0569, device='mps:0')..."
1,"[tensor(0.0309), tensor(-0.0022), tensor(0.0416), tensor(0.0120), tensor(0.0061), tensor(0.0405), tensor(-0.1566), tensor(-0.0018), tensor(-0.1030), tensor(0.0587), tensor(0.0292), tensor(0.0281),...","[[tensor(0.0309, device='mps:0'), tensor(-0.0022, device='mps:0'), tensor(0.0416, device='mps:0'), tensor(0.0120, device='mps:0'), tensor(0.0061, device='mps:0'), tensor(0.0405, device='mps:0'), t..."
2,"[tensor(0.0337), tensor(-0.0346), tensor(-0.0315), tensor(-0.0759), tensor(-0.0189), tensor(0.0352), tensor(-0.0644), tensor(0.0090), tensor(-0.0890), tensor(-0.0099), tensor(-0.1018), tensor(-0.0...","[[tensor(0.0337, device='mps:0'), tensor(-0.0346, device='mps:0'), tensor(-0.0315, device='mps:0'), tensor(-0.0759, device='mps:0'), tensor(-0.0189, device='mps:0'), tensor(0.0352, device='mps:0')..."


## Upload embeddings

In [14]:
import math
UPLOAD_VECTORS = False # only for index initialization
if UPLOAD_VECTORS:    
    batch_size = 1000
    total_batches = math.ceil(len(df) / batch_size)
    start = 0

    for i in range(total_batches):
        index_upsert = [] # always initialize for each batch        
        end = start + batch_size

        print(f"iterating messages {start}-{end}")
        for j, item in df[start:end].iterrows():
            index_upsert += [
                    (str(j), 
                    item["embeddings_cpu"].tolist(),
                    {
                        "clean_message": item["clean_message"],
                        "channel_name": item["channel_name"],
                        "messagee_id": item["id"]
                    })
            ]
        start = end
        print(f"inserting batch {i}")
        index.upsert(vectors=index_upsert) # can contain maximum 1000 items        

## Search on index

In [32]:
def search_results(query, limit=20):
    query_emb = generator.encode_messages(query)

    results = index.query(
      vector=query_emb.tolist(),
      top_k=limit,
      include_values=False,
      include_metadata=True
    )

    messages = []
    for item in results["matches"]:
        print(f"\nscore {item['score']}")
        print(item["metadata"]["clean_message"])
        messages.append(item["metadata"]["clean_message"])

    return messages

In [35]:
query = "bearish outlook"
search_results(query, limit=5);



score 0.410763919
Typical bear market... Engagement drops, people hesitate. Meanwhile the team continues to build, POUW just presented a couple of weeks ago

score 0.401366264
keep an eye on announcements

score 0.387041956
Nothing happened, bear markets are a bitch

score 0.362539232
If you interested list in our exchange I can help you

score 0.350785732
Bearmarkets dont last forever. Unless you are in a scam project. I dont see any other thing here than non stopping hardwork. So chill 


In [37]:
query = "bullish news"
search_results(query);


score 0.501404643
Hey here A beautiful week to all, its amazing how when the market is bearing or in halt the vibes in communities change, you can see clearly the vision and the rational unlike in bull where everyone is jumping in without research

score 0.4879556
I've never been a fan of the bullish halving narrative, to many disappointed people who fell for the hype.

score 0.407264948
Who said anything about expecting a bull run ? I said i will cash out my flux when it hits 1$ again

score 0.40503487
Is there a bot's news function or roadmap?

score 0.404052913
I know what you mean but in bull it will be worthed

score 0.403915346
Of course but hopefully not until next bull run to maximise exposure

score 0.400119781
 We're excited to introduce a new speaker, Daniel Weiss httpswww.linkedin.comindanielweissesqcpaHe will join a panel discussion on the topic "The Heart of AI Governance, Data, and Society's Wellbeing".Join us in Florida for our exciting Web3 event. Get your tickets now

In [40]:
query = "PouW"
search_results(query, limit=5);


score 0.790179
there is no pouw yet  whats the question?

score 0.697877347
Pouw isn't even live yet.Also there's still a lot of hardware available i think the network is at 40% right now

score 0.691974163
Hi guys what is the latest re. POuW?

score 0.682593167
Hi , you can discuss PoUW in the mining section. All news will be in the announcements section.

score 0.659035504
Not atm.. Pouw coming towards end of the year


In [41]:
query = "Node setup"
search_results(query, limit=5);


score 0.661882877
make some node will be fine

score 0.655703783
If you create the node as privileged it should be good from the start

score 0.6469993
All nodes are appreciated. What's most needed is decentralization.So if you can, run a bare metal node from home

score 0.623049259
Not looked at yet, just wondering if any info on how to set up a fractus node

score 0.587409914
What would I need properly? In my country the internet is precarious for what each node needs


## Using Pinecone index

In [None]:
# Index initialization
from semantic_search_generator import SemanticSearchGenerator

channel_id = "singularitynet"
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
INDEX_NAME = "telegram-embeddings"

generator = SemanticSearchGenerator(model_name)

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key=os.environ["PINECONE_APIKEY"],
    environment="us-west1-gcp"
)

# connect to index
index = pinecone.Index(INDEX_NAME)

In [None]:
from langchain.chains import LLMChain

prompt_template = """Use the chat messages (not sorted in any particular order) below to answer the given user query:
    messages_list: {messages}
    query: {query}
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["messages", "query"])
llm = OpenAIChat(temperature=0)
chain = LLMChain(llm=llm, prompt=PROMPT)


In [None]:
def search_results(query, limit=50):
    query_emb = generator.encode_messages(query)

    results = index.query(
      vector=query_emb.tolist(),
      top_k=limit,
      include_values=False,
      include_metadata=True
    )

    messages = []
    for item in results["matches"]:
        # print(f"\nscore {item['score']}")
        # print(item["metadata"]["clean_message"])
        messages.append(item["metadata"]["clean_message"])

    return messages
    

In [None]:
query = "what is good about this project?"
messages = search_results(query)
inputs = [{"message": msg} for _, msg in zip(range(len(messages)), messages)]
result = chain.run({"messages":inputs, "query":query})
print(result)

In [None]:
query = "what do users complain about this project?"
messages = search_results(query)
inputs = [{"index": i, "message": msg} for i, msg in zip(range(len(messages)), messages)]
result = chain.run({"messages":inputs, "query":query})
print(result)

In [None]:
# create the open-source embedding function
hf_embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# comput embeddings and load to chroma
db = Chroma.from_documents(docs, hf_embedding_function)

In [None]:
# time to query
def search_db(db_client, query: str, top_k = 100):
    docs = db_client.similarity_search(query, k=top_k)

    # print results
    for item in docs[:5]:
        print(f"\n{item.page_content}")

In [None]:
query = "fomo"
search_db(db, query)


Fosho

seems like they 'RMI'  really FOMO'ing for this,  especially after reading the

Hellyea

saoyem Nice Francesco

zealy


In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

client = Chroma.from_documents(
    docs, 
    embedding_function, 
    client=persistent_client, 
    collection_name = f"embeddings_collection_{channel_id[1:]}"
)

In [None]:
query = "bearish"
search_db(client, query)


- User_1853540265: A coinbase listing would be bullish
- User_6010276849: In a bear season as well this,  not sure there will be effect. Listing sometimes has been o do with the right timing as well. The hype is there, you list and boom up to the sky.

- User_1460230397: We're thrilled to announce that the @xenergyweb Crowdloan on @Polkadot
 is now live! Join Energy Web in shaping the energy future and contribute.

For more information about the Crowdloan, visit: https://crowdloan.energywebx.com

Like and Retweet here: https://twitter.com/energywebx/status/1698820250643411030
- User_5986732143: This crowdloan went so well!!! Cannot believe it’s a bear market at all.

- User_1618131036: Another week went by. Hopefully the crypto market will recover soon
- User_5418407519: I hope soo , we have endure bearish for a long time .

- User_530622263: We are fortunate to be aware of such a great project during the last months of the bear market. A good time to dollar cost average IMO.
- User_6