# LangChain demo with Telegram data

In [1]:
import os
import pinecone
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv()) # read local .env file

from langchain.llms import OpenAIChat
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

  from tqdm.autonotebook import tqdm


## Using existing Pinecone index

In [None]:
# Index initialization: set up the query encoder and connect to the existing Pinecone index.
from semantic_search_generator import SemanticSearchGenerator

# Notebook-level settings: index name, embedding model and channel handle.
INDEX_NAME = "telegram-embeddings"
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
channel_id = "@runonflux"

# Encoder that search_results() uses to embed the query text.
generator = SemanticSearchGenerator(model_name)

# Open the Pinecone connection; the API key comes from the environment
# (get an API key at app.pinecone.io).
pinecone.init(environment="us-west1-gcp", api_key=os.environ["PINECONE_APIKEY"])

# Handle to the already-existing index.
index = pinecone.Index(INDEX_NAME)

In [None]:
# Build the LLM chain that answers a user query from a list of retrieved chat messages.
# NOTE(review): LLMChain is already imported in the first cell; this repeat is redundant.
from langchain.chains import LLMChain

# Prompt with two placeholders: {messages} (the retrieved messages) and {query}.
prompt_template = """Use the chat messages (not sorted in any particular order) below to answer the given user query:
    messages_list: {messages}
    query: {query}
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["messages", "query"])
# temperature=0 — presumably chosen to keep the answers as repeatable as possible.
llm = OpenAIChat(temperature=0)
chain = LLMChain(llm=llm, prompt=PROMPT)


In [None]:
def search_results(query, limit=50):
    """Return the ``clean_message`` text of the index matches for ``query``.

    The query is encoded with the notebook-level ``generator``, the resulting
    vector is sent to the notebook-level Pinecone ``index``, and the
    ``clean_message`` metadata field of every match is collected in the order
    the index returned them.

    Args:
        query: Text to search for.
        limit: Maximum number of matches requested from the index (``top_k``).

    Returns:
        A list with one ``clean_message`` value per match.
    """
    embedded_query = generator.encode_messages(query)

    # Only metadata is needed downstream, so the stored vectors are not requested.
    response = index.query(
        vector=embedded_query.tolist(),
        top_k=limit,
        include_values=False,
        include_metadata=True,
    )

    return [match["metadata"]["clean_message"] for match in response["matches"]]
    

In [None]:
# Ask what people like about the project, using the Pinecone matches as context.
query = "what is good about this project?"
messages = search_results(query)
# One {"message": ...} dict per retrieved message. Iterating the list directly
# replaces the former zip(range(len(...)), ...) whose index was discarded.
inputs = [{"message": msg} for msg in messages]
result = chain.run({"messages": inputs, "query": query})
print(result)

In [None]:
# Ask what users complain about, using the Pinecone matches as context.
query = "what do users complain about this project?"
messages = search_results(query)
# Number each retrieved message; enumerate() yields the same (index, message)
# pairs as the former zip(range(len(messages)), messages).
inputs = [{"index": i, "message": msg} for i, msg in enumerate(messages)]
result = chain.run({"messages": inputs, "query": query})
print(result)

## Using Chroma DB

In [42]:
# import
import pandas as pd
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import Chroma
from chromadb.config import Settings

# How to create a client with reset allowed
# client = chromadb.HttpClient(settings=Settings(allow_reset=True))
# client.reset()  # resets the database

In [48]:
# load telegram messages
# NOTE(review): this rebinds `channel_id`, which the Pinecone section set to "@runonflux".
channel_id = "@energyweb"
# Relative path: it resolves against the kernel's current working directory.
df = pd.read_csv(f"notebooks/data/{channel_id}.csv")

# Use the "history_str" column as the page content of each loaded document.
df_loader = DataFrameLoader(df, page_content_column="history_str")
docs = df_loader.load()

In [49]:
# Peek at the first four "history_str" values, the column used as document text.
df["history_str"].iloc[0:4]

0    - User_5297034533: @a84765641 [6401503585] to ...
1                              - User_1174003449: 👋🏻👋🏻
2    - User_1482244625: I cannot wait to spin up a ...
3    - User_1132548662: @saoyem Samuel, do you know...
Name: history_str, dtype: object

In [36]:
# create the open-source embedding function
hf_embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# compute embeddings and load them into Chroma
# (no client or collection name is passed in this call)
db = Chroma.from_documents(docs, hf_embedding_function)

In [55]:
# time to query
def search_db(db_client, query: str, top_k: int = 100, max_print: int = 5) -> None:
    """Run a similarity search and print the top results.

    Args:
        db_client: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute (e.g. the Chroma stores
            built in this notebook).
        query: Text to search for.
        top_k: Number of results requested from ``db_client``.
        max_print: How many of the returned results to print. Defaults to 5,
            the limit that used to be hard-coded; negative values print nothing.

    Returns:
        None. The results are only printed, each preceded by a blank line.
    """
    # Local name chosen so the notebook-level `docs` is not shadowed.
    hits = db_client.similarity_search(query, k=top_k)

    # print results
    for item in hits[:max(max_print, 0)]:
        print(f"\n{item.page_content}")

In [56]:
# Example search against the store built directly from `docs` (no persistent client).
query = "fomo"
search_db(db, query)


Fosho

seems like they 'RMI'  really FOMO'ing for this,  especially after reading the

Hellyea

saoyem Nice Francesco

zealy


### Persistent Chroma client

In [57]:
import chromadb

# Chroma client created with default arguments.
# NOTE(review): a later cell reopens the data with persist_directory="./chroma" — presumably the default location; confirm.
persistent_client = chromadb.PersistentClient()

# Set to False to keep an existing collection instead of dropping it first.
CLEAR_COLLECTION= True

if CLEAR_COLLECTION:
    try:
        # The collection name uses the channel id without its first character (the "@").
        persistent_client.delete_collection(f"embeddings_collection_{channel_id[1:]}")
    except Exception as e:
        # Best-effort: report the failure and carry on (e.g. the collection does not exist yet).
        print("unable to delete ", e)

unable to delete  Collection embeddings_collection_energyweb does not exist.


In [58]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load `docs` into a named collection owned by `persistent_client`.
# The collection name is the same one the reset cell deletes.
client = Chroma.from_documents(
    docs, 
    embedding_function, 
    client=persistent_client, 
    collection_name = f"embeddings_collection_{channel_id[1:]}"
)

In [59]:
# Same kind of search, this time through the store backed by `persistent_client`.
query = "bearish"
search_db(client, query)


- User_1853540265: A coinbase listing would be bullish
- User_6010276849: In a bear season as well this,  not sure there will be effect. Listing sometimes has been o do with the right timing as well. The hype is there, you list and boom up to the sky.

- User_1460230397: We're thrilled to announce that the @xenergyweb Crowdloan on @Polkadot
 is now live! Join Energy Web in shaping the energy future and contribute.

For more information about the Crowdloan, visit: https://crowdloan.energywebx.com

Like and Retweet here: https://twitter.com/energywebx/status/1698820250643411030
- User_5986732143: This crowdloan went so well!!! Cannot believe it’s a bear market at all.

- User_1618131036: Another week went by. Hopefully the crypto market will recover soon
- User_5418407519: I hope soo , we have endure bearish for a long time .

- User_530622263: We are fortunate to be aware of such a great project during the last months of the bear market. A good time to dollar cost average IMO.
- User_6

## Using OpenAI embeddings

In [37]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DataFrameLoader
from IPython.display import display, Markdown
import pandas as pd

In [38]:
# Switch to the @runonflux channel and load its messages.
# NOTE(review): rebinds `channel_id`, `df` and `docs` from the Chroma DB section.
channel_id = "@runonflux"
df = pd.read_csv(f"notebooks/data/{channel_id}.csv")
# Here the "clean_message" column (not "history_str") is used as the document text.
df_loader = DataFrameLoader(df, page_content_column="clean_message")
docs = df_loader.load()

In [40]:
import chromadb

# Chroma client created with default arguments (rebinds `persistent_client`).
persistent_client = chromadb.PersistentClient()

# Set to False to keep an existing collection instead of dropping it first.
CLEAR_COLLECTION = True

if CLEAR_COLLECTION:
    try:
        # The collection name uses the channel id without its first character (the "@").
        persistent_client.delete_collection(f"openai_embeddings_{channel_id[1:]}")
    except Exception as e:
        # Best-effort: report the failure and carry on.
        print("unable to delete ", e)

In [41]:
# NOTE(review): no API key is passed here — presumably OpenAIEmbeddings reads it from the
# environment populated by load_dotenv in the first cell; confirm.
openai_embeddings_function = OpenAIEmbeddings()

# Load `docs` into the "openai_embeddings_<channel>" collection of `persistent_client`.
openai_chroma_client = Chroma.from_documents(
    docs, 
    openai_embeddings_function, 
    client=persistent_client, 
    collection_name = f"openai_embeddings_{channel_id[1:]}"
)

In [42]:
# Search the OpenAI-embedded collection.
query = "use cases"
search_db(openai_chroma_client, query)

- in addition to securing the network you also provide a real world case for solving problems  eg genome sequencing, graphics rendering, etc...
- upto you to find and research
- Can you please write in simple words the usecases of flux coin ? Services which requires flux token ?
- What are the $FLUX usecases?Let's find out httpstwitter.comHouseofChimerastatus1671902756808818690
- It may help to 
- Harness the power of decentralisation Explore RunOnFlux use cases Let's read onhttpstwitter.comHouseofChimerastatus1709901477689364513
- cycle through the different options and see what the issue could be
- Usage is one of the key things that separates Flux from its competitors, show the below some love on twitter and help spread the word httpstwitter.comcryptoviumstatus1686329414940823552?s20
- Anyone can apply to use their services for payments
- bring the network to life


In [17]:
# Retrieval QA chain over the OpenAI-embedded collection, using the chain's default prompt.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(), 
    chain_type="stuff", 
    # MMR search; fetch_k=30 is presumably the candidate pool size before re-ranking — confirm in the LangChain docs.
    retriever=openai_chroma_client.as_retriever(search_type="mmr", search_kwargs={'fetch_k': 30}), 
    # Requests the source documents in the response alongside the answer.
    return_source_documents=True
)

In [18]:
# Run the retrieval QA chain on a natural-language question.
query = "What are some use cases of flux?"
response = qa({"query":query})

In [19]:
# Show the generated answer text.
response["result"]

"Some potential use cases for Flux could include:\n\n1. Real-time data processing and analytics: Flux is designed to handle large volumes of data and perform computations in real-time, making it suitable for applications that require fast data processing and analytics.\n\n2. Internet of Things (IoT) applications: Flux can be used to collect and process data from various IoT devices, enabling the creation of smart and connected systems.\n\n3. Financial services: Flux's real-time capabilities make it well-suited for financial applications, such as high-frequency trading, fraud detection, and risk analysis.\n\n4. E-commerce and recommendation systems: Flux can be used to process and analyze user data in real-time, allowing for personalized recommendations and targeted advertising.\n\n5. Social media analytics: Flux's ability to handle streaming data makes it a good choice for analyzing social media feeds in real-time, enabling sentiment analysis, trend detection, and social network analys

### Reusing persisted chroma collection

In [50]:
# check if the existing collection has documents
channel_id = "@runonflux"
collection_name = f"openai_embeddings_{channel_id[1:]}"

# NOTE(review): relies on `chromadb` having been imported by an earlier cell.
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_collection(collection_name)
# Last expression of the cell, so the count is shown as the cell output.
collection.count()

3117

In [51]:
# load from disk

openai_embeddings_function = OpenAIEmbeddings()

# Reopen the persisted collection through the LangChain Chroma wrapper.
# NOTE(review): "./chroma" is presumably where chromadb.PersistentClient() stored the data by default — confirm.
openai_client = Chroma(
    persist_directory="./chroma", 
    collection_name=collection_name, 
    embedding_function=openai_embeddings_function
)

In [52]:
query = "fud"
# NOTE(review): rebinds `docs`, which earlier held the documents loaded from the CSV.
# 20 results are requested but only the first 10 are printed below.
docs = openai_client.similarity_search(query, k=20)
# print results
for item in docs[:10]:
    print(f"- {item.page_content}")

- People love to Fud and create unneeded distractions
- I hope so too and we all hope so. But let's say that seeing your deleted message doesn't do much pleasure. And I certainly didn't take the liberty of making fud.
- lets hope for the best
- it's up to date mate 
- I think that now come dump. 
- chaos trouble violence disorder destruction confusion havocfracas commotion
- reddug akhtarg need to be more vigilant with fake admin tagging
- reddug could I do you ?
- Ill give it another go
- this is probably the only thing holding the project back


In [81]:
# Custom prompt text for the retrieval QA chain; {context} and {question} are its two template variables.
template= """
The context given are several messages retrieved from a chat about a blockchain project. Based on those messages try to answer the user question.
----------------
context: {context}
user question: {question}
"""

In [82]:
# Retrieval QA chain over the collection reloaded from disk, with the custom prompt.
# NOTE(review): PromptTemplate comes from the import in the first cell of the notebook.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(), 
    chain_type="stuff", 
    # MMR search; fetch_k=100 is presumably the candidate pool size before re-ranking — confirm in the LangChain docs.
    retriever=openai_client.as_retriever(search_type="mmr", search_kwargs={'fetch_k': 100}), 
    return_source_documents=True,
    verbose=True,
    # Replace the chain's prompt with the custom template.
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        ),
    }
)

In [83]:
# Print the prompt the chain will use, to confirm the custom template was applied.
print(qa.combine_documents_chain.llm_chain.prompt)

input_variables=['context', 'question'] template='\nThe context given are several messages retrieved from a chat about a blockchain project. Based on those messages try to answer the user question.\n----------------\ncontext: {context}\nuser question: {question}\n'


In [84]:
# Run the custom-prompt chain; it was built with verbose=True, so it logs when the chain starts and finishes.
query = "What makes this project unique?"
response = qa({"query":query})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [85]:
# Show the generated answer text.
response["result"]

'Based on the given context, it seems that the project mentioned (Flux) is being compared to other blockchain projects. One user mentions that the project is "probably the only thing holding the project back." Another user asks what features make Flux stand out compared to other chains.\n\nHowever, there is not enough information provided in the given messages to determine what specifically makes Flux unique. Further details or messages are needed to answer the user\'s question accurately.'