In [1]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import Pinecone,  PodSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnablePassthrough,RunnableLambda
from langchain_core.pydantic_v1 import BaseModel, Field

import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
import pandas as pd
# load_dotenv("API_KEYS")

In [2]:
from openai import OpenAI

In [3]:
# Populate the process environment from the local API_KEYS.env file.
load_dotenv("API_KEYS.env")

# Stand-alone OpenAI SDK client, keyed from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Service keys used as constructor defaults by FinFeedRAG; each is None when the variable is unset.
openai_api_key = os.environ.get('OPENAI_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

In [8]:
# Structured-output schema: FinFeedRAG.llm_tagging passes this class to
# `with_structured_output`, and FinFeedRAG.tagging_chain returns its fields as a dict.
# NOTE(review): the Field descriptions are runtime text (presumably forwarded to the model as
# part of the schema — confirm), so they are documented with `#` comments rather than edited.
class Classification(BaseModel):
    # Reporter-style answer to the question, or the fixed "not enough information" sentence.
    answer: str = Field(description="""Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                            If you can't answer the question, reply 'I do not have enough information to answer this question' """)
    # Sentiment of the public comments (free-form string).
    sentiment: str = Field(description="The sentiment of the comments. Rate each comment.")
    # Aggressiveness rating on a 1-10 scale; declared as str, not int.
    aggressiveness: str = Field(
        description="How aggressive the comments is on a scale from 1 to 10. 1 being polite and 10 means aggressive."
    )
    # Language the text is written in.
    language: str = Field(description="The language the text is written in")
    # Political leaning of the text, covering economic and social policy.
    political_tendency : str = Field(description= "The political tendency of the text. Emphasize both economic and social policies")


class FinFeedRAG:
    """Retrieval-augmented question answering over a Pinecone index with an OpenAI chat model.

    The class bundles three concerns:

    * ingestion   - clean a transcript text file, split it and upload it to Pinecone;
    * retrieval   - fetch similar documents / their attached YouTube comments;
    * generation  - answer a question (``chain``) or answer and classify the
      public comments (``tagging_chain``).
    """

    # Fallback query-vector length, used only when the index stats do not report a dimension.
    _DEFAULT_EMBEDDING_DIM = 1536

    def __init__(self ,pinecone_index, pine_cone_api_key= pinecone_api_key, openai_api_key= openai_api_key,
                 embeddings_model= None,model='gpt-3.5-turbo', path_to_df = 'final_df.pkl'):
        """Store configuration; no network connection is opened here.

        Args:
            pinecone_index: Name of the Pinecone index to read from / write to.
            pine_cone_api_key: Pinecone API key (defaults to the module-level key).
            openai_api_key: OpenAI API key (defaults to the module-level key).
            embeddings_model: Embeddings object. When ``None`` an ``OpenAIEmbeddings``
                is created with ``openai_api_key`` so the key passed here is the one used.
            model: Chat model name used by ``llm`` and ``llm_tagging``.
            path_to_df: Pickle file holding the YouTube metadata frame; it is only read
                when ``insert_youtube_metadata`` needs it.
        """
        self.openai_api_key=openai_api_key
        self.api_key_pinecone = pine_cone_api_key
        self.pinecone_index = pinecone_index
        # Pinecone connection is created lazily by initialize_pinecone().
        self.vector_db = None
        # Built here (not in the signature) so it is not evaluated at class-definition time
        # and so it honours the openai_api_key given to this instance.
        if embeddings_model is None:
            embeddings_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.embeddings=embeddings_model
        self.model=model
        self.template = """
                Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                If you can't answer the question, reply "I do not have enough information to answer this question".
                
                Context: {context}
                
                Question: {question}
                """
        
        self.template_classification = """
                            Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                            If you can't answer the question, reply "I do not have enough information to answer this question".
                
                            Extract the properties mentioned in the 'Classification' function and also justify your analysis.

                            Context: {context}

                            Public Comments: {comments}
                            
                            Question: {question}
                            """
        
        # The metadata frame is loaded on first use (see _load_youtube_df) so that
        # constructing the bot does not require the pickle file to exist.
        self.path_to_df = path_to_df
        self.df_youtube_data = None

    # ------------------------------------------------------------------ helpers

    def _load_youtube_df(self):
        """Return the YouTube metadata frame, reading ``self.path_to_df`` on first use."""
        if getattr(self, 'df_youtube_data', None) is None:
            # SECURITY: unpickling can execute arbitrary code - only use trusted files here.
            self.df_youtube_data = pd.read_pickle(self.path_to_df)
        return self.df_youtube_data

    def _resolve_index(self, index=None):
        """Turn ``index`` into a Pinecone index handle.

        ``None`` -> this instance's index (connected lazily and cached);
        ``str``  -> a new connection to the index of that name;
        anything else is assumed to already be an index handle and is returned as is.
        """
        if index is None:
            return self.initialize_pinecone()
        if isinstance(index, str):
            return Pinecone(api_key=self.api_key_pinecone).Index(index)
        return index

    def _retriever(self, index=None, k=10):
        """Build a similarity retriever returning up to ``k`` documents from ``index``."""
        if index is None:
            index = self.pinecone_index
        store = PineconeVectorStore.from_existing_index(index_name=index, embedding=self.embeddings)
        return store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    def _clean_text(self, text):
        """Lower-case ``text``, strip punctuation and drop English stop words."""
        lowered = text.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        # Build the stop-word set once; calling stopwords.words() per token is very slow.
        stop_words = set(stopwords.words('english'))
        kept = [word for word in word_tokenize(without_punctuation) if word.lower() not in stop_words]
        return ' '.join(kept)

    # ---------------------------------------------------------------- ingestion

    def initialize_pinecone(self):
        """Connect to ``self.pinecone_index`` once and return the cached index handle."""
        if self.vector_db is None:  # Check if it's already initialized
            pc = Pinecone(api_key=self.api_key_pinecone)
            self.vector_db = pc.Index(self.pinecone_index)  # Connect to the index and store the connection
        return self.vector_db

    def preprocess_youtube_text(self, text_file, chunksize,chunkoverlap):
        """Clean ``text_file`` in place, then load and split it into documents.

        Note: the file on disk is overwritten with its cleaned content.

        Args:
            text_file: Path of the text file to process.
            chunksize: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            chunkoverlap: ``chunk_overlap`` given to ``RecursiveCharacterTextSplitter``.

        Returns:
            The list of split documents.
        """
        self.preprocess_input(text_file,save_back_to_file=True)

        loader = TextLoader(text_file) #text instance of langchain
        text_documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunksize, chunk_overlap=chunkoverlap)
        return splitter.split_documents(text_documents)

    def upload_to_vb(self,text,embeddings,chunksize, chunkoverlap,index=None):
        """Preprocess the file ``text`` and upload its chunks to a Pinecone index.

        Args:
            text: Path of the text file to ingest (it is cleaned in place first).
            embeddings: Currently not used - ``self.embeddings`` is always applied.
                Kept so existing positional callers keep working.
            chunksize: Splitter chunk size.
            chunkoverlap: Splitter chunk overlap.
            index: Target index name; defaults to ``self.pinecone_index``.

        Returns:
            The ``PineconeVectorStore`` created by ``from_documents``.
        """
        if index is None:
            index = self.pinecone_index
        documents = self.preprocess_youtube_text(text,chunksize,chunkoverlap)
        return PineconeVectorStore.from_documents(documents, self.embeddings, index_name=index)

    def preprocess_input(self, text_file,save_back_to_file=True):
        """Clean the content of ``text_file`` (lower-case, no punctuation, no stop words).

        Args:
            text_file: Path of the file to read.
            save_back_to_file: When true, the cleaned text replaces the file's content.

        Returns:
            The cleaned text (in both modes).
        """
        with open(text_file, 'r') as file:
            text = file.read()
        final_text = self._clean_text(text)
        if save_back_to_file:
            # Write the processed content back, replacing the original
            with open(text_file, 'w') as file:
                file.write(final_text)
        return final_text

    def most_common(self, input_text_file,most_common=10):
        """Return the ``most_common`` most frequent cleaned words of a file as one query string."""
        processed_text = self.preprocess_input(input_text_file,save_back_to_file=False)
        # Frequency is used as a cheap proxy for relevance.
        word_freq = Counter(processed_text.split())
        common_words = word_freq.most_common(most_common)  # top-N words, N = most_common
        return ' '.join(word for word, _ in common_words)

    # ---------------------------------------------------------------- retrieval

    def retrieve_embeddings(self, query, most_similar=2):
        """Return the stored vectors of the ``most_similar`` nearest neighbours of ``query``.

        The Pinecone connection is opened on demand if it has not been initialised yet.
        """
        index = self.initialize_pinecone()
        query_result = index.query(vector=self.embeddings.embed_query(query), top_k=most_similar)
        ids = [item['id'] for item in query_result['matches']]
        if not ids:
            return []
        # One fetch for all ids instead of one round-trip per id.
        fetched = index.fetch(ids)['vectors']
        return [fetched[vector_id]['values'] for vector_id in ids]

    def provide_context(self, query,index=None,most_similar=2):
        """Return the documents most similar to ``query``.

        Args:
            query: Search text.
            index: Index name; defaults to ``self.pinecone_index``.
            most_similar: Currently not used - up to 10 documents are always requested.
                Left as is so existing callers keep getting the same results.
        """
        return self._retriever(index=index).invoke(query)

    def extract_comments(self, query):
        """Return the YouTube comments attached to the retrieved documents, one per line."""
        documents = self._retriever().invoke(query)
        comments = []
        for doc in documents:
            # Documents without the metadata key simply contribute nothing.
            comments.extend(doc.metadata.get('youtube_comments', []))
        return "\n".join(comments)

    # --------------------------------------------------------------- generation

    def prompt(self,template=None):
        """Build the chat prompt for plain answering (defaults to ``self.template``)."""
        if template is None:
            template = self.template
        return ChatPromptTemplate.from_template(template)

    def prompt_tagging(self,template=None):
        """Build the chat prompt for tagging (defaults to ``self.template_classification``)."""
        if template is None:
            template = self.template_classification
        return ChatPromptTemplate.from_template(template)

    def llm(self,model=None):
        """Return a ``ChatOpenAI`` for ``model`` (defaults to ``self.model``)."""
        if model is None:
            model = self.model
        return ChatOpenAI(openai_api_key=self.openai_api_key, model=model)

    def llm_tagging(self, model = None):
        """Return the chat model configured to emit ``Classification`` structured output."""
        return self.llm(model).with_structured_output(Classification)

    def parser(self):
        """Return the string output parser used at the end of ``chain``."""
        return StrOutputParser()

    def chain(self,query):
        """Answer ``query`` from retrieved context and return the reply as a string."""
        chaining = (
            {"context": self._retriever(), "question": RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )
        return chaining.invoke(query)

    def tagging_chain(self, query):
        """Answer ``query`` and classify the public comments.

        Returns:
            The ``Classification`` result converted to a dict.
        """
        comments = self.extract_comments(query)
        # The comments are fixed for this call, whatever input the chain passes along.
        comments_runnable = RunnableLambda(lambda _: comments)

        chaining = (
            {
                "context": self._retriever(),
                "comments": comments_runnable,
                "question": RunnablePassthrough()
            }
            | self.prompt_tagging() #uses self.template_classification
            | self.llm_tagging() #for structured output
        )

        return chaining.invoke(query).dict()

    # ----------------------------------------------------------------- metadata

    def get_all_vector_ids(self,  input_vector, top_k = 10000, index= None):
        """Return the ids of up to ``top_k`` vectors nearest to ``input_vector``.

        Args:
            input_vector: Query vector.
            top_k: Maximum number of matches requested.
            index: Index handle or index name; defaults to this instance's index.
        """
        results = self._resolve_index(index).query(vector=input_vector, top_k= top_k, include_values=False)
        return [result['id'] for result in results['matches']]

    def insert_youtube_metadata(self, top_k=10000, index= None):
        """Attach YouTube response metadata and comments to every vector of an index.

        Each vector's existing ``source`` metadata is matched against the
        ``txt_file_path`` column (backslashes normalised to ``/``) of the YouTube frame.

        Set top_k >= total_vector_count of vector database.

        Args:
            top_k: Upper bound on the number of vectors to update.
            index: Index handle or index name; defaults to this instance's index.

        Raises:
            AssertionError: If ``top_k`` is smaller than the number of stored vectors.
            IndexError: If a vector's ``source`` has no matching row in the frame.
        """
        target_index = self._resolve_index(index)
        stats = target_index.describe_index_stats()
        assert top_k >= stats.total_vector_count, "top_k is less than total_vector_count of vector database"

        # A zero vector is used only as a probe to enumerate ids.
        dimension = getattr(stats, 'dimension', None) or self._DEFAULT_EMBEDDING_DIM
        all_ids = self.get_all_vector_ids([0.0] * dimension, top_k= top_k , index=target_index)

        youtube_df = self._load_youtube_df()
        # Normalise Windows separators once instead of once per vector.
        normalized_paths = youtube_df['txt_file_path'].str.replace('\\', '/', regex=False)

        for vector_id in all_ids:
            old_metadata = target_index.fetch([vector_id])['vectors'][vector_id]['metadata']
            rows = youtube_df[normalized_paths == old_metadata['source']]
            if rows.empty:
                raise IndexError(f"no YouTube metadata row for source {old_metadata['source']!r}")
            target_index.update(
                id= vector_id,
                set_metadata={
                    'youtube_response_metadata': str(rows['response_metadata'].values[0]),
                    'youtube_comments': rows['list_of_comments'].values[0],
                },
                namespace=""
            )
     

In [9]:
# Build the assistant against the 'day1' Pinecone index; every other setting uses the constructor defaults.
bot = FinFeedRAG(pinecone_index='day1')

In [11]:
# Answer the query and classify the retrieved public comments; the result is a dict of Classification fields.
bot.tagging_chain("give me latest news on stock market.")

{'answer': 'The stock market is showing volatility with investors closely monitoring movements in various companies such as AMD, NVIDIA, and Revux in the technology sector.',
 'sentiment': 'Neutral',
 'aggressiveness': '2',
 'language': 'English',
 'political_tendency': 'Neutral'}

In [12]:
# Show the YouTube comments attached to the documents retrieved for this query, joined by newlines.
bot.extract_comments('give news on china economy')

'good,buy\ngood,buy...\nis it true that with commercial subscription there is no this aughful loud drum beats at the beginnind\nWinnie Wu 😍\nMinmin Low 😍\nHealthcare, why glasses hearing aids when they are outcomes of nutrition?\nWill USA Survive from China\'s - USA govt bond selling\nYa … China is going to go bust 😂😂\nDon’t worry.  FED changes its mind like a girl changes clothes.\nAll these efforts yet China has been on a decline since the Great Leap Forward. Rofl\nDealing with over Capacity takes time ! Being Bold and Coming in at what might be a Great Price with State Assurance Could get the Ball Rolling !.\nIt was a huge mistake for China to prop up property prices.\nlike if you saw Chinese bot in comment section :D\nYes!\nIf this was their 2008 GFC moment that your guest says it is, why are the markets at year highs and pumping higher? And DO NOT say its because their economy is more  "stable" 😂😂\nJika ingin melihat China coleps  , Amerika harus mengangkat  Gordon Chang  , Peter 

In [13]:
# Plain retrieval-augmented answer (no tagging); returns the model's reply as a string.
bot.chain("give me 3 bullet points on China's economy.")

"- China is experiencing a shift from investment-led growth to consumer-led growth in order to diversify its economy.\n- The country's real estate sector is still facing challenges, which are impacting the consumer sector.\n- China's manufacturing capacity advantage has led to a significant drop in battery manufacturing prices."