In [18]:
import os
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker ##this is new
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import Pinecone,  PodSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnablePassthrough
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
from langchain_core.runnables import RunnablePassthrough,RunnableLambda
from langchain_core.pydantic_v1 import BaseModel, Field
import ast
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
load_dotenv("API_KEYS") #### don't forget to change!!!


# Structured-output schema passed to `ChatOpenAI(...).with_structured_output(...)`
# in `FinFeedRAG.llm_tagging`. Every field is typed `str`.
# NOTE(review): the `Field` descriptions are presumably forwarded to the model as
# part of the schema, i.e. they act as prompt text — confirm before rewording them.
class Classification(BaseModel):
    # Reporter-style answer to the user's question, or the fixed "not enough information" reply.
    answer: str = Field(description="""Do Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                            If you can't answer the question, reply 'I do not have enough information to answer this question' """)
    # Sentiment rating of the public comments.
    sentiment: str = Field(description="The sentiment of the comments. Rate each comment.")
    # Aggressiveness on the 1 (polite) to 10 (aggressive) scale named in the description; kept as a string.
    aggressiveness: str = Field(
        description="How aggressive the comments is on a scale from 1 to 10. 1 being polite and 10 means aggressive."
    )
    # Language the text is written in.
    language: str = Field(description="The language the text is written in")
    # Political leaning of the comments, with a justification.
    political_tendency : str = Field(description= """The political leaning of the comment. Emphasize political leaning on both economic andsocial policies. Justify your answer.""")


class FinFeedRAG:
    def __init__(self, pine_cone_api_key, openai_api_key, pinecone_index, embeddings_model= OpenAIEmbeddings(),model='gpt-3.5-turbo'):
        self.openai_api_key=openai_api_key
        self.api_key_pinecone = pine_cone_api_key
        self.pinecone_index = pinecone_index
        # Initialize Pinecone connection
        self.vector_db = None
        self.embeddings=embeddings_model
        self.model=model
        self.template_classification = """
                            Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                            If you can't answer the question, reply "I do not have enough information to answer this question".
                
                            Extract the properties mentioned in the 'Classification' function and also justify your analysis.

                            Context: {context}

                            Public Comments: {comments}
                            
                            Question: {question}
                            """
        self.template = """
                        Answer the question based on the context provided below, which is structured in a dictionary format. Assume the role of a news reporter. Each time you use information from the context, you must cite it explicitly. Cite the source accompanying each context entry by including it directly in your response. Additionally, for each context, public opinion is provided. At the end of the answer, please provide some examples from public opinion.
                        
                        Use as many contexts as possible to provide a comprehensive answer. If you lack sufficient information to formulate a response, please state: "I do not have enough information to answer this question."
                        
                        Contexts:
                        {context}
                        
                        Question:
                        {question}
                        
                        Citing the context:
                        When referencing a specific context in your answer, use the format:
                        'According to [source], ...'. For example, if drawing from the first context, you would write:
                        'According to Yahoo Finance, ...'.
                        
                        Providing public opinion:
                        At the end of your answer, include public opinion using the format:
                        "Public Opinon:\n
                        Here are some examples of people's reactions to related news: [public opinion quotes]."
                        
                        """





        self.template_prompt_engineer = """
Transform the following user query into a concise and optimized prompt suitable for retrieving relevant chunks from vector data base which consists of news on finance, economics, and politics. Ensure the rephrased prompt clearly reflects key terms and concepts from these fields to improve accuracy in data querying.
Original Query: '{question}'
"""



#####################################################
    def extract_comments(self, query):
        """Return the YouTube comments attached to the documents retrieved for ``query``.

        Runs a similarity search (k=10) against ``self.pinecone_index`` and joins
        every entry of each document's ``youtube_comments`` metadata list with
        newlines. Documents without that key contribute nothing.
        """
        store = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings,
        )
        retriever = store.as_retriever(search_type='similarity', search_kwargs={'k': 10})

        collected = [
            comment
            for doc in retriever.invoke(query)
            for comment in doc.metadata.get('youtube_comments', [])
        ]
        return "\n".join(collected)
    def prompt_tagging(self,template=None):
        """Build the chat prompt for the tagging chain.

        Uses ``self.template_classification`` when no ``template`` is given.
        """
        chosen_template = self.template_classification if template is None else template
        return ChatPromptTemplate.from_template(chosen_template)

    def tagging_chain(self, query):
        """Run the structured tagging chain for ``query`` and return its result as a dict.

        The prompt receives the retrieved documents as ``context``, the joined
        YouTube comments from ``extract_comments`` as ``comments`` and the raw
        query as ``question``; the output comes from ``llm_tagging()``.
        """
        store = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings,
        )
        retriever = store.as_retriever(search_type='similarity', search_kwargs={'k': 10})

        # Fetched once up front; the runnable below ignores its input and always yields this text
        comments_text = self.extract_comments(query)

        inputs = {
            "context": retriever,
            "comments": RunnableLambda(lambda _: comments_text),
            "question": RunnablePassthrough(),
        }
        # prompt_tagging() uses self.template_classification; llm_tagging() yields structured output
        tagger = inputs | self.prompt_tagging() | self.llm_tagging()
        structured = tagger.invoke(query)
        return structured.dict()

    
####################################Sentiment
    def get_all_vector_ids(self,  input_vector, top_k = 10000, index= None):
        """Return the ids of the ``top_k`` vectors closest to ``input_vector``.

        Parameters
        ----------
        input_vector : list of float
            Query vector passed to the index.
        top_k : int, optional
            Maximum number of matches requested.
        index : optional
            Index handle exposing ``query``. When omitted, the connection from
            ``initialize_pinecone()`` is used.

        Returns
        -------
        list
            Ids of the matches, in the order the index returned them.
        """
        if index is None:
            # self.pinecone_index is only the index *name* (a str) and has no
            # .query(); a live handle is required here.
            index = self.initialize_pinecone()
        results = index.query(vector=input_vector, top_k=top_k, include_values=False)
        return [match['id'] for match in results['matches']]


    def insert_youtube_metadata(self, top_k=10000, index= None):
        """Attach YouTube metadata and comments to every vector of an index.

        For each vector, the row of ``self.df_youtube_data`` whose
        ``txt_file_path`` (backslashes normalised to ``/``) equals the vector's
        ``source`` metadata is looked up, and its ``response_metadata`` (as a
        string) and ``list_of_comments`` are written back as
        ``youtube_response_metadata`` / ``youtube_comments``.

        Parameters
        ----------
        top_k : int, optional
            Number of ids requested from the index; must be >= the index's
            total vector count so that no vector is missed.
        index : str, optional
            Name of the index to update; defaults to ``self.pinecone_index``.

        Raises
        ------
        AttributeError
            If ``df_youtube_data`` has not been set on the instance.
        ValueError
            If ``top_k`` is smaller than the total vector count.
        IndexError
            If a vector's ``source`` has no matching row in ``df_youtube_data``.
        """
        if index is None:
            index = self.pinecone_index

        youtube_data = getattr(self, 'df_youtube_data', None)
        if youtube_data is None:
            raise AttributeError("df_youtube_data must be set on the instance before calling insert_youtube_metadata")

        # All reads and writes go through this handle; `index` itself is only a name.
        my_index = Pinecone(api_key=self.api_key_pinecone).Index(index)

        total_vector_count = my_index.describe_index_stats().total_vector_count
        if top_k < total_vector_count:
            raise ValueError(
                f"top_k ({top_k}) is less than total_vector_count ({total_vector_count}) of vector database"
            )

        # Normalise the paths once instead of twice per vector
        normalized_paths = youtube_data['txt_file_path'].apply(lambda path: path.replace('\\', '/'))

        # A zero query vector is only used as a way to enumerate ids (1536 dimensions)
        all_ids = self.get_all_vector_ids([0.0] * 1536, top_k=top_k, index=my_index)

        for vector_id in all_ids:
            old_metadata = my_index.fetch([vector_id])['vectors'][vector_id]['metadata']
            rows = youtube_data[normalized_paths == old_metadata['source']]
            if rows.empty:
                raise IndexError(f"No YouTube data found for source {old_metadata['source']!r}")
            my_index.update(
                id=vector_id,
                set_metadata={
                    'youtube_response_metadata': str(rows['response_metadata'].values[0]),
                    'youtube_comments': rows['list_of_comments'].values[0],
                },
                namespace="",
            )
  

    def llm_tagging(self, model = 'gpt-3.5-turbo'):
        """Return a chat model configured to emit ``Classification`` structured output.

        ``model`` is the OpenAI chat model name; it does not fall back to ``self.model``.
        """
        chat_model = ChatOpenAI(openai_api_key=self.openai_api_key, model=model)
        return chat_model.with_structured_output(Classification)


    
    def initialize_pinecone(self):
        """Connect to ``self.pinecone_index`` on first use and return the stored handle."""
        if self.vector_db is not None:
            # Already connected: reuse the stored handle
            return self.vector_db
        client = Pinecone(api_key=self.api_key_pinecone)
        self.vector_db = client.Index(self.pinecone_index)
        return self.vector_db
        
    
    def preprocess_youtube_text(self, text_file, chunksize,chunkoverlap, preprocess_yt=True):
        """Load ``text_file`` and split it into chunks.

        When ``preprocess_yt`` is true the file is first cleaned in place via
        ``preprocess_input(..., save_back_to_file=True)``. The text is then
        loaded with ``TextLoader`` and split by ``RecursiveCharacterTextSplitter``
        using ``chunksize`` / ``chunkoverlap``. Returns the split documents.
        """
        if preprocess_yt:
            self.preprocess_input(text_file, save_back_to_file=True)

        documents = TextLoader(text_file).load()
        chunker = RecursiveCharacterTextSplitter(chunk_size=chunksize, chunk_overlap=chunkoverlap)
        return chunker.split_documents(documents)

    def upload_to_vb(self,text,embeddings,chunksize, chunkoverlap,index=None,preprocess_yt=True):
        """Chunk a text file and upload the chunks to a Pinecone index.

        Parameters
        ----------
        text : str
            Path of the text file to upload.
        embeddings : optional
            Embeddings object used to embed the chunks. Pass ``None`` to use
            ``self.embeddings``.
        chunksize, chunkoverlap : int
            Splitter settings forwarded to ``preprocess_youtube_text``.
        index : str, optional
            Target index name; defaults to ``self.pinecone_index``.
        preprocess_yt : bool, optional
            Whether the file is cleaned in place before splitting.

        Returns
        -------
        The vector store returned by ``PineconeVectorStore.from_documents``.
        """
        if index is None:
            index = self.pinecone_index
        # Previously this argument was accepted but ignored in favour of self.embeddings.
        if embeddings is None:
            embeddings = self.embeddings
        documents = self.preprocess_youtube_text(text, chunksize, chunkoverlap, preprocess_yt)
        return PineconeVectorStore.from_documents(documents, embeddings, index_name=index)


    def preprocess_input(self, text_file,save_back_to_file=True):
        """Clean the text in ``text_file``.

        The text is lowercased, characters that are neither word characters nor
        whitespace are removed, the result is tokenised with ``word_tokenize``
        and English stop words are dropped.

        Parameters
        ----------
        text_file : str
            Path of the file to read.
        save_back_to_file : bool, optional
            When true the cleaned text overwrites ``text_file`` and ``None`` is
            returned; otherwise the file is left untouched and the cleaned text
            is returned.
        """
        with open(text_file, 'r') as file:
            text = file.read()

        processed_text = re.sub(r'[^\w\s]', '', text.lower())
        # Build the stop-word set once; looking it up per token rebuilt the list every time
        stop_words = set(stopwords.words('english'))
        # Tokens are already lowercase because the whole text was lowercased above
        filtered_words = [word for word in word_tokenize(processed_text) if word not in stop_words]
        final_text = ' '.join(filtered_words)

        if not save_back_to_file:
            return final_text

        # Write the processed content back, replacing the original
        with open(text_file, 'w') as file:
            file.write(final_text)
        return None
        
    def most_common(self, input_text_file,most_common=10):
        """Build a query string from the most frequent words of a text file.

        The file is cleaned with ``preprocess_input`` (without writing it back),
        split on whitespace, and the ``most_common`` most frequent words are
        joined with single spaces.
        """
        cleaned_text = self.preprocess_input(input_text_file, save_back_to_file=False)
        frequencies = Counter(cleaned_text.split())
        top_terms = [term for term, _count in frequencies.most_common(most_common)]
        return ' '.join(top_terms)

    def retrieve_embeddings(self, query, most_similar=2):
        """Return the stored vectors of the ``most_similar`` nearest matches to ``query``.

        Parameters
        ----------
        query : str
            Text embedded with ``self.embeddings`` and used as the search vector.
        most_similar : int, optional
            Number of matches requested.

        Returns
        -------
        list
            One ``values`` entry per match, in match order; empty when nothing matched.

        Raises
        ------
        RuntimeError
            If ``initialize_pinecone()`` has not been called yet.
        """
        if self.vector_db is None:
            raise RuntimeError("Initialize Pinecone first")

        query_vector = self.embeddings.embed_query(query)
        query_result = self.vector_db.query(vector=query_vector, top_k=most_similar)
        ids = [item['id'] for item in query_result['matches']]
        if not ids:
            return []

        # One fetch for all ids instead of one request per id
        fetched = self.vector_db.fetch(ids)['vectors']
        return [fetched[vector_id]['values'] for vector_id in ids]

    def provide_context(self, query,index=None,most_similar=10):
        """Return the ``most_similar`` documents most similar to ``query``.

        ``index`` is the index name to search; ``self.pinecone_index`` is used
        when it is omitted.
        """
        index_name = self.pinecone_index if index is None else index
        store = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=self.embeddings)
        retriever = store.as_retriever(search_type='similarity', search_kwargs={'k': most_similar})
        return retriever.invoke(query)
        
    def prompt(self,template=None):
        """Build the answer prompt, using ``self.template`` when none is given."""
        chosen_template = self.template if template is None else template
        return ChatPromptTemplate.from_template(chosen_template)

    def prompt_eng(self,template=None):
        """Build the query-rewriting prompt, using ``self.template_prompt_engineer`` when none is given."""
        chosen_template = self.template_prompt_engineer if template is None else template
        return ChatPromptTemplate.from_template(chosen_template)
        
    def llm(self,model=None):
        """Return a ``ChatOpenAI`` client for ``model``, or for ``self.model`` when omitted."""
        model_name = self.model if model is None else model
        return ChatOpenAI(openai_api_key=self.openai_api_key, model=model_name)
        
    def parser(self):
        """Return a new ``StrOutputParser`` used as the last step of the chains."""
        output_parser = StrOutputParser()
        return output_parser
    def chain_prompt_eng(self,query):
        """Rewrite ``query`` with the prompt-engineering template and return the model's text."""
        inputs = {"question": RunnablePassthrough()}
        rewrite_chain = inputs | self.prompt_eng() | self.llm() | self.parser()
        return rewrite_chain.invoke(query)

    def chain(self,query):
        """Answer ``query`` with the basic RAG chain and return the model's text.

        The retriever (similarity search, k=10, on ``self.pinecone_index``)
        supplies ``context`` and the raw query is passed through as ``question``.
        """
        store = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings,
        )
        retriever = store.as_retriever(search_type='similarity', search_kwargs={'k': 10})

        inputs = {"context": retriever, "question": RunnablePassthrough()}
        rag_chain = inputs | self.prompt() | self.llm() | self.parser()
        return rag_chain.invoke(query)
    
    def pipe(self,chunk):
        """Classify the financial-news sentiment of ``chunk``.

        The Hugging Face pipeline is created on first use and cached on the
        instance, so repeated calls (``get_sentiment`` calls this once per chunk)
        do not reload the model. Returns the pipeline's output for ``chunk``.
        """
        classifier = getattr(self, '_news_sentiment_pipeline', None)
        if classifier is None:
            classifier = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
            self._news_sentiment_pipeline = classifier
        # truncation=True keeps inputs longer than the model's limit from raising
        return classifier(chunk, truncation=True)

    def get_sentiment(self,chunks,neutrality_threshdold=0.3):
        """Get the compound sentiment of the chunks based on their individual sentiment.

        Parameters
        ----------
        chunks : list
            List of text chunks; each must expose its text as ``page_content``.
        neutrality_threshdold : float, optional
            How certain we need to be of a sentiment to classify it as positive
            or negative: the average signed score must reach +threshold
            (positive) or -threshold (negative), otherwise the result is neutral.
            With 0 the neutral range shrinks to averages strictly between -0 and
            +0, i.e. practically never neutral.

        Returns
        -------
        tuple
            ``(label, sentiments)`` where ``label`` is ``'positive'``,
            ``'neutral'`` or ``'negative'`` and ``sentiments`` is the list of
            per-chunk model outputs. An empty ``chunks`` list yields
            ``('neutral', [])``.
        """
        # Numerical value of each sentiment label, to simplify calculations
        sentiment_values = {'positive': 1, 'neutral': 0, 'negative': -1}
        # Run each chunk through the sentiment model
        sentiments = [self.pipe(chunk.page_content)[0] for chunk in chunks]
        if not sentiments:
            # Nothing to average; avoids a ZeroDivisionError
            return ('neutral', sentiments)

        # Signed score per chunk: model confidence times the value of its label
        sentiment_scores = [
            sentiment['score'] * sentiment_values[sentiment['label']]
            for sentiment in sentiments
        ]
        avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)

        if avg_sentiment_score >= neutrality_threshdold:
            return ('positive', sentiments)
        if avg_sentiment_score <= -neutrality_threshdold:
            return ('negative', sentiments)
        return ('neutral', sentiments)




   
    def chain1(self, query):
        """Answer ``query`` with retrieved context plus the documents' YouTube metadata.

        The retriever (similarity search, k=10) supplies ``context``; the
        ``youtube_response_metadata`` string of each retrieved document is
        collected into a dict keyed by rank and supplied as ``metadata``; the raw
        query is passed through as ``question``. Returns the model's text.
        """
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}  # Retrieve top 10 similar results
        )

        # The key must match what insert_youtube_metadata writes
        # ('youtube_response_metadata'); the previous spelling never matched.
        retrieved_items = retriever.invoke(query)
        dic = {
            i: item.metadata.get('youtube_response_metadata')
            for i, item in enumerate(retrieved_items)
        }

        chaining = (
            {"context": retriever,
             # RunnableLambda returns the dict; a RunnablePassthrough would forward the query instead
             "metadata": RunnableLambda(lambda _: dic),
             "question": RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )

        # Invoke the complete chain with the initial query
        return chaining.invoke(query)

    

    def chain2(self, query):
        """Answer ``query`` using source-attributed context entries.

        Each retrieved document (similarity search, k=10) becomes an entry
        ``{"text": page_content, "source": channelTitle}``, where the channel
        title is parsed from the document's ``youtube_response_metadata`` string.
        The entries are passed to the prompt as ``context`` in a dict keyed
        ``'Context 0'``, ``'Context 1'``, ... Returns the model's text.
        """
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}  # Retrieve top 10 similar results
        )

        retrieved_items = retriever.invoke(query)
        context_entries = [
            {
                "text": item.page_content,
                "source": ast.literal_eval(item.metadata['youtube_response_metadata'])['snippet']['channelTitle'],
            }
            for item in retrieved_items
        ]
        dic = {f'Context {i}': entry for i, entry in enumerate(context_entries)}

        # RunnableLambda injects the prepared dict. RunnablePassthrough(func) only calls
        # func as a side effect and forwards the query, so the context never reached the prompt.
        result = (
            {'context': RunnableLambda(lambda _: dic), 'question': RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )

        # Invoke the complete chain with the initial query
        return result.invoke(query)


    def chain3(self, query):
        """Answer ``query`` and analyse the public comments of the retrieved documents.

        Each retrieved document (similarity search, k=10) becomes a context entry
        with its text, its channel title (parsed from
        ``youtube_response_metadata``) and its own ``youtube_comments``. The
        comments of all documents are also joined into one text for the tagging
        chain, and the document texts are passed to
        ``analyze_and_plot_sentiment``.

        Returns
        -------
        tuple
            ``(answer, sentiment, aggressiveness, political_tendency,
            youtube_urls, ax, fig)``.
        """
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}  # Retrieve top 10 similar results
        )

        retrieved_items = retriever.invoke(query)
        context_entries = []
        comments = []
        youtube_urls = []
        contexts = []

        for item in retrieved_items:
            # Copy the per-document comments into their own list: reusing the aggregate
            # name made the list extend itself and dropped earlier documents' comments.
            item_comments = list(item.metadata.get('youtube_comments', []))
            channel = ast.literal_eval(item.metadata['youtube_response_metadata'])['snippet']['channelTitle']
            context_entries.append({
                "text": item.page_content,
                "source": channel,
                "public_opinion": item_comments,
            })
            comments.extend(item_comments)
            youtube_urls.append(item.metadata['youtube_video_url'])
            contexts.append(item.page_content)

        comments_str = "\n".join(comments)
        dic = {f'Context {i}': entry for i, entry in enumerate(context_entries)}

        # RunnableLambda injects the prepared dict. RunnablePassthrough(func) only calls
        # func as a side effect and forwards the query, so the context never reached the prompt.
        result = (
            {'context': RunnableLambda(lambda _: dic), 'question': RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )

        sentiment_chain = (
            {
                "context": retriever,
                # Same newline-joined form that tagging_chain passes to this prompt
                "comments": RunnableLambda(lambda _: comments_str),
                "question": RunnablePassthrough()
            }
            | self.prompt_tagging()  # uses self.template_classification
            | self.llm_tagging()  # for structured output
        )
        dic_sentiment = sentiment_chain.invoke(query).dict()

        fig, ax = self.analyze_and_plot_sentiment(contexts)
        return (
            result.invoke(query),
            dic_sentiment['sentiment'],
            dic_sentiment['aggressiveness'],
            dic_sentiment['political_tendency'],
            youtube_urls,
            ax,
            fig,
        )


    def analyze_and_plot_sentiment(self,contexts):
        """Classify each text with FinBERT and draw a bar chart of the label counts.

        Parameters
        ----------
        contexts : list of str
            Texts to classify.

        Returns
        -------
        tuple
            ``(fig, ax)`` of the bar chart. The figure is closed in pyplot before
            returning so it is not displayed automatically.
        """
        # Load the tokenizer/model once per instance instead of on every call
        sentiment_analyzer = getattr(self, '_finbert_analyzer', None)
        if sentiment_analyzer is None:
            tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
            model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
            sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
            self._finbert_analyzer = sentiment_analyzer

        # truncation=True keeps texts longer than the model's limit from raising
        labels = [sentiment_analyzer(text, truncation=True)[0]["label"] for text in contexts]

        # Count the sentiment labels
        sentiment_counts = pd.Series(labels, name='labels', dtype=object).value_counts()

        plt.style.use('seaborn-v0_8-dark')
        fig, ax = plt.subplots(figsize=(8, 6))

        # Fixed colour per label; any label other than positive/negative gets the third colour
        palette = {'positive': '#4c72b0', 'negative': '#c44e52'}
        colors = [palette.get(label, '#dd8452') for label in sentiment_counts.index]

        ax.bar(sentiment_counts.index, sentiment_counts.values, color=colors)

        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Number of Chunks')
        ax.set_title('Sentiment Analysis of Context Chunks')
        ax.set_xticks(range(len(sentiment_counts)))
        ax.set_xticklabels(sentiment_counts.index, rotation=0)
        ax.grid(True)

        # White background for both figure and axes
        fig.patch.set_facecolor('white')
        ax.set_facecolor('white')

        # Lay out this figure explicitly rather than whichever figure is current
        fig.tight_layout()

        # Close the figure to prevent it from displaying automatically
        plt.close(fig)

        return fig, ax

    def chain_for_eval(self, query):
        """Answer ``query`` from source-attributed context entries (evaluation variant).

        Each retrieved document (similarity search, k=10) becomes a context entry
        with its text, its channel title (parsed from
        ``youtube_response_metadata``) and its own ``youtube_comments``; the
        entries are passed to the prompt as ``context`` in a dict keyed
        ``'Context 0'``, ``'Context 1'``, ... Returns only the model's text.
        """
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}  # Retrieve top 10 similar results
        )

        retrieved_items = retriever.invoke(query)
        context_entries = []
        for item in retrieved_items:
            # Each entry gets its own copy of the document's comments; previously the
            # list was aliased and extended with itself, duplicating every comment.
            item_comments = list(item.metadata.get('youtube_comments', []))
            channel = ast.literal_eval(item.metadata['youtube_response_metadata'])['snippet']['channelTitle']
            context_entries.append({
                "text": item.page_content,
                "source": channel,
                "public_opinion": item_comments,
            })

        dic = {f'Context {i}': entry for i, entry in enumerate(context_entries)}

        # RunnableLambda injects the prepared dict. RunnablePassthrough(func) only calls
        # func as a side effect and forwards the query, so the context never reached the prompt.
        result = (
            {'context': RunnableLambda(lambda _: dic), 'question': RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )

        # Invoke the complete chain with the initial query
        return result.invoke(query)

            

    
     

In [19]:
# Build the RAG bot against the 'latest-news' Pinecone index; both API keys are read from environment variables.
bot = FinFeedRAG(pine_cone_api_key=os.getenv('PINECONE_API_KEY'), openai_api_key=os.getenv('OPENAI_API_KEY'), pinecone_index='latest-news')


In [24]:
# Sample user questions for model_evaluation().
# NOTE(review): no call visible in this notebook passes this list — confirm it is still needed.

user_prompts=['what is going on with inmigrants','what is the capital of the US?', 'has inflation gone up?']

def model_evaluation(user_prompts):
    """Compare FinFeedRAG answers against a plain gpt-3.5-turbo baseline.

    For every prompt the baseline model and ``FinFeedRAG.chain_for_eval`` each
    produce an answer, and the same model is then asked which answer is more up
    to date. A second pass asks the model to reduce each evaluation to the
    number of the preferred answer (1 = baseline, 2 = FinFeedRAG).

    Parameters
    ----------
    user_prompts : list of str
        Questions to evaluate.

    Returns
    -------
    tuple
        ``(benchmark_answers, our_answers, evaluations, preferred_answers)``:
        three lists aligned with ``user_prompts`` holding answer strings, answer
        strings and evaluation messages, plus a list of ints.

    Raises
    ------
    ValueError
        If the model's reply for a preferred answer contains no integer.
    """
    evaluations = []
    our_answers = []
    benchmark_answers = []
    preferred_answers = []

    # Initialize our model
    bot = FinFeedRAG(pine_cone_api_key=os.getenv('PINECONE_API_KEY'), openai_api_key=os.getenv('OPENAI_API_KEY'), pinecone_index='latest-news')

    # One deterministic model serves as baseline, judge and extractor
    benchmark_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Answers are supplied as template variables, not concatenated into the template
    # text, so braces inside an answer are not parsed as placeholders.
    comparison_chain = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful finance bot evaluator. You will be given a user prompt, and the corresponding answers of two different bots. Your job is to choose the most up to date answer where today's year is 2024 . "
                   "        Please provide a reason for your choice."),
        ("system", "User_prompt: {user_prompt}"),
        ("system", "Answer 1: {benchmark_answer}"),
        ("system", "Answer 2: {our_answer}")]) | benchmark_model

    extraction_chain = ChatPromptTemplate.from_messages([
        ("system", "Determine what is the user preferred answer. Please restrict your output to a single integer."),
        ("system", "User_prompt: {evaluation}")]) | benchmark_model

    for user_prompt in user_prompts:
        # Benchmark answer
        benchmark_answer = benchmark_model.invoke(user_prompt).content

        # Our answer
        our_answer = bot.chain_for_eval(user_prompt)

        # Evaluation
        evaluation = comparison_chain.invoke({
            "user_prompt": user_prompt,
            "benchmark_answer": benchmark_answer,
            "our_answer": our_answer,
        })

        our_answers.append(our_answer)
        benchmark_answers.append(benchmark_answer)
        evaluations.append(evaluation)

    # Extract the preferred answer as an integer
    for evaluation in evaluations:
        reply = extraction_chain.invoke({"evaluation": evaluation.content}).content
        # Tolerate replies such as "Answer 2" by taking the first integer found
        number = re.search(r'\d+', reply)
        if number is None:
            raise ValueError(f"Could not extract a preferred answer from: {reply!r}")
        preferred_answers.append(int(number.group()))

    return benchmark_answers, our_answers, evaluations, preferred_answers
    

In [20]:
answer=bot.chain_for_eval("Is inflation cooling in US")

In [21]:
answer

'According to CNBC, the latest data shows that inflation in the US is starting to cool down. The Consumer Price Index (CPI) rose by 0.3% in August, which is lower than the 0.5% increase in July. This indicates that inflation may be slowing down.\n\nPublic Opinion:\n\nMany Americans are relieved to hear that inflation may be cooling down. One person said, "I\'ve been feeling the pinch in my wallet lately, so any relief from high inflation is welcome." Another individual mentioned, "I hope this trend continues and prices start to stabilize."'

In [25]:
model_evaluation(["Is inflation cooling in US"])

(['As of the most recent data, inflation in the US has been on the rise. The Consumer Price Index (CPI) increased by 0.9% in June 2021, marking the largest monthly increase in over a decade. This has raised concerns about rising inflationary pressures in the economy. However, it is important to note that inflation can be influenced by various factors and can fluctuate over time. It is important to monitor economic indicators and data to assess whether inflation is cooling or continuing to rise.'],
 ['According to the context provided by Yahoo Finance, inflation in the US may be cooling down. This suggests a potential decrease in the overall rise in prices for goods and services.\n\nPublic Opinion:\n\nHere are some examples of people\'s reactions to related news:\n1. "It\'s a relief to hear that inflation might be cooling down. It has been tough managing expenses with prices constantly going up." - Jane Doe\n2. "I hope this trend continues and we see some stability in prices soon. It wo

In [4]:
contexts[0].page_content

'daytoday movement stock market know know news perspective obviously exciting trying stay business john weve got keep lights yeah exactly investors fall trap something dad instilled nothing action'

In [63]:
import pinecone
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd



# Load the FinBERT tokenizer and classification model once for this cell and
# wrap them in a sentiment-analysis pipeline that is passed to analyze_and_plot_sentiment below.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


def analyze_and_plot_sentiment(contexts, sentiment_analyzer, do_answer=True):
    """Classify each document's ``page_content`` and plot the label counts as a bar chart.

    ``sentiment_analyzer`` is called once per document and the ``label`` of its
    first result is used. ``do_answer`` is accepted but not used.
    Returns ``(fig, ax)``.
    """
    labels = [sentiment_analyzer(doc.page_content)[0]["label"] for doc in contexts]
    data_sent = pd.DataFrame({'labels': labels})

    plt.style.use('seaborn-v0_8-dark')

    # How often each sentiment label occurs
    sentiment_counts = data_sent['labels'].value_counts()

    fig, ax = plt.subplots(figsize=(8, 6))

    # Fixed colour per label; any label other than positive/negative gets the third colour
    palette = {'positive': '#4c72b0', 'negative': '#c44e52'}
    colors = [palette.get(label, '#dd8452') for label in sentiment_counts.index]

    sentiment_counts.plot(kind='bar', color=colors, ax=ax)

    # Labels, title, ticks and grid
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Number of Chunks')
    ax.set_title('Sentiment Analysis of Context Chunks')
    ax.set_xticks(range(len(sentiment_counts)))
    ax.set_xticklabels(sentiment_counts.index, rotation=0)
    ax.grid(True)

    # White background for both figure and axes
    fig.patch.set_facecolor('white')
    ax.set_facecolor('white')

    plt.tight_layout()

    return fig, ax



In [116]:
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

def _load_finbert_analyzer():
    """Return the FinBERT sentiment pipeline, building it only on first use."""
    analyzer = getattr(_load_finbert_analyzer, "_cached", None)
    if analyzer is None:
        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
        # Keep the pipeline on the function object so repeated calls do not
        # reload the tokenizer and model.
        _load_finbert_analyzer._cached = analyzer
    return analyzer


def analyze_and_plot_sentiment(contexts, sentiment_analyzer=None):
    """Classify each text in ``contexts`` and chart the number of texts per label.

    Args:
        contexts: Iterable of texts passed one by one to the analyzer.
        sentiment_analyzer: Optional callable applied to each text; the label
            is read from ``result[0]["label"]``. When omitted, a FinBERT
            pipeline is created on first use and reused afterwards.

    Returns:
        tuple: ``(fig, ax)``, the Matplotlib figure and axes of the bar chart.
        The figure is closed before returning, so it is not displayed
        automatically.
    """
    if sentiment_analyzer is None:
        sentiment_analyzer = _load_finbert_analyzer()

    # Analyze sentiment for each context
    labels = [sentiment_analyzer(item)[0]["label"] for item in contexts]

    # value_counts orders labels by descending frequency; bars and colours
    # follow that order.
    sentiment_counts = pd.Series(labels, dtype=object, name='labels').value_counts()

    # NOTE: this switches the global Matplotlib style, so later plots are
    # affected as well.
    plt.style.use('seaborn-v0_8-dark')

    fig, ax = plt.subplots(figsize=(8, 6))

    # One colour per label; anything that is neither positive nor negative
    # falls back to the third colour.
    palette = {'positive': '#4c72b0', 'negative': '#c44e52'}
    colors = [palette.get(label, '#dd8452') for label in sentiment_counts.index]

    # Plot the sentiment counts as a bar plot
    ax.bar(sentiment_counts.index, sentiment_counts.values, color=colors)

    # Customize the plot
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Number of Chunks')
    ax.set_title('Sentiment Analysis of Context Chunks')
    ax.set_xticks(range(len(sentiment_counts)))
    ax.set_xticklabels(sentiment_counts.index, rotation=0)
    ax.grid(True)

    # Force a white background on both the figure and the axes.
    fig.patch.set_facecolor('white')
    ax.set_facecolor('white')

    plt.tight_layout()

    # Close the figure to prevent it from displaying automatically
    plt.close(fig)

    return fig, ax

# Example usage: two plain-text snippets are classified and charted.
contexts = ["The market is doing well.", "I am not happy with the company's performance."]
fig, ax = analyze_and_plot_sentiment(contexts)

# The function closes the figure before returning it, so nothing is rendered here.
# NOTE(review): fig.show() may not work on a figure already closed with plt.close();
# in a notebook, evaluating `fig` as the last expression of a cell is likely the safer way to display it. Confirm.
# fig.show() # Uncomment this line to display the plot when needed


In [117]:
ax

<Axes: title={'center': 'Sentiment Analysis of Context Chunks'}, xlabel='Sentiment', ylabel='Number of Chunks'>

In [16]:
import ast

# Fetch the chunks that best match the query.
retrieved_items = retriever.invoke("how is Stock market doing")

# Collect, per retrieved chunk, its text, the channel it came from and its comments.
metadata = []
for doc in retrieved_items:
    text, doc_comments = doc.page_content, doc.metadata["youtube_comments"]
    # The YouTube response is stored as the string form of a dict, so it is
    # parsed with literal_eval before the channel title is read.
    channel = ast.literal_eval(doc.metadata['youtube_response_metadata'])['snippet']['channelTitle']
    metadata.append({"text": text, "source": channel, "comment": doc_comments})

# Key every entry by its retrieval position: 'Context 0', 'Context 1', ...
dic = {f'Context {idx}': entry for idx, entry in enumerate(metadata)}


In [17]:
dic

{'Context 0': {'text': 'daytoday movement stock market know know news perspective obviously exciting trying stay business john weve got keep lights yeah exactly investors fall trap something dad instilled nothing action',
  'source': 'Yahoo Finance',
  'comment': ["Everything's expensive, value no longer exists."]},
 'Context 1': {'text': 'food inflation earnings strong youre starting see stocks roll thats stock like walmart juggernaut theyve done amazing job inventory stores especially grocery side stock particular think bucking trend',
  'source': 'Yahoo Finance',
  'comment': []}}

In [8]:
# Similarity retriever over the existing "day1" index.
retriever = PineconeVectorStore.from_existing_index(
            index_name="day1",
            embedding=OpenAIEmbeddings()
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 2}  # Retrieve the top 2 most similar results
        )
    
# Invoke the retriever with the query; the documents are kept as returned.
retrieved_items = retriever.invoke("how is Stock market doing")

In [9]:
retrieved_items

[Document(page_content='daytoday movement stock market know know news perspective obviously exciting trying stay business john weve got keep lights yeah exactly investors fall trap something dad instilled nothing action', metadata={'source': 'data/text/what_it_takes_to_be_a_value_investor_today.txt', 'youtube_comments': ["Everything's expensive, value no longer exists."], 'youtube_response_metadata': "{'id': {'kind': 'youtube#video', 'videoId': 'Bv5B5U9Gkk4'}, 'snippet': {'publishedAt': '2024-05-17T13:09:10Z', 'channelId': 'UCEAZeUIeJs0IjQiqTCdVSIg', 'title': 'What it takes to be a value investor today', 'channelTitle': 'Yahoo Finance'}}", 'youtube_video_url': 'https://www.youtube.com/watch?v=Bv5B5U9Gkk4'}),
 Document(page_content='food inflation earnings strong youre starting see stocks roll thats stock like walmart juggernaut theyve done amazing job inventory stores especially grocery side stock particular think bucking trend', metadata={'source': 'data/text/stocks_tread_water_with_d

In [12]:
retrieved_items[0].metadata['youtube_video_url']

'https://www.youtube.com/watch?v=Bv5B5U9Gkk4'

In [14]:
retrieved_items[0].metadata["youtube_comments"]


["Everything's expensive, value no longer exists."]

In [35]:
a+'. Source: '+b

'dont think theres much happy talk happy talk look markets bullion markets effusive fact generally still good global growth yes cracks showing overall still pretty strong companies making good profits. Source: Bloomberg Television'

In [135]:
# Parse each document's stringified YouTube metadata back into a dict.
# The key is spelled 'youtube_reponse_metadata' to match the key shown on
# the stored document metadata.
metadata = [
    ast.literal_eval(doc.metadata['youtube_reponse_metadata'])
    for doc in retrieved_items
]

# Combine the parsed metadata into a dictionary keyed by retrieval position.
dic = dict(enumerate(metadata))
    

In [136]:
dic

{0: {'id': {'kind': 'youtube#video', 'videoId': 'HY583QRI5v0'},
  'snippet': {'publishedAt': '2024-05-17T11:21:26Z',
   'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg',
   'title': 'Inflation Fears, Richemont&#39;s New &#39;Bos&#39; | Bloomberg Markets Today 05/17',
   'channelTitle': 'Bloomberg Television'}},
 1: {'id': {'kind': 'youtube#video', 'videoId': 'Bv5B5U9Gkk4'},
  'snippet': {'publishedAt': '2024-05-17T13:09:10Z',
   'channelId': 'UCEAZeUIeJs0IjQiqTCdVSIg',
   'title': 'What it takes to be a value investor today',
   'channelTitle': 'Yahoo Finance'}},
 2: {'id': {'kind': 'youtube#video', 'videoId': 'jyOVRQIrIpc'},
  'snippet': {'publishedAt': '2024-05-17T16:15:14Z',
   'channelId': 'UCEAZeUIeJs0IjQiqTCdVSIg',
   'title': 'Stocks tread water with Dow aiming for 40,000 | May 17 Yahoo Finance',
   'channelTitle': 'Yahoo Finance'}},
 3: {'id': {'kind': 'youtube#video', 'videoId': 'jyOVRQIrIpc'},
  'snippet': {'publishedAt': '2024-05-17T16:15:14Z',
   'channelId': 'UCEAZeUIeJs0IjQiqTCdVS

In [63]:
# Build the bot with API keys read from the environment, then query it.
bot = FinFeedRAG(
    pine_cone_api_key=os.getenv('PINECONE_API_KEY'),
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    pinecone_index='day1',
)
bot.chain2("how is Stock market doing")

In [64]:
bot.chain2("how is Stock market doing")

'As of today, the stock market is performing well. According to a report from CNBC, the S&P 500 and Dow Jones Industrial Average both reached record highs this week due to positive earnings reports from major companies. Additionally, the Nasdaq Composite also saw gains thanks to strong tech stocks. Overall, investors seem optimistic about the current state of the stock market.'

In [39]:
bot.chain("how is  US economy")

'As a news reporter, based on the latest information received, the US economy seems to have surprised many people with its upside. There are signals of growth and positive indicators contributing to the strength of the economy. The availability of labor and various factors like industrial policy, innovation, and AI advancements are creating a powerful economic environment. Overall, the US economy appears to be in a positive state with potential for further growth.'

In [92]:
bot.chain_prompt_eng("US stock market doing good?")

'Retrieve data on the performance of the US stock market from the vector database, focusing on news related to finance, economics, and politics.'

In [97]:
bot.chain("give me 10 news on Stock market")

'I do not have enough information to answer this question.'

In [10]:
bot.provide_context("what are the news on finance, economics, and politics.")

[Document(page_content='much politics matters imf world bank meetings weeks ago struck many analysis said decisions politicians make really matter politics economics think see moment comes conduct trade policy future ensure', metadata={'source': "data/text/georgia's_eu_dream_at_risk,_china_housing_crisis_rescue_plan__bloomberg_the_pulse_051724.txt", 'youtube_reponse_metadata': "{'id': {'kind': 'youtube#video', 'videoId': '9NIb2XuW8UY'}, 'snippet': {'publishedAt': '2024-05-17T13:27:14Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Georgia&#39;s EU Dream at Risk, China Housing Crisis Rescue Plan | Bloomberg The Pulse 05/17/24', 'channelTitle': 'Bloomberg Television'}}"}),
 Document(page_content='making headlines getting lot attention lets start ben make market think headed yeah mean look yesterdays inflation numbers news fairly good economy remains strong inflation moderating think equities', metadata={'source': "data/text/markets_'priced_to_perfection,'_but_be_cautious_strategist

In [12]:
# One-off retrieval: the 10 chunks most similar to the query, kept in `a`.
a = (
    PineconeVectorStore
    .from_existing_index(index_name='day1', embedding=OpenAIEmbeddings())
    .as_retriever(search_type='similarity', search_kwargs={'k': 10})
    .invoke("what are the news on finance, economics, and politics.")
)

In [61]:
# Similarity retriever over the "day1" index, asking for 10 results per query.
# NOTE(review): include_metadata is passed to as_retriever as an extra
# keyword; it is unclear from here whether it has any effect. Confirm.
retriever = PineconeVectorStore.from_existing_index(
    index_name="day1",
    embedding=OpenAIEmbeddings(),
).as_retriever(
    search_type='similarity',
    search_kwargs={'k': 10},
    include_metadata=True,
)

In [19]:
b=retriever.invoke("what are the news on finance, economics, and politics.")

In [20]:
ast.literal_eval(b[0].metadata['youtube_reponse_metadata'])

{'id': {'kind': 'youtube#video', 'videoId': '9NIb2XuW8UY'},
 'snippet': {'publishedAt': '2024-05-17T13:27:14Z',
  'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg',
  'title': 'Georgia&#39;s EU Dream at Risk, China Housing Crisis Rescue Plan | Bloomberg The Pulse 05/17/24',
  'channelTitle': 'Bloomberg Television'}}

In [16]:
import ast

In [18]:
ast.literal_eval(b[0].metadata['youtube_reponse_metadata'])

NameError: name 'b' is not defined

In [90]:
dic ={}

In [92]:
# Store each document's raw (still stringified) YouTube metadata under its
# retrieval position. The list is built in full first, so `dic` is left
# untouched if any document lacks the key.
dic.update(enumerate([doc.metadata['youtube_reponse_metadata'] for doc in b]))
    

In [93]:
dic

{0: "{'id': {'kind': 'youtube#video', 'videoId': '9NIb2XuW8UY'}, 'snippet': {'publishedAt': '2024-05-17T13:27:14Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Georgia&#39;s EU Dream at Risk, China Housing Crisis Rescue Plan | Bloomberg The Pulse 05/17/24', 'channelTitle': 'Bloomberg Television'}}",
 1: "{'id': {'kind': 'youtube#video', 'videoId': 'bLNUUcYRv9g'}, 'snippet': {'publishedAt': '2024-05-16T22:00:32Z', 'channelId': 'UCEAZeUIeJs0IjQiqTCdVSIg', 'title': 'Markets &#39;priced to perfection,&#39; but be cautious: Strategist', 'channelTitle': 'Yahoo Finance'}}",
 2: "{'id': {'kind': 'youtube#video', 'videoId': 'nvuS4UMeu7Q'}, 'snippet': {'publishedAt': '2024-05-17T05:24:52Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Bloomberg Markets: Asia 05/17/2024', 'channelTitle': 'Bloomberg Television'}}",
 3: "{'id': {'kind': 'youtube#video', 'videoId': 'ywnQZd_hawY'}, 'snippet': {'publishedAt': '2024-05-17T05:39:26Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'China

In [81]:
[v.metadata['youtube_reponse_metadata'] for v in b]

["{'id': {'kind': 'youtube#video', 'videoId': '9NIb2XuW8UY'}, 'snippet': {'publishedAt': '2024-05-17T13:27:14Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Georgia&#39;s EU Dream at Risk, China Housing Crisis Rescue Plan | Bloomberg The Pulse 05/17/24', 'channelTitle': 'Bloomberg Television'}}",
 "{'id': {'kind': 'youtube#video', 'videoId': 'bLNUUcYRv9g'}, 'snippet': {'publishedAt': '2024-05-16T22:00:32Z', 'channelId': 'UCEAZeUIeJs0IjQiqTCdVSIg', 'title': 'Markets &#39;priced to perfection,&#39; but be cautious: Strategist', 'channelTitle': 'Yahoo Finance'}}",
 "{'id': {'kind': 'youtube#video', 'videoId': 'nvuS4UMeu7Q'}, 'snippet': {'publishedAt': '2024-05-17T05:24:52Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Bloomberg Markets: Asia 05/17/2024', 'channelTitle': 'Bloomberg Television'}}",
 "{'id': {'kind': 'youtube#video', 'videoId': 'ywnQZd_hawY'}, 'snippet': {'publishedAt': '2024-05-17T05:39:26Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'China Regulators 

In [34]:
a[0].metadata

{'source': "data/text/georgia's_eu_dream_at_risk,_china_housing_crisis_rescue_plan__bloomberg_the_pulse_051724.txt",
 'youtube_reponse_metadata': "{'id': {'kind': 'youtube#video', 'videoId': '9NIb2XuW8UY'}, 'snippet': {'publishedAt': '2024-05-17T13:27:14Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Georgia&#39;s EU Dream at Risk, China Housing Crisis Rescue Plan | Bloomberg The Pulse 05/17/24', 'channelTitle': 'Bloomberg Television'}}"}

In [None]:
# Scratch expression from a cell with no execution count: builds a
# similarity retriever that returns 10 results.
# NOTE(review): it reads `self.pinecone_index` and `self.embeddings`, so it
# presumably belongs inside a FinFeedRAG method. Confirm.
PineconeVectorStore.from_existing_index(index_name=self.pinecone_index,embedding=self.embeddings).as_retriever(search_type='similarity',
                search_kwargs={
                'k': 10})