In [72]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import Pinecone,  PodSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnablePassthrough,RunnableLambda
from langchain_core.pydantic_v1 import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
import pandas as pd
from openai import OpenAI
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import ast
# load_dotenv("API_KEYS")

In [73]:
# load environment variables from .env file
load_dotenv("API_KEYS.env")

# get the environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')

In [75]:
class Classification(BaseModel):
    answer: str = Field(description="""Do Answer the question based on the context below but pretend like you are a news reporter who just received the context
                         as the latest news. 
                        If you can't answer the question, reply 'I do not have enough information to answer this question' """)
    sentiment: str = Field(description="The sentiment of the comments. Rate each comment.")
    aggressiveness: str = Field(description="How aggressive the comments is on a scale from 1 to 10. 1 being polite and 10 means aggressive."
    )
    language: str = Field(description="The language the text is written in")
    political_tendency : str = Field(description= """The political leaning of the comment. Emphasize political leaning on both economic andsocial policies. 
                                     Justify your answer.""")


class FinFeedRAG:
    def __init__(self, pinecone_index,pine_cone_api_key= pinecone_api_key, openai_api_key= openai_api_key,
                  embeddings_model= OpenAIEmbeddings(),model='gpt-3.5-turbo', path_to_df = 'combined_df.pkl'):
        self.openai_api_key=openai_api_key
        self.api_key_pinecone = pine_cone_api_key
        self.pinecone_index = pinecone_index
        # Initialize Pinecone connection
        self.vector_db = None
        self.embeddings=embeddings_model
        self.model=model
        #read latest dataframe
        self.df_youtube_data= pd.read_pickle(path_to_df)
        self.template_classification = """
                            Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                            If you can't answer the question, reply "I do not have enough information to answer this question".
                
                            Extract the properties mentioned in the 'Classification' function and also justify your analysis.

                            Context: {context}

                            Public Comments: {comments}
                            
                            Question: {question}
                            """
        self.template = """
                        Answer the question based on the context provided below, which is structured in a dictionary format. Assume the role of a news reporter.
                        Each time you use information from the context, you must cite it explicitly. Cite the source accompanying each context entry by including 
                        it directly in your response. Additionally, for each context, comments from public is provided. At the end of the answer, please provide some 
                        examples from people's opinion.
                        
                        Use as many contexts as possible to provide a comprehensive answer. If you lack sufficient information to formulate a response, please state:
                        "I do not have enough information to answer this question."
                        
                        Contexts:
                        {context}
                        
                        Question:
                        {question}
                        
                        Citing the context:
                        When referencing a specific context in your answer, use the format:
                        'According to [source], ...'. For example, if drawing from the first context, you would write:
                        'According to Yahoo Finance, ...'.
                        
                        Providing people's opinion:
                        At the end of your answer, include public opinion using the format:
                        "Public Opinon:\n
                        Here are some examples of people's reactions to related news: [public opinion quotes]."
                        
                        """
        
        self.template_prompt_engineer = """
Transform the following user query into a concise and optimized prompt suitable for retrieving relevant chunks from vector data base which consists of news on 
finance, economics, and politics. Ensure the rephrased prompt clearly reflects key terms and concepts from these fields to improve accuracy in data querying.
Original Query: '{question}'
"""



#####################################################
    def extract_comments(self, query):
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}
        )

        documents = retriever.invoke(query)
        comments = []
        for doc in documents:
            comments.extend(doc.metadata.get('youtube_comments', []))
        comments_str = "\n".join(comments)
        return comments_str
    
    def prompt_tagging(self,template=None):
        if template is None:
            template = self.template_classification
        return ChatPromptTemplate.from_template(template)

    def tagging_chain(self, query):
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}
        )

        comments = self.extract_comments(query)
        comments_runnable = RunnableLambda(lambda _: comments)

        chaining = (
            {
                "context": retriever,
                "comments": comments_runnable,
                "question": RunnablePassthrough()
            }
            | self.prompt_tagging() #uses self.template_classification
            | self.llm_tagging() #for structured output
        )

        return chaining.invoke(query).dict()

    
####################################Sentiment
    def get_all_vector_ids(self,  input_vector= [0.0] * 1536, top_k = 10000, index= None):   # Replace 768 with your vector dimension
        if index is None:
            index = self.pinecone_index
            
        results = index.query(vector=input_vector, top_k= top_k, include_values=False)
        # ids = set()
        ids=[]
        for result in results['matches']:
            # ids.add(result['id'])
            ids.append(result['id'])
        return ids


    def insert_youtube_metadata(self, top_k=10000, index= None):
        """Set top_k >= total_vector_count of vector database
        """
        if index is None:
            index= self.pinecone_index
        assert top_k > self.vector_db.describe_index_stats().total_vector_count, "top_k is less that total_vector_count of vector database"

        #my_index = Pinecone(self.api_key_pinecone).Index(index)
        all_ids = self.get_all_vector_ids([0.0] * 1536, top_k= top_k , index=index)  

        for id in all_ids:
        
            old_metadata = index.fetch([id])['vectors'][id]['metadata']
            index.update(
            id= id, 
            set_metadata={'youtube_response_metadata': str(self.df_youtube_data[self.df_youtube_data['txt_file_path'].apply(lambda x : x.replace('\\','/')) == 
                                                                old_metadata['source']]['response_metadata'].values[0]),
                        'youtube_comments': self.df_youtube_data[self.df_youtube_data['txt_file_path'].apply(lambda x : x.replace('\\','/')) 
                                                                 == old_metadata['source']]['list_of_comments'].values[0],
                        'youtube_video_url': str(self.df_youtube_data[self.df_youtube_data['txt_file_path'].apply(lambda x : x.replace('\\','/')) 
                                                                 == old_metadata['source']]['video_urls'].values[0])  
        }, 
            namespace=""
        )
  

    def llm_tagging(self, model = 'gpt-3.5-turbo'):
       
        llm = ChatOpenAI(openai_api_key=self.openai_api_key, model=model).with_structured_output(Classification)
        return llm

    
    def initialize_pinecone(self):
        if self.vector_db is None:  # Check if it's already initialized
            pc = Pinecone(api_key=self.api_key_pinecone)
            self.vector_db = pc.Index(self.pinecone_index)  # Connect to the index and store the connection
        return self.vector_db
        
    
    def preprocess_youtube_text(self, text_file, chunksize,chunkoverlap, preprocess_yt=True):
        if preprocess_yt:
            self.preprocess_input(text_file,save_back_to_file=True)
        else:
            None
        loader = TextLoader(text_file) #text instance of langchain
        text_documents = loader.load() 
        # Assuming RecursiveCharacterTextSplitter is a class you have access to or have created
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunksize, chunk_overlap=chunkoverlap)
        processed_text = splitter.split_documents(text_documents)
        # Further processing can be done here if necessary
        return processed_text

    def upload_to_vb(self,text,embeddings,chunksize, chunkoverlap,index=None,preprocess_yt=True):
        if index is None:
            index = self.pinecone_index
        return PineconeVectorStore.from_documents(self.preprocess_youtube_text(text,chunksize,chunkoverlap,preprocess_yt), self.embeddings, index_name=index)

    def add_to_vb(self,text,chunksize, chunkoverlap,index=None,preprocess_yt=True):
        if index is None:
            index = self.pinecone_index
        embed= self.embeddings.embed_documents(self.preprocess_youtube_text(text,chunksize,chunkoverlap,preprocess_yt))
        return PineconeVectorStore.add_documents(embed, index_name=index)   

    def preprocess_input(self, text_file,save_back_to_file=True):
        # Simple text preprocessing: lowercasing, removing punctuation need to add more preprocessing steps do research on it
        # Read and process the content and rewrite it
        if save_back_to_file==True:
            with open(text_file, 'r') as file:
                # Read the contents of the file
                text = file.read()
            processed_text = text.lower()
            processed_text = re.sub(r'[^\w\s]', '', processed_text)
            tokens = word_tokenize(processed_text)
            filtered_words = [word for word in tokens if word.lower() not in stopwords.words('english')]
            # Join words back into a single string
            final_text = ' '.join(filtered_words)
            # Write the processed content back, replacing the original
            with open(text_file, 'w') as file:
                file.write(final_text)
        else:
            with open(text_file, 'r') as file:
                # Read the contents of the file
                text = file.read()
            processed_text = text.lower()
            processed_text = re.sub(r'[^\w\s]', '', processed_text)
            tokens = word_tokenize(processed_text)
            filtered_words = [word for word in tokens if word.lower() not in stopwords.words('english')]
            # Join words back into a single string
            final_text = ' '.join(filtered_words)
            return final_text
        
    def most_common(self, input_text_file,most_common=10):
        # Preprocess the text
        processed_text = self.preprocess_input(input_text_file,save_back_to_file=False)    
        # Extract keywords based on frequency, assuming more frequent terms are more relevant
        words = processed_text.split()
        word_freq = Counter(words)
        common_words = word_freq.most_common(most_common)  # Get the top 5 words       
        # Form a query by joining the most common words
        query = ' '.join(word for word, _ in common_words)
        return query

    def retrieve_embeddings(self, query, most_similar=10):
        assert self.vector_db is not None, "Initialize Pinecone first"
        query_result = self.vector_db.query(vector=self.embeddings.embed_query(query), top_k=most_similar)
        ids = [item['id'] for item in query_result['matches']]
        return [self.vector_db.fetch(ids)['vectors'][id]['values'] for id in ids]

    def provide_context(self, query,index=None,most_similar=10):
        if index is None:
            index = self.pinecone_index
        # Provide context to LLM
        return PineconeVectorStore.from_existing_index(index_name=index,embedding=self.embeddings).as_retriever(search_type='similarity',
                search_kwargs={
                'k':most_similar}).invoke(query)
        
    def prompt(self,template=None):
        if template is None:
            template = self.template
        return ChatPromptTemplate.from_template(template)

    def prompt_eng(self,template=None):
        if template is None:
            template = self.template_prompt_engineer
        return ChatPromptTemplate.from_template(template)
        
    def llm(self,model=None):
        if model is None:
            model = self.model
        return ChatOpenAI(openai_api_key=self.openai_api_key, model=model)
        
    def parser(self):
        return StrOutputParser()
    def chain_prompt_eng(self,query):
        chaining_eng =  (
        {
         "question": RunnablePassthrough()}
        | self.prompt_eng()
        | self.llm()
        | self.parser())
        return chaining_eng.invoke(query)

    def chain(self,query):
        #complete_query = self.prompt().format(context=self.provide_context(query),question=query)
        #response = self.llm().invoke(complete_query)
        #return self.parser().invoke(response)
        chaining = (
        {"context": PineconeVectorStore.from_existing_index(index_name=self.pinecone_index,embedding=self.embeddings).as_retriever(search_type='similarity',
                search_kwargs={
                'k': 10}), 
         "question": RunnablePassthrough()}
        | self.prompt()
        | self.llm()
        | self.parser())
        #query=str(self.prompt_eng(query))
        return chaining.invoke(query)
    

   
    def chain1(self, query):
        # Initialize the retriever using an existing Pinecone index with specified embeddings
     
        retriever = PineconeVectorStore.from_existing_index(
        index_name=self.pinecone_index,
        embedding=self.embeddings
    ).as_retriever(
        search_type='similarity',
        search_kwargs={'k': 10}  # Retrieve top 10 similar results
    )

    # Invoke the retriever with the query and process metadata
        retrieved_items = retriever.invoke(query)
        metadata = []
        for item in retrieved_items:                # Extract and evaluate the nested metadata string if it exists
            meta_string = item.metadata['youtube_reponse_metadata']
            metadata.append(meta_string) 
        
        # Combine retrieved metadata into a dictionary
        dic = {i: meta for i, meta in enumerate(metadata)}
    
        # Create a chaining operation where metadata is included as context
       
        chaining = (
            {"context": retriever, 
             "metadata": RunnablePassthrough(dic),
             "question": RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )
    
        # Invoke the complete chain with the initial query
        return chaining.invoke(query)

    

    def chain2(self, query):
        # Initialize the retriever using an existing Pinecone index with specified embeddings
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 10}  # Retrieve top 10 similar results
        )
      
        # Invoke the retriever with the query and process metadata
        retrieved_items = retriever.invoke(query)
        metadata = []
        for item in retrieved_items:
            content=item.page_content
            meta = ast.literal_eval(item.metadata['youtube_response_metadata'])['snippet']['channelTitle']
            context_entry = {
                "text": content,
                "source": meta
            }
            metadata.append(context_entry)
    
        # Combine retrieved metadata into a dictionary
        dic = {f'Context {i}': context for i, context in enumerate(metadata)}


    
        # Use a lambda function for passing context and question to prompt
        context_and_question = RunnablePassthrough(lambda: {'context': dic ,'question': query})
    
        # Create a sequence of operations
        # Assuming your self.prompt(), self.llm(), and self.parser() are methods that handle their respective parts
        result = {'context': RunnablePassthrough(lambda x : dic) ,'question': RunnablePassthrough()} | self.prompt() | self.llm() | self.parser()
    
        # Invoke the complete chain with the initial query
        return result.invoke(query)


    def chain3(self, query):
        # Initialize the retriever using an existing Pinecone index with specified embeddings
        retriever = PineconeVectorStore.from_existing_index(
            index_name=self.pinecone_index,
            embedding=self.embeddings
        ).as_retriever(
            search_type='similarity',
            search_kwargs={'k': 5}  # Retrieve top 10 similar results
        )
      
        # Invoke the retriever with the query and process metadata
        retrieved_items = retriever.invoke(query)
        metadata = []
        comments = []
        youtube_urls =[]
        contexts=[]
       
        
        for item in retrieved_items:
            yt_url = item.metadata['youtube_video_url']
            content=item.page_content
            comments=item.metadata["youtube_comments"]
            meta = ast.literal_eval(item.metadata['youtube_response_metadata'])['snippet']['channelTitle']
            context_entry = {
                "text": content,
                "source": meta,
                "public_opinion":comments
            }
            metadata.append(context_entry)
            comments.extend(item.metadata.get('youtube_comments', []))
            youtube_urls.append(yt_url)
            contexts.append(content)

        comments_str = "\n".join(comments)
        dic = {f'Context {i}': context for i, context in enumerate(metadata)}
        # Use a lambda function for passing context and question to prompt
        context_and_question = RunnablePassthrough(lambda: {'context': dic ,'question': query})
        
        # Create a sequence of operations
        # Assuming your self.prompt(), self.llm(), and self.parser() are methods that handle their respective parts
        result = {'context': RunnablePassthrough(lambda x : dic) ,'question': RunnablePassthrough()} | self.prompt() | self.llm() | self.parser()
        
        comments_runnable = RunnableLambda(lambda _: comments)
        sentiment_chain = (
            {
                "context": retriever,
                "comments": comments_runnable,
                "question": RunnablePassthrough()
            }
            | self.prompt_tagging() #uses self.template_classification
            | self.llm_tagging() #for structured output
        )

        dic_sentiment = sentiment_chain.invoke(query).dict()

        sentiment = dic_sentiment['sentiment']
        aggresiveness = dic_sentiment['aggressiveness']
        political_tendency=dic_sentiment['political_tendency']
        # Invoke the complete chain with the initial query

        fig,ax=self.analyze_and_plot_sentiment(contexts)
        return result.invoke(query),sentiment,aggresiveness,political_tendency, youtube_urls,ax,fig


    def analyze_and_plot_sentiment(self,contexts):
        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
        
        list_sent = []
        data_sent = pd.DataFrame({'labels': []})
        
        # Analyze sentiment for each context
        for item in contexts:
            list_sent.append(sentiment_analyzer(item)[0]["label"])
        
        data_sent['labels'] = list_sent
        
        plt.style.use('seaborn-v0_8-dark')
        
        # Count the sentiment labels
        sentiment_counts = data_sent['labels'].value_counts()
        
        # Create a figure and a set of subplots
        fig, ax = plt.subplots(figsize=(8, 6))
        
        # Define colors consistent with the seaborn-dark theme
        colors = ['#4c72b0' if label == 'positive' else '#c44e52' if label == 'negative' else '#dd8452' for label in sentiment_counts.index]
        
        # Plot the sentiment counts as a bar plot
        ax.bar(sentiment_counts.index, sentiment_counts.values, color=colors)
        
        # Customize the plot
        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Number of Chunks')
        #ax.set_title('Sentiment Analysis of Context Chunks')
        ax.set_xticks(range(len(sentiment_counts)))
        ax.set_xticklabels(sentiment_counts.index, rotation=0)
        ax.grid(True)
        
        # Set the face color of the figure and axes to white
        fig.patch.set_facecolor('white')
        ax.set_facecolor('white')
        
        # Adjust layout
        plt.tight_layout()
        
        # Close the figure to prevent it from displaying automatically
        plt.close(fig)
        
        return fig, ax
            

    
     

In [78]:
bot = FinFeedRAG(pinecone_index='news-data')
# bot1= FinFeedRAG(pinecone_index='latest-news')
pine= bot.initialize_pinecone()
# pine1= bot1.initialize_pinecone()

In [80]:
directory_path = "data/text/"
for filename in os.listdir(directory_path):
    if filename.endswith(('.txt')):
        # Perform your desired actions here
        print(directory_path + filename)
        bot.upload_to_vb(directory_path + filename,embeddings=OpenAIEmbeddings(openai_api_key=openai_api_key),
                         chunksize=1000,chunkoverlap=100,index='news-data', preprocess_yt= True)


data/text/#uk_liberal_democrats_leader_falls_off_paddle_board.txt
data/text/3_places_to_invest_your_money_right_now_cashing_in_on_a_rally.txt
data/text/47%_of_american_workers_feel_financially_well_report.txt
data/text/abukarsh_every_central_bank_wants_to_take_right_step.txt
data/text/airfares_peaking_as_travelers_in_europe,_asia_seek_savings__reuters.txt
data/text/ai_weekly_musk_startup_generates_$6_billion_in_fresh_funding__reuters.txt
data/text/allcargo_on_why_supply_chain_outlook_is_looking_up.txt
data/text/americans_are_delaying_home_remodeling_amid_high_borrowing_costs.txt
data/text/annual_sales_of_weight-loss_drugs_seen_hitting_$150_billion__reuters.txt
data/text/apples_china_iphone_shipments_up_52%.txt
data/text/apple_sees_big_iphone_rebound_in_china.txt
data/text/apple_turns_it_around,_trump_trial_closing_arguments,_israel_strikes_rafah_#bloombergbrief_#shorts.txt
data/text/armenia_protests_calls_for_pm_nikol_pashinyan_to_resign.txt
data/text/artists_from_gaza_express_palestin

In [81]:
print(pine.describe_index_stats())
# print(pine1.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2193}},
 'total_vector_count': 2193}


In [82]:
bot.get_all_vector_ids(index=pine)

['75d42c28-8cfc-4b48-96d3-a99ebde5b4db',
 '4d81c2a0-10f6-40be-9cb4-18e0b32eeedf',
 '74a348bc-601e-42a6-81e8-a492d8d53741',
 'a1abfa3d-becf-4ad6-8fcc-1d9c688f7b02',
 '63f64647-0412-49f0-a200-b0a50ad4d32d',
 '89b51932-8957-4a6c-8271-30900bb44f43',
 '95abdcb0-8f90-41cc-b5d7-2146208294cf',
 '041b8570-e19e-45c9-a173-372c2658e794',
 '3015091a-68c4-49e7-a3af-f8949221011a',
 '25c09050-b9b4-46da-ac27-ddd86885e06f',
 'af836032-e985-4fde-8720-a091f80532df',
 'd17ce62f-fda3-4e88-94ed-7973bd172b80',
 '832a58a7-fed7-4a9f-861f-309c83ab8b28',
 '0a389a72-bb2d-4d01-86db-3b87ebd1d2de',
 '5b5b2339-9448-4f3b-b3f8-88731b0c1e3b',
 '472ba3b2-f009-459b-b314-ce65028870db',
 'dce48bda-7f05-45cd-a22f-4ec3f12cbf34',
 '02aea409-272f-4c49-a9c6-e11ce775e577',
 '71f79897-cc08-476c-8348-8b6c61d3ebcc',
 '3f59af10-a4fd-4a4e-8579-d8ea95a6298a',
 '1b758784-6da5-400a-b18c-88e2bc867e6e',
 '0085f059-d3c8-45e1-b7af-76b4219bb201',
 '0d016c9d-c6db-4eb2-963f-13cc8e485045',
 'bf5ccf20-6d5a-4a58-85bd-31c54243cbde',
 'c65c382b-9512-

In [83]:
bot.insert_youtube_metadata(index= pine)
