In [25]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import Pinecone,  PodSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnablePassthrough
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline
# load_dotenv("API_KEYS")

In [43]:
from openai import OpenAI

In [50]:
# Pull the secrets stored in the local API_KEYS.env file into the process environment.
load_dotenv("API_KEYS.env")

# Build the OpenAI client from the key that is now available in the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [52]:
# Same env-file load as the previous cell, re-run on its own; the recorded output for this cell is True.
load_dotenv("API_KEYS.env")

True

In [69]:
# Read both service credentials from the environment (each is None when its variable is unset).
openai_api_key, pinecone_api_key = (
    os.environ.get(variable_name)
    for variable_name in ('OPENAI_API_KEY', 'PINECONE_API_KEY')
)

In [73]:

class FinFeedRAG:
    """Retrieval-augmented question answering over text transcripts stored in Pinecone.

    The class bundles four concerns:

    * cleaning transcript text files and uploading their chunks to a Pinecone index,
    * retrieving similar chunks for a query,
    * answering a question with an OpenAI chat model using the retrieved chunks as context,
    * scoring the sentiment of chunks with a financial-news sentiment model.

    Parameters
    ----------
    pinecone_index : str
        Name of the Pinecone index to read from and write to.
    pine_cone_api_key : str, optional
        Pinecone API key. Defaults to the PINECONE_API_KEY environment variable.
    openai_api_key : str, optional
        OpenAI API key. Defaults to the OPENAI_API_KEY environment variable.
    embeddings_model : optional
        Embeddings object used for uploads and queries. When omitted, an
        ``OpenAIEmbeddings`` instance is created with this instance's OpenAI key.
    model : str, optional
        Chat model name passed to ``ChatOpenAI``.
    """

    # Number of documents the LangChain retriever returns per query.
    RETRIEVER_TOP_K = 10
    # Hugging Face model used by `pipe` / `get_sentiment`.
    SENTIMENT_MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

    def __init__(self, pinecone_index, pine_cone_api_key=None, openai_api_key=None, embeddings_model=None, model='gpt-3.5-turbo'):
        # Defaults are resolved here rather than in the signature: signature defaults are
        # evaluated once at class-definition time, which froze the keys and built an
        # embeddings client before any instance existed.
        if openai_api_key is None:
            openai_api_key = os.getenv('OPENAI_API_KEY')
        if pine_cone_api_key is None:
            pine_cone_api_key = os.getenv('PINECONE_API_KEY')
        self.openai_api_key = openai_api_key
        self.api_key_pinecone = pine_cone_api_key
        self.pinecone_index = pinecone_index
        # Raw Pinecone index handle; populated lazily by initialize_pinecone().
        self.vector_db = None
        if embeddings_model is None:
            embeddings_model = OpenAIEmbeddings(openai_api_key=self.openai_api_key)
        self.embeddings = embeddings_model
        self.model = model
        # Sentiment pipeline is loaded on first use and then reused (see `pipe`).
        self._sentiment_pipeline = None
        self.template = """
                Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                If you can't answer the question, reply "I do not have enough information to answer this question".
                
                Context: {context}
                
                Question: {question}
                """

    def initialize_pinecone(self):
        """Connect to the configured Pinecone index (once) and return the index handle."""
        if self.vector_db is None:  # Only connect on the first call
            pc = Pinecone(api_key=self.api_key_pinecone)
            self.vector_db = pc.Index(self.pinecone_index)
        return self.vector_db

    def preprocess_youtube_text(self, text_file, chunksize, chunkoverlap):
        """Clean a transcript file in place and split it into chunks.

        The file at `text_file` is overwritten with its cleaned text (see
        `preprocess_input`) before it is loaded and split.

        Parameters
        ----------
        text_file : str
            Path of the text file to clean and split.
        chunksize : int
            `chunk_size` passed to RecursiveCharacterTextSplitter.
        chunkoverlap : int
            `chunk_overlap` passed to RecursiveCharacterTextSplitter.

        Returns
        -------
        list
            The documents produced by the splitter.
        """
        self.preprocess_input(text_file, save_back_to_file=True)

        text_documents = TextLoader(text_file).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunksize, chunk_overlap=chunkoverlap)
        return splitter.split_documents(text_documents)

    def upload_to_vb(self, text, embeddings, chunksize, chunkoverlap, index=None):
        """Clean, split, embed and upload one text file to a Pinecone index.

        Parameters
        ----------
        text : str
            Path of the text file to upload (it is rewritten in place with its cleaned text).
        embeddings
            Embeddings object used for the upload; pass None to use the instance's embeddings.
        chunksize, chunkoverlap : int
            Splitter settings, see `preprocess_youtube_text`.
        index : str, optional
            Target index name; defaults to the instance's index.

        Returns
        -------
        PineconeVectorStore
            The store returned by `PineconeVectorStore.from_documents`.
        """
        if index is None:
            index = self.pinecone_index
        # Previously this argument was accepted but silently ignored.
        if embeddings is None:
            embeddings = self.embeddings
        documents = self.preprocess_youtube_text(text, chunksize, chunkoverlap)
        return PineconeVectorStore.from_documents(documents, embeddings, index_name=index)

    def _clean_text(self, text):
        """Lowercase `text`, strip punctuation and drop English stop words."""
        lowered = text.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        # Build the stop-word set once instead of re-reading the list for every token.
        stop_words = set(stopwords.words('english'))
        return ' '.join(token for token in word_tokenize(without_punctuation) if token not in stop_words)

    def preprocess_input(self, text_file, save_back_to_file=True):
        """Clean the contents of a text file.

        Parameters
        ----------
        text_file : str
            Path of the file to read.
        save_back_to_file : bool, optional
            When true, the cleaned text replaces the file's contents and nothing is
            returned. When false, the file is left untouched and the cleaned text is returned.

        Returns
        -------
        str or None
            The cleaned text when `save_back_to_file` is false, otherwise None.
        """
        with open(text_file, 'r') as file:
            final_text = self._clean_text(file.read())
        if save_back_to_file:
            # Write the processed content back, replacing the original
            with open(text_file, 'w') as file:
                file.write(final_text)
            return None
        return final_text

    def most_common(self, input_text_file, most_common=10):
        """Build a query string from the most frequent words of a text file.

        Parameters
        ----------
        input_text_file : str
            Path of the file to analyse (the file is not modified).
        most_common : int, optional
            How many of the most frequent words to keep.

        Returns
        -------
        str
            The selected words joined by single spaces, most frequent first.
        """
        processed_text = self.preprocess_input(input_text_file, save_back_to_file=False)
        top_words = Counter(processed_text.split()).most_common(most_common)
        return ' '.join(word for word, _ in top_words)

    def retrieve_embeddings(self, query, most_similar=2):
        """Return the stored vectors of the entries most similar to `query`.

        Parameters
        ----------
        query : str
            Text that is embedded and used as the query vector.
        most_similar : int, optional
            `top_k` passed to the index query.

        Returns
        -------
        list
            The 'values' of each matched vector, in match order.

        Raises
        ------
        AssertionError
            If `initialize_pinecone` has not been called yet.
        """
        assert self.vector_db is not None, "Initialize Pinecone first"
        query_result = self.vector_db.query(vector=self.embeddings.embed_query(query), top_k=most_similar)
        ids = [item['id'] for item in query_result['matches']]
        if not ids:
            return []
        # One fetch for all ids (previously the full fetch was repeated once per id).
        vectors = self.vector_db.fetch(ids)['vectors']
        return [vectors[vector_id]['values'] for vector_id in ids]

    def _retriever(self, index=None):
        """Build a similarity retriever over `index` (defaults to the instance's index)."""
        if index is None:
            index = self.pinecone_index
        store = PineconeVectorStore.from_existing_index(index_name=index, embedding=self.embeddings)
        return store.as_retriever(search_type='similarity', search_kwargs={'k': self.RETRIEVER_TOP_K})

    def provide_context(self, query, index=None, most_similar=2):
        """Retrieve the documents used as LLM context for `query`.

        Parameters
        ----------
        query : str
            The question to find context for.
        index : str, optional
            Index name; defaults to the instance's index.
        most_similar : int, optional
            Accepted for compatibility but currently not applied: the retriever
            always returns RETRIEVER_TOP_K (10) documents.

        Returns
        -------
        list
            The documents returned by the retriever.
        """
        return self._retriever(index).invoke(query)

    def prompt(self, template=None):
        """Return a ChatPromptTemplate built from `template` (defaults to the instance template)."""
        if template is None:
            template = self.template
        return ChatPromptTemplate.from_template(template)

    def llm(self, model=None):
        """Return a ChatOpenAI client for `model` (defaults to the instance's model)."""
        if model is None:
            model = self.model
        return ChatOpenAI(openai_api_key=self.openai_api_key, model=model)

    def parser(self):
        """Return the output parser used at the end of the chain."""
        return StrOutputParser()

    def chain(self, query):
        """Answer `query` with the LLM, using retrieved documents as context.

        The retriever fills the prompt's {context}, the query itself fills
        {question}, and the model output is parsed to a string.
        """
        chaining = (
            {"context": self._retriever(), "question": RunnablePassthrough()}
            | self.prompt()
            | self.llm()
            | self.parser()
        )
        return chaining.invoke(query)

    def pipe(self, chunk):
        """Run the sentiment model on `chunk` and return the pipeline's output.

        The pipeline is created on first use and cached on the instance, so the
        model is not reloaded for every chunk.
        """
        classifier = getattr(self, '_sentiment_pipeline', None)
        if classifier is None:
            classifier = pipeline("text-classification", model=self.SENTIMENT_MODEL)
            self._sentiment_pipeline = classifier
        return classifier(chunk)

    def get_sentiment(self, chunks, neutrality_threshdold=0.3):
        """Gets the compound sentiment of the chunks based on their individual sentiment
        Parameters
        ----------
        chunks : list
            List of text chunks; each item must expose its text as `page_content`
        neutrality_threshdold : float, optional
            A hyperparameter neutrality_threshdold tunes how certain we need to be of a sentiment to classify it as positive or negative
            (If neutrality_threshdold=1, any list of chunks will result in a neutral sentiment
            If neutrality_threshdold=0, any list of chunks will be classified as positive or negative)
        Returns
        -------
        tuple
            (label, sentiments) where label is 'positive', 'neutral' or 'negative' and
            sentiments is the list of per-chunk model outputs (dicts with 'label' and 'score').
            An empty `chunks` list yields ('neutral', []).
        """
        # Assign a numerical value to each sentiment to simplify calculations
        sentiment_values = {'positive': 1, 'neutral': 0, 'negative': -1}
        # Run each chunk through the sentiment model (first result per chunk)
        sentiments = [self.pipe(chunk.page_content)[0] for chunk in chunks]
        if not sentiments:
            # Nothing to average; avoid dividing by zero.
            return ('neutral', sentiments)
        # Signed score per chunk: model confidence times the value of its label
        sentiment_scores = [sentiment['score'] * sentiment_values[sentiment['label']] for sentiment in sentiments]
        avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)
        if avg_sentiment_score >= neutrality_threshdold:
            return ('positive', sentiments)
        if avg_sentiment_score <= -neutrality_threshdold:
            return ('negative', sentiments)
        return ('neutral', sentiments)
            

     

In [226]:
# Create the RAG helper bound to the 'day1' Pinecone index; keys, embeddings and chat model use the class defaults.
bot = FinFeedRAG(pinecone_index='day1')

In [227]:
# Open the raw Pinecone index connection (also stored on bot.vector_db) and keep a handle to it.
pine = bot.initialize_pinecone()

In [228]:
# Sanity check: show the index dimension and vector counts.
pine.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4367}},
 'total_vector_count': 4367}

In [66]:

directory_path = "data/text/"
# Preview only: inspect the first two directory entries and echo the ones that are .txt files.
for filename in os.listdir(directory_path)[:2]:
    if not filename.endswith('.txt'):
        continue
    print(f"{directory_path}{filename}")


data/text/'comfortable'_with_front_end_of_treasury_curve_allspring's_bory.txt
data/text/11_million_immigrants_in_us_unlawfully_at_last_count_mayorkas.txt


In [78]:
directory_path = "data/text/"
# Build the embeddings client once; it is identical for every file, so there is no
# reason to construct a new one on each iteration.
upload_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# NOTE: upload_to_vb rewrites each file in place with its cleaned text (via preprocess_input)
# and then uploads its chunks to the index. Re-running this cell presumably uploads the
# chunks again -- verify before re-executing.
for filename in os.listdir(directory_path):
    if filename.endswith('.txt'):
        print(directory_path + filename)
        bot.upload_to_vb(
            directory_path + filename,
            embeddings=upload_embeddings,
            chunksize=200,
            chunkoverlap=20,
            index='day1',
        )

data/text/'comfortable'_with_front_end_of_treasury_curve_allspring's_bory.txt
data/text/11_million_immigrants_in_us_unlawfully_at_last_count_mayorkas.txt
data/text/active_bond_investors_getting_a_taste_of_'nirvana'.txt
data/text/a_rebound_in_box_shipments_is_unlikely_in_2024.txt
data/text/biden_and_trump_agree_to_debate_what_to_expect_#shorts.txt
data/text/biden_blocks_release_of_interview_tapes_on_classified_papers__balance_of_power.txt
data/text/biggest_saudi_ipo_of_the_year_draws_$91_billion_in_orders.txt
data/text/blackrocksrieder_federal_reserve_rate_cuts_needed_to_tame_inflation.txt
data/text/bloomberg_brief_(05172024).txt
data/text/bloomberg_markets_asia_05172024.txt
data/text/bloomberg_real_yield_05172024.txt
data/text/bloomberg_surveillance_05172024.txt
data/text/bloomberg_the_open_05172024.txt
data/text/bny_mellon_ceo_on_pushing_bank's_240-year_legacy_forward.txt
data/text/booking_holdings_versus_disney_why_only_one_is_a_buy.txt
data/text/china's_mortgage_bazooka__daybreak_eu

In [85]:
# Confirm that `bot` is a live FinFeedRAG instance (shows its default repr).
bot

<__main__.FinFeedRAG at 0x177bb963260>

In [229]:
# End-to-end check: retrieve context from the index and let the chat model answer the question.
bot.chain("give me 3 bullet points on China's economy.")

"- China is trying to shift its economy from investment-led growth to consumer-led growth.\n- There is a significant focus on diversifying away from China's economy due to geopolitical shifts.\n- The Chinese government is taking steps to intervene in the housing market to stimulate the domestic economy."

In [230]:
# Inspect the raw documents retrieved for the same question.
# NOTE(review): provide_context does not use `most_similar`; its retriever is built with a fixed k of 10.
bot.provide_context("give me 3 bullet points on China's economy.",most_similar=5)

[Document(page_content='last last days know really speaks way developed market economies changed shifted geopolitics therefore economic policies governments talking political economies theres one stands today well china', metadata={'source': "data/text/inflation_fears,_richemont's_new_'bos'__bloomberg_markets_today_0517.txt", 'youtube_reponse_metadata': "{'id': {'kind': 'youtube#video', 'videoId': 'HY583QRI5v0'}, 'snippet': {'publishedAt': '2024-05-17T11:21:26Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Inflation Fears, Richemont&#39;s New &#39;Bos&#39; | Bloomberg Markets Today 05/17', 'channelTitle': 'Bloomberg Television'}}"}),
 Document(page_content='pressing forward move diversify away chinas economy lot domestic interest beijing trying navigate well hope fly wall conversations asia government economy correspondent rebecca chung wilkins near', metadata={'source': "data/text/fed's_'higher_for_longer'_message_tumbles_markets__horizons_middle_east_&_africa_05162024.txt", 'y

### Aryama's work: adding YouTube metadata to the Pinecone index

In [99]:
# Direct Pinecone client and a handle on the "day1" index, used by the metadata cells below.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("day1")

In [110]:
# Show the index dimension and vector counts before touching any metadata.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4367}},
 'total_vector_count': 4367}

In [111]:
# NOTE(review): this import sits mid-notebook; it belongs with the other imports in the first cell.
import pandas as pd
# Load the DataFrame that is later indexed by 'txt_file_path' and 'response_metadata'.
# NOTE(review): unpickling can execute arbitrary code -- only load final_df.pkl if it was produced locally / is trusted.
final_df= pd.read_pickle("final_df.pkl")

In [153]:

def get_ids_from_query(index, input_vector, top_k=10000):
    """Return the ids of the vectors matched by a similarity query.

    Parameters
    ----------
    index
        Pinecone index handle (anything exposing a compatible ``query`` method).
    input_vector : list of float
        Query vector; its length must equal the index dimension.
    top_k : int, optional
        Maximum number of matches to request. The default of 10000 is meant to
        cover every vector in a small index; larger indexes will be truncated.

    Returns
    -------
    list of str
        The matched ids in the order the query returned them (duplicates kept).
    """
    # Vector values are not needed for listing ids, so they are not requested.
    results = index.query(vector=input_vector, top_k=top_k, include_values=False)
    return [match['id'] for match in results['matches']]

# Usage: list the ids stored in the "day1" index by querying it with an all-zero vector.
my_index = pc.Index("day1")
all_ids = get_ids_from_query(my_index, [0.0] * 1536)  # 1536 = index dimension reported by describe_index_stats()
# Peek at the first three ids only.
all_ids[0:3]

['dc35a55d-f7af-4689-bbc3-d4e38dad6eab',
 '9a40d91f-894b-4ef2-9468-4d0af3d041e0',
 'a30fce25-c63f-45b6-97de-8f4886ac9f9a']

In [210]:
# Inspect the full stored record (values and metadata) of the first id.
index.fetch([all_ids[0]])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'dc35a55d-f7af-4689-bbc3-d4e38dad6eab': {'id': 'dc35a55d-f7af-4689-bbc3-d4e38dad6eab',
                                                      'metadata': {'source': 'data/text/china_regulators_set_to_discuss_property_aid_with_banks.txt',
                                                                   'text': 'loans '
                                                                           'state '
                                                                           'banks '
                                                                           'course '
                                                                           'lot '
                                                                           'details '
                                                                           'yet '
                                                                           'sketched '
                             

In [224]:
# Build the source-path -> YouTube metadata lookup once, instead of re-normalising and
# re-scanning the whole DataFrame for every vector.
# Backslashes are converted to forward slashes so the paths match the 'source' values stored in Pinecone.
metadata_by_source = {}
for txt_path, response_metadata in zip(
    final_df['txt_file_path'].apply(lambda x: x.replace('\\', '/')),
    final_df['response_metadata'],
):
    # setdefault keeps the first row per path, matching the earlier `.values[0]` lookup.
    metadata_by_source.setdefault(txt_path, response_metadata)

# `vector_id` rather than `id`: a top-level loop variable named `id` would shadow the
# builtin for every later cell.
for vector_id in all_ids:
    old_metadata = index.fetch([vector_id])['vectors'][vector_id]['metadata']
    source = old_metadata['source']
    if source not in metadata_by_source:
        raise KeyError(f"No row in final_df matches source {source!r} (vector {vector_id})")
    index.update(
        id=vector_id,
        # The metadata is stored as its string form.
        # NOTE(review): the key is spelled 'youtube_reponse_metadata' (sic). It is kept
        # as-is so new writes stay consistent with what is already stored in the index.
        set_metadata={'youtube_reponse_metadata': str(metadata_by_source[source])},
        namespace="",
    )

In [214]:
# Fetch the current metadata of the first vector; its 'source' path is used to look up the matching final_df row.
old_metadata = index.fetch([all_ids[0]])['vectors'][all_ids[0]]['metadata']
# (An earlier, now disabled, variant assigned the metadata dict directly under the key
# 'youtube_response_metadata' instead of writing its string form through index.update.)


In [220]:
# Single-vector trial of the metadata update: attach the YouTube metadata (as a string)
# of the final_df row whose normalised path equals this vector's 'source'.
normalised_paths = final_df['txt_file_path'].apply(lambda x : x.replace('\\','/'))
matching_rows = final_df[normalised_paths == old_metadata['source']]
index.update(
    id=all_ids[0],
    set_metadata={'youtube_reponse_metadata': str(matching_rows['response_metadata'].values[0])},
    namespace="",
)

{}

In [221]:
# Verify the update: the first vector's metadata should now contain 'youtube_reponse_metadata'.
index.fetch([all_ids[0]])['vectors'][all_ids[0]]['metadata']

{'empty': 'empty',
 'source': 'data/text/china_regulators_set_to_discuss_property_aid_with_banks.txt',
 'text': 'loans state banks course lot details yet sketched well watching big question big scale going going finance recall early 2023 government roll something similar smaller scale extended 100 billion yuan',
 'youtube_reponse_metadata': "{'id': {'kind': 'youtube#video', 'videoId': '0f8nNqcCNEU'}, 'snippet': {'publishedAt': '2024-05-17T00:06:24Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'China Regulators Set to Discuss Property Aid With Banks', 'channelTitle': 'Bloomberg Television'}}"}

In [205]:
# Same metadata lookup, from a lower execution count (In [205]) than the update cells above it;
# its recorded output shows a nested dict under 'youtube_response_metadata'.
index.fetch([all_ids[0]])['vectors'][all_ids[0]]['metadata']

{'source': 'data/text/china_regulators_set_to_discuss_property_aid_with_banks.txt',
 'text': 'loans state banks course lot details yet sketched well watching big question big scale going going finance recall early 2023 government roll something similar smaller scale extended 100 billion yuan',
 'youtube_response_metadata': {'id': {'kind': 'youtube#video',
   'videoId': '0f8nNqcCNEU'},
  'snippet': {'publishedAt': '2024-05-17T00:06:24Z',
   'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg',
   'title': 'China Regulators Set to Discuss Property Aid With Banks',
   'channelTitle': 'Bloomberg Television'}}}