In [67]:
#!pip install transformers
#!pip install nltk


import os
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import Pinecone,  PodSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnablePassthrough
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline

#load_dotenv("API_KEYS")

In [3]:
from openai import OpenAI

In [5]:
# Pull the key/value pairs stored in API_KEYS.env into the process environment.
load_dotenv(dotenv_path="API_KEYS.env")

# Build the OpenAI client from the key exposed through the environment
# (the lookup yields None when OPENAI_API_KEY is not set).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [6]:
load_dotenv("API_KEYS.env")

True

In [7]:
# Module-level copies of the two service keys; each is None when its
# environment variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

In [8]:

class FinFeedRAG:
    """Retrieval-augmented question answering over transcripts stored in Pinecone.

    The class bundles the steps used in this notebook: cleaning and chunking
    transcript text files, uploading the chunks to a Pinecone index, retrieving
    context for a question, answering through a chat model with a
    "news reporter" prompt, and scoring chunk sentiment.
    """

    # Number of chunks the retriever places into the prompt context in ``chain``.
    DEFAULT_CONTEXT_K = 10
    # Hugging Face model used for financial-news sentiment classification.
    SENTIMENT_MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

    def __init__(self, pinecone_index, pine_cone_api_key=None, openai_api_key=None, embeddings_model=None, model='gpt-3.5-turbo'):
        """Store configuration; no network connection is opened here.

        Parameters
        ----------
        pinecone_index : str
            Name of the Pinecone index to read from / write to.
        pine_cone_api_key : str, optional
            Pinecone API key. Falls back to the ``PINECONE_API_KEY`` environment variable.
        openai_api_key : str, optional
            OpenAI API key. Falls back to the ``OPENAI_API_KEY`` environment variable.
        embeddings_model : optional
            Embeddings object exposing ``embed_query``. When omitted, an
            ``OpenAIEmbeddings`` is created with this instance's OpenAI key.
        model : str, optional
            Chat model name passed to ``ChatOpenAI``.
        """
        # Defaults are resolved at call time (not when the class is defined), so
        # keys loaded later are picked up and defining the class never needs a key.
        if openai_api_key is None:
            openai_api_key = os.getenv('OPENAI_API_KEY')
        if pine_cone_api_key is None:
            pine_cone_api_key = os.getenv('PINECONE_API_KEY')
        self.openai_api_key = openai_api_key
        self.api_key_pinecone = pine_cone_api_key
        self.pinecone_index = pinecone_index
        # Pinecone index handle; created lazily by initialize_pinecone().
        self.vector_db = None
        if embeddings_model is None:
            # Built from the key given to this instance rather than a module global.
            embeddings_model = OpenAIEmbeddings(openai_api_key=self.openai_api_key)
        self.embeddings = embeddings_model
        self.model = model
        # Sentiment classifier; loaded on first use by pipe().
        self._sentiment_pipeline = None
        self.template = """
                Answer the question based on the context below but pretend like you are a news reporter who just received the context as the latest news. 
                If you can't answer the question, reply "I do not have enough information to answer this question".
                
                Context: {context}
                
                Question: {question}
                """

    def initialize_pinecone(self):
        """Connect to the configured Pinecone index (once) and return the index handle."""
        if self.vector_db is None:  # Reuse the existing connection if there is one
            pc = Pinecone(api_key=self.api_key_pinecone)
            self.vector_db = pc.Index(self.pinecone_index)
        return self.vector_db

    def preprocess_youtube_text(self, text_file, chunksize, chunkoverlap):
        """Clean ``text_file`` in place, then load it and split it into chunks.

        Note that the file on disk is overwritten with its cleaned version
        before it is loaded.

        Returns
        -------
        list
            The documents produced by ``RecursiveCharacterTextSplitter.split_documents``.
        """
        self.preprocess_input(text_file, save_back_to_file=True)

        loader = TextLoader(text_file)
        text_documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunksize, chunk_overlap=chunkoverlap)
        return splitter.split_documents(text_documents)

    def upload_to_vb(self, text, embeddings, chunksize, chunkoverlap, index=None):
        """Preprocess, chunk and upload one text file to the vector store.

        Parameters
        ----------
        text : str
            Path of the text file to upload (it is cleaned in place first).
        embeddings :
            Embeddings object used for the upload; ``None`` selects the
            instance's own embeddings.
        chunksize, chunkoverlap : int
            Forwarded to the text splitter.
        index : str, optional
            Target index name; defaults to the instance's index.
        """
        if index is None:
            index = self.pinecone_index
        if embeddings is None:
            embeddings = self.embeddings
        documents = self.preprocess_youtube_text(text, chunksize, chunkoverlap)
        return PineconeVectorStore.from_documents(documents, embeddings, index_name=index)

    @staticmethod
    def _clean_text(text):
        """Lower-case ``text``, strip punctuation and drop English stop words."""
        lowered = text.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        # Build the stop-word set once instead of re-reading the list for every token.
        english_stopwords = set(stopwords.words('english'))
        kept_tokens = [
            token for token in word_tokenize(without_punctuation)
            if token.lower() not in english_stopwords
        ]
        return ' '.join(kept_tokens)

    def preprocess_input(self, text_file, save_back_to_file=True):
        """Clean the contents of ``text_file``.

        Parameters
        ----------
        text_file : str
            Path of the file to read (opened with the platform default encoding).
        save_back_to_file : bool, optional
            When true, the cleaned text replaces the original file contents.

        Returns
        -------
        str
            The cleaned text (returned in both modes).
        """
        with open(text_file, 'r') as file:
            raw_text = file.read()
        final_text = self._clean_text(raw_text)
        if save_back_to_file:
            # Overwrites the source file with the processed content.
            with open(text_file, 'w') as file:
                file.write(final_text)
        return final_text

    def most_common(self, input_text_file, most_common=10):
        """Return the ``most_common`` most frequent cleaned words of a file, space-joined."""
        processed_text = self.preprocess_input(input_text_file, save_back_to_file=False)
        word_freq = Counter(processed_text.split())
        # Frequency is used as a proxy for relevance when forming the query.
        common_words = word_freq.most_common(most_common)
        return ' '.join(word for word, _ in common_words)

    def retrieve_embeddings(self, query, most_similar=2):
        """Return the stored vectors of the ``most_similar`` nearest matches to ``query``.

        Raises
        ------
        RuntimeError
            If ``initialize_pinecone`` has not been called yet.
        """
        if self.vector_db is None:
            raise RuntimeError("Initialize Pinecone first")
        query_result = self.vector_db.query(vector=self.embeddings.embed_query(query), top_k=most_similar)
        ids = [item['id'] for item in query_result['matches']]
        if not ids:
            return []
        # One fetch for all ids instead of one round trip per id.
        fetched_vectors = self.vector_db.fetch(ids)['vectors']
        return [fetched_vectors[vector_id]['values'] for vector_id in ids]

    def _retriever(self, index, k):
        """Build a similarity retriever over ``index`` that returns ``k`` documents."""
        store = PineconeVectorStore.from_existing_index(index_name=index, embedding=self.embeddings)
        return store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    def provide_context(self, query, index=None, most_similar=2):
        """Return the ``most_similar`` documents most similar to ``query``.

        Parameters
        ----------
        query : str
            Text to search for.
        index : str, optional
            Index name; defaults to the instance's index.
        most_similar : int, optional
            Number of documents to retrieve.
        """
        if index is None:
            index = self.pinecone_index
        return self._retriever(index, most_similar).invoke(query)

    def prompt(self, template=None):
        """Return a ``ChatPromptTemplate`` built from ``template`` (default: the instance template)."""
        if template is None:
            template = self.template
        return ChatPromptTemplate.from_template(template)

    def llm(self, model=None):
        """Return a ``ChatOpenAI`` for ``model`` (default: the instance model)."""
        if model is None:
            model = self.model
        return ChatOpenAI(openai_api_key=self.openai_api_key, model=model)

    def parser(self):
        """Return the output parser that turns the chat model reply into a string."""
        return StrOutputParser()

    def chain(self, query):
        """Answer ``query`` with retrieved context and return the reply as a string."""
        chaining = (
            {
                # The retriever fills {context}; the raw query is passed through as {question}.
                "context": self._retriever(self.pinecone_index, self.DEFAULT_CONTEXT_K),
                "question": RunnablePassthrough(),
            }
            | self.prompt()
            | self.llm()
            | self.parser()
        )
        return chaining.invoke(query)

    def pipe(self, chunk):
        """Classify the sentiment of ``chunk`` and return the raw pipeline output."""
        # Load the model once and reuse it; reloading it per chunk is very slow.
        classifier = getattr(self, '_sentiment_pipeline', None)
        if classifier is None:
            classifier = pipeline("text-classification", model=self.SENTIMENT_MODEL)
            self._sentiment_pipeline = classifier
        return classifier(chunk)

    def get_sentiment(self, chunks, neutrality_threshdold=0.3):
        """Gets the compound sentiment of the chunks based on their individual sentiment
        Parameters
        ----------
        chunks : list
            List of text chunks; each item must expose a ``page_content`` attribute
        neutrality_threshdold : float, optional
            How far the average signed score must be from zero before the result
            is called positive (average >= threshold) or negative
            (average <= -threshold); anything in between is neutral
        Returns
        -------
        tuple
            ``(label, sentiments)`` where ``label`` is 'positive', 'neutral' or
            'negative' and ``sentiments`` is the list of per-chunk model outputs.
            An empty ``chunks`` list yields ``('neutral', [])``.
        """
        # Numerical value per label so scores can be averaged with a sign
        sentiment_values = {'positive': 1, 'neutral': 0, 'negative': -1}
        # Run each chunk through the sentiment model (first result per chunk)
        sentiments = [self.pipe(chunk.page_content)[0] for chunk in chunks]
        if not sentiments:
            # Nothing to average; avoids a division by zero
            return ('neutral', sentiments)
        # Signed score per chunk: model confidence times the label's value
        sentiment_scores = [sentiment['score'] * sentiment_values[sentiment['label']] for sentiment in sentiments]
        avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)
        if avg_sentiment_score >= neutrality_threshdold:
            return ('positive', sentiments)
        elif avg_sentiment_score <= -neutrality_threshdold:
            return ('negative', sentiments)
        else:
            return ('neutral', sentiments)
            

     

In [9]:
bot = FinFeedRAG(pinecone_index='day1')

In [10]:
pine = bot.initialize_pinecone()

In [11]:
pine.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4367}},
 'total_vector_count': 4367}

In [12]:

directory_path = "data/text/"
# Peek at the first two directory entries and show those that are transcripts.
preview_entries = os.listdir(directory_path)[:2]
for filename in preview_entries:
    if not filename.endswith('.txt'):
        continue
    print(directory_path + filename)


data/text/xi_jinping_and_vladimir_putin_vow_to_cooperate_against_us'containment'.txt
data/text/active_bond_investors_getting_a_taste_of_'nirvana'.txt


In [78]:
directory_path = "data/text/"
# Build the embeddings client once; it does not change between files.
upload_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# NOTE: upload_to_vb rewrites each file in place with its cleaned text before
# chunking. Re-running this cell presumably uploads the same chunks again
# (verify against the index stats before re-running).
# Sorted so the upload order is the same on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith('.txt'):
        transcript_path = directory_path + filename
        print(transcript_path)
        bot.upload_to_vb(transcript_path, embeddings=upload_embeddings, chunksize=200, chunkoverlap=20, index='day1')

data/text/'comfortable'_with_front_end_of_treasury_curve_allspring's_bory.txt
data/text/11_million_immigrants_in_us_unlawfully_at_last_count_mayorkas.txt
data/text/active_bond_investors_getting_a_taste_of_'nirvana'.txt
data/text/a_rebound_in_box_shipments_is_unlikely_in_2024.txt
data/text/biden_and_trump_agree_to_debate_what_to_expect_#shorts.txt
data/text/biden_blocks_release_of_interview_tapes_on_classified_papers__balance_of_power.txt
data/text/biggest_saudi_ipo_of_the_year_draws_$91_billion_in_orders.txt
data/text/blackrocksrieder_federal_reserve_rate_cuts_needed_to_tame_inflation.txt
data/text/bloomberg_brief_(05172024).txt
data/text/bloomberg_markets_asia_05172024.txt
data/text/bloomberg_real_yield_05172024.txt
data/text/bloomberg_surveillance_05172024.txt
data/text/bloomberg_the_open_05172024.txt
data/text/bny_mellon_ceo_on_pushing_bank's_240-year_legacy_forward.txt
data/text/booking_holdings_versus_disney_why_only_one_is_a_buy.txt
data/text/china's_mortgage_bazooka__daybreak_eu

In [13]:
bot

<__main__.FinFeedRAG at 0x13d6e3290>

In [14]:
bot.chain("What is happening with Pakistan's economy?")

'I do not have enough information to answer this question.'

In [15]:
bot.provide_context("Give me 10 bullet points about US stock market and worldwide",most_similar=5)

[Document(page_content='foo lets take quick look markets right futures indicating mixed open certainly look across size scale big cap stocks looking modest gains sp futures marginally two points nasa futures one tenth one', metadata={'source': 'data/text/bloomberg_the_open_05172024.txt', 'youtube_reponse_metadata': "{'id': {'kind': 'youtube#video', 'videoId': 'ro1faNCrBXM'}, 'snippet': {'publishedAt': '2024-05-17T16:36:56Z', 'channelId': 'UCIALMKvObZNtJ6AmdCLP7Lg', 'title': 'Bloomberg The Open 05/17/2024', 'channelTitle': 'Bloomberg Television'}}"}),
 Document(page_content='head trying figure right market rally set broaden rest year want bring jack manley jpmorgan asset management global global market strategist jack great see desk talk us youre looking dow 40000', metadata={'source': 'data/text/stocks_tread_water_with_dow_aiming_for_40,000__may_17_yahoo_finance.txt', 'youtube_reponse_metadata': "{'id': {'kind': 'youtube#video', 'videoId': 'jyOVRQIrIpc'}, 'snippet': {'publishedAt': '20

Model evaluation

In [79]:
#LOOK AT THIS

user_prompts=['what is going on with inmigrants','what is the capital of the US?', 'has inflation gone up?']

def model_evaluation(user_prompts, bot=None):
    """Compare the RAG bot with a plain chat model using an LLM as judge.

    For every prompt, a benchmark answer (chat model without retrieval) and the
    bot's answer are generated. A judge model is asked which answer is more up
    to date, and a second call reduces that verdict to an integer.

    Parameters
    ----------
    user_prompts : list of str
        Questions to evaluate.
    bot : FinFeedRAG, optional
        Bot under evaluation; by default a new one on the 'day1' index is created.

    Returns
    -------
    tuple
        ``(benchmark_answers, our_answers, evaluations, preferred_answers)``;
        ``evaluations`` holds the judge's reply messages and
        ``preferred_answers`` the integer extracted from each verdict
        (1 = benchmark answer, 2 = bot answer, following the prompt labels).

    Raises
    ------
    ValueError
        If a verdict reply contains no integer.
    """
    if bot is None:
        bot = FinFeedRAG(pinecone_index='day1')

    # One deterministic model serves as benchmark, judge and verdict extractor.
    judge_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Answers are passed in as template variables, never concatenated into the
    # template text, so braces inside an answer cannot be parsed as placeholders.
    comparison_template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful finance bot evaluator. You will be given a user prompt, and the corresponding answers of two different bots. Your job is to choose the most up to date answer. "
                   "        Please provide a reason for your choice."),
        ("system", "User_prompt: {user_prompt}"),
        ("system", "Answer 1: {benchmark_answer}"),
        ("system", "Answer 2: {our_answer}")])
    comparison_chain = comparison_template | judge_model

    verdict_template = ChatPromptTemplate.from_messages([
        ("system", "Determine what is the user preferred answer. Please restrict your output to a single integer."),
        ("system", "User_prompt: {evaluation}")])
    verdict_chain = verdict_template | judge_model

    evaluations = []
    our_answers = []
    benchmark_answers = []
    preferred_answers = []

    for user_prompt in user_prompts:
        # Benchmark: the chat model on its own, without retrieved context
        benchmark_answer = judge_model.invoke(user_prompt).content
        # Our answer: retrieval-augmented chain
        our_answer = bot.chain(user_prompt)

        evaluation = comparison_chain.invoke({
            "user_prompt": user_prompt,
            "benchmark_answer": benchmark_answer,
            "our_answer": our_answer,
        })

        our_answers.append(our_answer)
        benchmark_answers.append(benchmark_answer)
        evaluations.append(evaluation)

    # Reduce each free-text verdict to the integer label of the preferred answer
    for evaluation in evaluations:
        verdict = verdict_chain.invoke({"evaluation": evaluation.content})
        # Accept replies such as "2" as well as "Answer 2": take the first integer.
        number = re.search(r'-?\d+', verdict.content)
        if number is None:
            raise ValueError("Could not extract a preferred answer from: " + repr(verdict.content))
        preferred_answers.append(int(number.group()))

    return benchmark_answers, our_answers, evaluations, preferred_answers
    

In [86]:
benchs, ours, evals, pref = model_evaluation(user_prompts)



In [87]:
user_prompts

['what is going on with inmigrants',
 'What is the capital of the US?',
 'Has inflation gone up?']

In [88]:
benchs

['There are various issues and challenges facing immigrants around the world. Some of the key issues include:\n\n1. Immigration policies: Many countries have strict immigration policies that make it difficult for immigrants to enter and stay in the country legally. This can lead to undocumented immigration and exploitation of immigrants.\n\n2. Discrimination and xenophobia: Immigrants often face discrimination and xenophobia in their host countries, which can impact their ability to find employment, housing, and access to services.\n\n3. Economic challenges: Immigrants may struggle to find stable employment and earn a living wage, leading to economic insecurity and poverty.\n\n4. Legal challenges: Immigrants may face legal challenges such as deportation, detention, and lack of access to legal representation.\n\n5. Integration: Immigrants may struggle to integrate into their new communities due to language barriers, cultural differences, and lack of social support.\n\nOverall, the situa

In [89]:
ours

['As per the latest news received, there is a significant discussion on the broken fundamental state of the US immigration system. The Secretary Mayorkas mentioned that the immigration system is facing challenges due to disparities in people seeking asylum, concerns about illegal immigrants entering the country, and the reduction in the number of people qualifying for asylum. Additionally, there is a growing number of displaced people in the United States, with factors such as violence, insecurity, poverty, and extreme weather events driving people to leave their home countries. Overall, the issue of immigration is a complex and pressing one that requires attention and reform.',
 'I do not have enough information to answer this question.',
 'I do not have enough information to answer this question.']

In [90]:
evals

[AIMessage(content='I would choose Answer 2 as the most up to date response. This answer provides specific information about the current state of the US immigration system, including recent statements from Secretary Mayorkas and the challenges faced by immigrants seeking asylum. It also mentions the growing number of displaced people in the United States due to various factors. Answer 1, on the other hand, provides a general overview of the issues and challenges facing immigrants without specific details or recent updates.', response_metadata={'token_usage': {'completion_tokens': 91, 'prompt_tokens': 409, 'total_tokens': 500}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-694cac36-7ad7-4b6f-bf24-adfa60246180-0'),
 AIMessage(content='I would choose Answer 1: "The capital of the United States is Washington, D.C." This answer provides the correct and up-to-date information about the capital of the US. Answer 2 is not helpful

In [91]:
pref

[2, 1, 1]