In [28]:
import os
from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import Pinecone,  PodSpec
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnablePassthrough
import re
from collections import Counter

# Load the environment variables from API_KEYS.env'
load_dotenv("API_KEYS")
# Accessing the OPENAI_API_KEY variable
openai_api_key = os.getenv('OPENAI_API_KEY')


In [11]:
print(os.getenv('PINECONE_API_KEY'))

22406bd9-364f-4cd4-8728-1b4f96243697


In [95]:
class FinFeedRAG:
    def __init__(self, pine_cone_api_key, openai_api_key, pinecone_index, embeddings_model= OpenAIEmbeddings(),model='gpt-3.5-turbo'):
        self.openai_api_key=openai_api_key
        self.api_key_pinecone = pine_cone_api_key
        self.pinecone_index = pinecone_index
        # Initialize Pinecone connection
        self.vector_db = None
        self.embeddings=embeddings_model
        self.model=model
        self.template = """
                Answer the question based on the context below. If you can't
                answer the question, reply "I don't know".
                
                Context: {context}
                
                Question: {question}
                """

    def initialize_pinecone(self):
        if self.vector_db is None:  # Check if it's already initialized
            pc = Pinecone(api_key=self.api_key_pinecone)
            self.vector_db = pc.Index(self.pinecone_index)  # Connect to the index and store the connection
        return self.vector_db
        
    
    def preprocess_youtube_text(self, text_file, chunksize,chunkoverlap):

        text_file=preprocess_input(text_file)
        
        loader = TextLoader(text_file) #text instance of langchain
        text_documents = loader.load() 
        # Assuming RecursiveCharacterTextSplitter is a class you have access to or have created
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunksize, chunk_overlap=chunkoverlap)
        processed_text = splitter.split(text_documents)
        # Further processing can be done here if necessary
        return processed_text

    def upload_to_vb(self,text,embeddings,index=None):
        if index is None:
            index = self.pinecone_index
        return PineconeVectorStore.from_documents(self.preprocess_youtube_text(text), self.embeddings, index_name=index)     


    def preprocess_input(self, text_file,save_back_to_file=True):
        # Simple text preprocessing: lowercasing, removing punctuation need to add more preprocessing steps do research on it (it also removes articles like a, an, the, is,are etc, idk it is necessary)
        # Read and process the content and rewrite it
        if save_back_to_file==True:
            with open(text_file, 'r') as file:
                # Read the contents of the file
                text = file.read()
            processed_text = text.lower()
            processed_text = re.sub(r'[^\w\s]', '', processed_text)
            tokens = word_tokenize(processed_text)
            filtered_words = [word for word in tokens if word.lower() not in stopwords.words('english')]
            # Join words back into a single string
            final_text = ' '.join(filtered_words)
            # Write the processed content back, replacing the original
            with open(text_file, 'w') as file:
                file.write(final_text)
        else:
            with open(text_file, 'r') as file:
                # Read the contents of the file
                text = file.read()
            processed_text = text.lower()
            processed_text = re.sub(r'[^\w\s]', '', processed_text)
            tokens = word_tokenize(processed_text)
            filtered_words = [word for word in tokens if word.lower() not in stopwords.words('english')]
            # Join words back into a single string
            final_text = ' '.join(filtered_words)
            return final_text
        
           

    def most_common(self, input_text_file,most_common=10):
        # Preprocess the text
        processed_text = self.preprocess_input(input_text_file,save_back_to_file=False)    
        # Extract keywords based on frequency, assuming more frequent terms are more relevant
        words = processed_text.split()
        word_freq = Counter(words)
        common_words = word_freq.most_common(most_common)  # Get the top 5 words       
        # Form a query by joining the most common words
        query = ' '.join(word for word, _ in common_words)
        return query

     
    def retrieve_embeddings(self, query, most_similar=2):
        assert self.vector_db is not None, "Initialize Pinecone first"
        query_result = self.vector_db.query(vector=self.embeddings.embed_query(query), top_k=most_similar)
        ids = [item['id'] for item in query_result['matches']]
        return [self.vector_db.fetch(ids)['vectors'][id]['values'] for id in ids]


    def query_langchain(self, query, most_similar=2,index=None):
        if index is None:
            index = self.pinecone_index
        # Use LangChain to process the query and get results
        return PineconeVectorStore.from_existing_index(index_name=self.pinecone_index,embedding=self.embeddings).similarity_search(query,k=most_similar)
        

    def provide_context(self, query,index=None,most_similar=2):
        if index is None:
            index = self.pinecone_index
        # Provide context to LLM
        return PineconeVectorStore.from_existing_index(index_name=index,embedding=self.embeddings).as_retriever(search_type='similarity',
                search_kwargs={
                'k': most_similar}).invoke(query)
        
    def prompt(self,template=None):
        if template is None:
            template = self.template
        return ChatPromptTemplate.from_template(template)
        
    def llm(self,model=None):
        if model is None:
            model = self.model
        return ChatOpenAI(openai_api_key=self.openai_api_key, model=model)
        
    def parser(self):
        return StrOutputParser()

    def chain(self,query):
        #complete_query = self.prompt().format(context=self.provide_context(query),question=query)
        #response = self.llm().invoke(complete_query)
        #return self.parser().invoke(response)
        chaining = (
        {"context": PineconeVectorStore.from_existing_index(index_name=self.pinecone_index,embedding=self.embeddings).as_retriever(search_type='similarity',
                search_kwargs={
                'k': 4}), 
         "question": RunnablePassthrough()}
        | self.prompt()
        | self.llm()
        | self.parser())
        return chaining.invoke(query)
     

In [98]:
finfeed = FinFeedRAG(pine_cone_api_key=os.getenv('PINECONE_API_KEY'), openai_api_key=os.getenv('OPENAI_API_KEY'), pinecone_index='youtube-index')

In [99]:
finfeed.most_common('transcription1.txt',12
                   )

'gaza us workers trump israeli quote cohen people since monday state said'

In [97]:
finfeed.preprocess_input('transcription1.txt')

In [104]:
finfeed.chain("what are the most important news today")

"The most important news today includes the ruling expected in London on Julian Assange's extradition case, the death toll and ongoing flooding in Rio Grande de Soule, Israeli bombing in Gaza resulting in casualties, unionized workers at the University of California voting to authorize a strike, and developments regarding former President Trump's potential investigations and Secretary of State Antony Blinken's visit to Ukraine."

In [5]:
finfeed.chain('what are the most important news today')

"The most important news today includes the court ruling in London regarding Julian Assange's extradition to the U.S., the death toll and ongoing flooding in Rio Grande de Soule, Israeli bombings in Gaza leading to casualties including children, and unionized workers at the University of California voting to authorize a strike in protest of the crackdown on Gaza encampments."

In [101]:
finfeed.initialize_pinecone()

<pinecone.data.index.Index at 0x1fa4c07b3b0>

In [102]:
finfeed.retrieve_embeddings("what are the most important news today")

[[-0.0148902787,
  -0.0211708155,
  -0.0078472048,
  0.000947106339,
  -0.00557345664,
  0.0150150582,
  -0.024068458,
  -0.0143911634,
  -0.0208380725,
  -0.0267720018,
  0.0272572525,
  -0.00459255604,
  -0.000340975617,
  0.0127829025,
  -0.0191327613,
  -0.00840870943,
  0.0290041566,
  -0.0241239164,
  -0.001996462,
  -0.0129908677,
  0.0126997167,
  0.0379882343,
  -0.0120827546,
  0.000273603684,
  -0.0142663848,
  -0.00319572561,
  0.00174517126,
  -0.0322761349,
  -0.000383868348,
  -0.0304737743,
  -0.0100308349,
  -0.0136563545,
  -0.0261758342,
  -0.00523031456,
  -0.008387913,
  0.0106616616,
  0.00564971,
  -0.00191500911,
  0.0103011895,
  -0.0261758342,
  0.0219610818,
  -0.0140098948,
  0.00651969621,
  -0.0270631518,
  -0.031472005,
  0.0135731688,
  0.00261169113,
  -0.0195209626,
  -0.0122560579,
  0.0340784974,
  0.0417870581,
  0.0264947154,
  -0.0119579751,
  -0.024830997,
  -0.00788879767,
  -0.0160132889,
  -0.0104190363,
  -0.0159162395,
  0.00677618617,
  0.0

In [17]:
PineconeVectorStore()

AttributeError: 'Index' object has no attribute 'PineconeVectorStore'

In [36]:
text = PineconeVectorStore.from_existing_index(index_name='youtube-index',embedding=OpenAIEmbeddings()).as_retriever(search_type='similarity',
                search_kwargs={
                'k': 2}).invoke('news')[0]

In [41]:
finfeed.preprocess_user_input(text.page_content)



In [39]:
text.page_content



In [57]:
finfeed.pr

In [60]:
loader = TextLoader('transcription1.txt') #text instance of langchain
text_documents = loader.load() 

In [61]:
text_documents



In [85]:

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure stopwords are downloaded
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Example text
processed_text = "Hello, world! This is a test. #python"

# Remove non-word characters except for whitespace
cleaned_text = re.sub(r'[^\w\s]', '', processed_text)

# Tokenize the text
tokens = word_tokenize(cleaned_text)

# Remove stopwords
filtered_words = [word for word in tokens if word.lower() not in stopwords.words('english')]

# Join words back into a single string
final_text = ' '.join(filtered_words)

print(final_text)


Hello world test python


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Korel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Korel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
