# Preprocessing functions and corpus preparation



In [None]:
!pip install openai
!pip install tiktoken
!pip install faiss-gpu
!pip install langchain --quiet
!python -m nltk.downloader stopwords
!python -m spacy download it_core_news_sm
!pip install rank-bm25
!pip install  python-dotenv
!pip install -U langchain-openai

In [2]:
import json
from pathlib import Path
import pandas as pd

# Load the raw corpus: the JSON file is parsed into a DataFrame and its
# "data" column becomes the list of original documents.
file_path = 'docs/test.json'
# Decode explicitly as UTF-8 (the standard JSON encoding) so non-ASCII text
# does not depend on the platform's default encoding.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

df = pd.DataFrame(data)

# `tbr_text` keeps the original documents; `text` is the same list object
# under the name the corpus preprocessing step iterates over.
tbr_text = list(df["data"])
text = tbr_text

In [None]:

def remove_stopwords(text, stop_words=None, tokenizer=None):
    """Return `text` with its stop words removed.

    The text is split into tokens, every token whose lower-cased form is in
    the stop-word set is dropped, and the survivors are joined with single
    spaces.

    Args:
        text: The string to filter.
        stop_words: Optional iterable of lower-case stop words. When omitted,
            the union of NLTK's English and Italian stop-word lists is used.
        tokenizer: Optional callable mapping a string to a list of tokens.
            When omitted, `nltk.tokenize.word_tokenize` is used.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives).
    """
    if stop_words is None:
        from nltk.corpus import stopwords
        # The corpus being processed is Italian, but English terms are
        # filtered as well, hence the union of both lists.
        stop_words = set(stopwords.words('english')) | set(stopwords.words('italian'))
    else:
        stop_words = set(stop_words)

    if tokenizer is None:
        from nltk.tokenize import word_tokenize
        tokenizer = word_tokenize

    # Compare on the lower-cased token so capitalised stop words ("The") are
    # removed too; the surviving tokens keep their original casing.
    return ' '.join(w for w in tokenizer(text) if w.lower() not in stop_words)


def _load_italian_pipeline():
    """Return the `it_core_news_sm` pipeline, loading it on first use only."""
    cached = getattr(_load_italian_pipeline, "_nlp", None)
    if cached is None:
        import it_core_news_sm
        # Loading the model is expensive; keep it on the function object so
        # later calls reuse the same pipeline.
        cached = _load_italian_pipeline._nlp = it_core_news_sm.load()
    return cached


def lemmatize(text, nlp=None):
    """Return `text` with every token replaced by its lemma.

    Args:
        text: The string to lemmatize.
        nlp: Optional callable that turns a string into an iterable of tokens
            exposing a `lemma_` attribute (e.g. an already loaded spaCy
            pipeline). When omitted, the `it_core_news_sm` model is loaded
            once and reused across calls.

    Returns:
        The lemmas of all tokens joined by single spaces.
    """
    if nlp is None:
        nlp = _load_italian_pipeline()
    return ' '.join(token.lemma_ for token in nlp(text))

def preprocess(text):
    """Lower-case `text`, strip its stop words, then lemmatize what remains."""
    lowered = text.lower()
    without_stopwords = remove_stopwords(lowered)
    return lemmatize(without_stopwords)

# Preprocess the whole corpus and persist the result as a CSV file.
import nltk
# Fetch the 'punkt' data through NLTK's downloader (presumably required by
# word_tokenize inside remove_stopwords -- TODO confirm).
nltk.download('punkt')
# text = text.values()
# Run every document in `text` through preprocess().
preprocessed = [preprocess(i) for i in text]
import pandas as pd
# Written with pandas' default header and index; the text column is read back
# below under the name "0".
pd.DataFrame(preprocessed).to_csv("tbr.csv")
# Read the saved column back as a list (the value is not assigned; presumably
# meant for notebook display / a quick sanity check).
list(pd.read_csv("tbr.csv")["0"].values)

# Final pipeline

In [None]:


def get_idx_rank_query(path_to_corpus, query):
    """Rank the corpus documents against `query` with BM25 (lexical search).

    Args:
        path_to_corpus: Path to a CSV file whose column "0" holds one
            space-separated document per row (presumably the file written by
            the preprocessing step).
        query: Raw query string; it is run through `preprocess` before
            scoring.

    Returns:
        A NumPy array of row positions ordered from the highest BM25 score to
        the lowest, i.e. element 0 is the best-matching row.
    """
    # 1.1 Tokenizing
    from rank_bm25 import BM25Okapi

    # Read the column as plain strings: by default pandas turns an empty
    # document into NaN (a float), on which `.split` below would fail.
    corpus = pd.read_csv(path_to_corpus, dtype=str, keep_default_na=False)["0"]
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)

    # 1.2 Preprocessing the query the same way as the corpus
    tokenized_query = preprocess(query).split(" ")

    # 2. Querying
    doc_scores_bm25 = bm25.get_scores(tokenized_query)

    # Sort the scores in descending order; the original row positions move
    # into the "dn" column, which therefore lists documents best-first.
    ranked = (
        pd.DataFrame(doc_scores_bm25, columns=["ds"])
        .sort_values(by="ds", ascending=False)
        .reset_index(names="dn")
    )
    return ranked.dn.values

def get_idx_rank_semantic(path_to_corpus, query):
    """Rank the corpus rows by embedding similarity to `query`.

    Every CSV row is loaded as a document, embedded with OpenAI embeddings
    and indexed in FAISS; the index is then asked for all documents ordered
    by relevance to `query`.

    Args:
        path_to_corpus: Path to the corpus CSV file.
        query: Query string passed to the retriever as-is.

    Returns:
        A pandas Index obtained by sorting the retrieved documents by their
        `row` metadata: position j holds the retrieval rank (0 = most
        relevant) of the document with the j-th smallest row number.
    """
    import os

    # Fall back to the Colab secret store only when the key is not already
    # configured, so the function is also usable outside Google Colab.
    if not os.environ.get("OPENAI_API_KEY"):
        from google.colab import userdata
        os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.document_loaders import CSVLoader

    data = CSVLoader(file_path=path_to_corpus).load()
    docsearch = FAISS.from_documents(data, OpenAIEmbeddings())

    # k = len(data): retrieve every document so each one receives a rank.
    retriever = docsearch.as_retriever(search_kwargs={"k": len(data)})
    docs = retriever.get_relevant_documents(query)

    # Row numbers in retrieval order (most relevant first).
    rank_semantic = [doc.metadata['row'] for doc in docs]

    # Sorting by row number turns "document at rank r" into "rank of
    # document d": the surviving index values are the retrieval ranks.
    return (
        pd.DataFrame(rank_semantic, columns=["rank_semantic"])
        .sort_values(by="rank_semantic")
        .index
    )

def get_rrf_top_k(path_to_corpus, query, k = 3 ):
    """Fuse the semantic and BM25 rankings with reciprocal rank fusion.

    Each document d is scored as
    ``1 / (semantic_rank(d) + 1) + 1 / (bm25_rank(d) + 1)`` with 0-based
    ranks, and the `k` highest-scoring documents are returned.

    Args:
        path_to_corpus: Path to the corpus CSV file, forwarded to both
            ranking helpers.
        query: Query string, forwarded to both ranking helpers.
        k: Number of documents to return.

    Returns:
        A pandas Index with the row positions of the top-`k` documents,
        best first.
    """
    # Position d -> semantic rank of document d.
    semantic_rank_of_doc = get_idx_rank_semantic(path_to_corpus, query)

    # get_idx_rank_query returns the opposite mapping (position r -> document
    # at BM25 rank r), so invert it before combining the two rankings.
    docs_by_bm25_rank = get_idx_rank_query(path_to_corpus, query)
    bm25_rank_of_doc = {int(doc): rank for rank, doc in enumerate(docs_by_bm25_rank)}

    rrf_scores = [
        1 / (semantic_rank_of_doc[doc] + 1) + 1 / (bm25_rank_of_doc[doc] + 1)
        for doc in range(len(docs_by_bm25_rank))
    ]

    # The DataFrame index is the document position, so the index of the k
    # largest scores identifies the best documents.
    return (
        pd.DataFrame(rrf_scores, columns=["rank_rrf"])
        .sort_values(by="rank_rrf", ascending=False)
        .head(k)
        .index
    )


In [None]:
# Demo: rank the corpus for a query and print the best-matching original document.
query = ""  # placeholder: the question to search for
path_to_corpus = ".csv"  # placeholder: path to the corpus CSV file
rrf_top_k = get_rrf_top_k(path_to_corpus, query, k = 3 )
# The corpus column "0" is loaded into `tbrl`; it is not used further in this cell.
tbrl = list(pd.read_csv(path_to_corpus)["0"].values)
# rrf_top_k[0] is the top-ranked row; look it up in the original documents.
print(tbr_text[rrf_top_k[0]])

In [None]:
  # How to use it: write a query about the TBRs in the second cell.
  query = ""
  path_to_corpus = ".csv"

In [None]:
def generate_answer(query, path_to_corpus):
    """Answer `query` with a chat model, using the best-ranked document as context.

    Args:
        query: The user's question.
        path_to_corpus: Path to the corpus CSV file used for retrieval.

    Returns:
        A tuple ``(response, context)``: the value returned by the chat model
        call and the document text that was supplied as context.
    """
    rrf_top_k = get_rrf_top_k(path_to_corpus, query, k = 3 )
    # Only the top-ranked document is used as context; it is looked up in the
    # module-level `tbr_text` list.
    context = tbr_text[rrf_top_k[0]]

    # LangChain
    from langchain.chat_models import ChatOpenAI
    from langchain.schema import HumanMessage

    # Initialize the chat object; temperature=0 is passed through to the model.
    chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    # Prompt layout: the question, then the context on a new line.
    question = f"{query}\n  Context: {context}"
    print(context)

    return chat([HumanMessage(content=question)]), context

In [None]:
# generate_answer returns (chat response, context); print the response's text.
answ = generate_answer(query, path_to_corpus)
print(answ[0].content)
