In [1]:
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import  Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import  BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv

# Read key=value pairs from a local .env file into the process environment
# (python-dotenv); the API key is looked up from os.environ in the next cell.
load_dotenv()

True

In [2]:
# Look up the OpenAI key from the environment; the value is None when the variable is unset.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
# Confirm the key is present WITHOUT echoing the secret into the notebook's
# saved output (printing it leaks the credential to anyone who sees the file).
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is missing or empty; set it in the environment or in .env")
print("OPENAI_API_KEY is set")


[REDACTED: an OpenAI API key was printed here. Revoke and rotate that key, and clear this cell's output before sharing the notebook.]


In [4]:
# Create the chat model once, then send a throwaway prompt as a connectivity check.
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model='gpt-4o-mini',
)
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-c138a6be-224f-43ad-8ad0-5d304407cea5-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Process PDF document

In [5]:
# Point the loader at the PDF, read it, and display the loaded documents.
loader = PyPDFLoader(file_path='../RAG-PDF/content/emotion.pdf')
pages = loader.load()
pages

[Document(metadata={'source': '../RAG-PDF/content/emotion.pdf', 'page': 0}, page_content='Proceedings of Recent Advances in Natural Language Processing, pages 750–757,\nVarna, Bulgaria, Sep 2–4, 2019.\nhttps://doi.org/10.26615/978-954-452-056-4_087\n750\nSentiment and Emotion Based Text\nRepresentation for Fake Reviews Detection\nAlimuddin Melleng\nQueen’s University Belfast\namelleng01@qub.ac.uk\nAnna-Jurek Loughrey\nQueen’s University Belfast\na.jurek@qub.ac.uk\nDeepak P\nQueen’s University Belfast\ndeepaksp@acm.org\nAbstract\nFake reviews are increasingly prevalent\nacross the Internet. They can be uneth-\nical and harmful. They can affect busi-\nnesses and mislead customers. As opin-\nions on the Web are increasingly relied on,\nthe detection of fake reviews has become\nmore critical. In this study we explore\nthe effectiveness of sentiment and emo-\ntions based representations for the task\nof building machine learning models for\nfake reviews detection. The experiment\nperformed 

## Split Document

In [6]:
# Split on paragraph breaks first, then single newlines, then spaces;
# sizes are measured in characters because length_function is len.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
)

chunks = text_splitter.split_documents(pages)


## Create embeddings

In [7]:
def get_embedding_function():
    """Return an OpenAIEmbeddings client for ``text-embedding-ada-002``.

    The client is authenticated with the module-level ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Build the shared embedding client and embed one word as a quick check that it works.
embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query(text="cat")


In [8]:
from langchain.evaluation import  load_evaluator

# Build an embedding-distance evaluator on top of the same embedding client,
# then score two unrelated words against each other.
evaluator = load_evaluator("embedding_distance", embeddings=embedding_function)

evaluator.evaluate_strings(reference="pizza", prediction="Amsterdam")


{'score': 0.2207434692141369}

## Create Vector Database

In [9]:

import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Build a Chroma vector store from de-duplicated document chunks.

    Every chunk gets a deterministic id (UUIDv5 of its ``page_content``), so
    chunks with identical text collapse to a single entry. Only the first
    occurrence of each distinct text is kept, in the original order.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object handed to Chroma as ``embedding``.
        vectorstore_path: Directory handed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # A dict keeps insertion order, so each id stays paired with its own chunk.
    # Passing list(set_of_ids) instead would hand Chroma the ids in arbitrary
    # set order and attach them to the wrong documents.
    chunk_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunk_by_id.setdefault(chunk_id, chunk)  # first occurrence wins

    # Create a new Chroma database from the unique documents
    vectorstore = Chroma.from_documents(documents=list(chunk_by_id.values()),
                                        ids=list(chunk_by_id.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    # NOTE(review): newer Chroma integrations reportedly persist automatically
    # and deprecate/remove persist(); call it only when the wrapper exposes it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    return vectorstore

In [10]:
# Embed the split chunks and store them in an on-disk Chroma database
vectorstore = create_vectorstore(
    chunks,
    embedding_function,
    "vectorstore_chroma",
)

  vectorstore.persist()


### 2. Query for relevant data

In [11]:
# Re-open the Chroma store from its persist directory
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory="vectorstore_chroma",
)


  vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)


In [12]:
# Wrap the store in a similarity-search retriever and fetch the chunks
# retrieved for a sample question

retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke(input="What is the title of the paper")
relevant_chunks

[Document(metadata={'page': 6, 'source': '../RAG-PDF/content/emotion.pdf'}, page_content='Metadata.” Proceedings of the 21th ACM \nSIGKDD, 2015. \nRout, Jitendra Kumar, et al. “Deceptive Review \nDetection Using Labeled and Unlabeled Data.” \nMultimedia Tools and Applications , vol. 76, no. \n3, Multimedia Tools and Applications, 2017, pp. \n3187–211, doi:10.1007/s11042-016-3819-y.'),
 Document(metadata={'page': 6, 'source': '../RAG-PDF/content/emotion.pdf'}, page_content='Conference on Knowledge Discovery and Data \nMining, ACM, 2013, \ndoi:10.1145/2487575.2487580. \nLi, Luyang, et al. “Document Representation and \nFeature Combination for Deceptive Spam \nReview Detection.” Neurocomputing, vol. 254, \nElsevier B.V., 2017, pp. 1339 –51, \ndoi:10.1016/j.neucom.2016.10.080. \nLuca, Michael, and Georgios Zervas. “Fake It till You \nMake It: Reputation.” Competition, and Yelp \nReview Fraud., SSRN Electronic Journal, 2016. \nNielsen, Finn Årup. “A New ANEW: Evaluation of a \nWord List for

In [13]:
query = "What is the title of the paper?"
retrieved_docs = retriever.invoke(query)
# Print only the text of each retrieved chunk, one after another
for chunk_text in (doc.page_content for doc in retrieved_docs):
    print(chunk_text)


Metadata.” Proceedings of the 21th ACM 
SIGKDD, 2015. 
Rout, Jitendra Kumar, et al. “Deceptive Review 
Detection Using Labeled and Unlabeled Data.” 
Multimedia Tools and Applications , vol. 76, no. 
3, Multimedia Tools and Applications, 2017, pp. 
3187–211, doi:10.1007/s11042-016-3819-y.
Conference on Knowledge Discovery and Data 
Mining, ACM, 2013, 
doi:10.1145/2487575.2487580. 
Li, Luyang, et al. “Document Representation and 
Feature Combination for Deceptive Spam 
Review Detection.” Neurocomputing, vol. 254, 
Elsevier B.V., 2017, pp. 1339 –51, 
doi:10.1016/j.neucom.2016.10.080. 
Luca, Michael, and Georgios Zervas. “Fake It till You 
Make It: Reputation.” Competition, and Yelp 
Review Fraud., SSRN Electronic Journal, 2016. 
Nielsen, Finn Årup. “A New ANEW: Evaluation of a 
Word List for Sentiment Analysis in 
Microblogs.” CEUR Workshop Proceedings , 
vol. 718, 2011, pp. 93 –98, 
doi:10.1016/j.knosys.2015.06.015. 
Ott, Myle, et al. “Finding Deceptive Opinion Spam by 
Any Stretch of th

In [14]:
# Access all stored documents through the public Chroma.get() API
# instead of reaching into the private _collection attribute
docs = vectorstore.get()
for doc in docs['documents']:
    print(doc)  # Each doc is a chunk of text


Proceedings of Recent Advances in Natural Language Processing, pages 750–757,
Varna, Bulgaria, Sep 2–4, 2019.
https://doi.org/10.26615/978-954-452-056-4_087
750
Sentiment and Emotion Based Text
Representation for Fake Reviews Detection
Alimuddin Melleng
Queen’s University Belfast
amelleng01@qub.ac.uk
Anna-Jurek Loughrey
Queen’s University Belfast
a.jurek@qub.ac.uk
Deepak P
Queen’s University Belfast
deepaksp@acm.org
Abstract
Fake reviews are increasingly prevalent
across the Internet. They can be uneth-
ical and harmful. They can affect busi-
nesses and mislead customers. As opin-
ions on the Web are increasingly relied on,
the detection of fake reviews has become
more critical. In this study we explore
the effectiveness of sentiment and emo-
tions based representations for the task
of building machine learning models for
fake reviews detection. The experiment
performed with three real-world datasets
demonstrate that improved data represen-
tation can be achieved by combining sen-
time

In [15]:
# Access stored documents and their metadata through the public Chroma.get() API
# instead of reaching into the private _collection attribute
docs = vectorstore.get()
for doc, metadata in zip(docs['documents'], docs['metadatas']):
    print(f"Document: {doc}")
    print(f"Metadata: {metadata}")


Document: Proceedings of Recent Advances in Natural Language Processing, pages 750–757,
Varna, Bulgaria, Sep 2–4, 2019.
https://doi.org/10.26615/978-954-452-056-4_087
750
Sentiment and Emotion Based Text
Representation for Fake Reviews Detection
Alimuddin Melleng
Queen’s University Belfast
amelleng01@qub.ac.uk
Anna-Jurek Loughrey
Queen’s University Belfast
a.jurek@qub.ac.uk
Deepak P
Queen’s University Belfast
deepaksp@acm.org
Abstract
Fake reviews are increasingly prevalent
across the Internet. They can be uneth-
ical and harmful. They can affect busi-
nesses and mislead customers. As opin-
ions on the Web are increasingly relied on,
the detection of fake reviews has become
more critical. In this study we explore
the effectiveness of sentiment and emo-
tions based representations for the task
of building machine learning models for
fake reviews detection. The experiment
performed with three real-world datasets
demonstrate that improved data represen-
tation can be achieved by combining

In [None]:
# Count the number of stored chunks (public Chroma.get() API, not the private _collection)
docs = vectorstore.get()
print(f"Number of stored chunks: {len(docs['documents'])}")


In [None]:
# Use the retriever for targeted debugging.
# invoke() replaces the deprecated get_relevant_documents() and matches the
# retriever calls used elsewhere in this notebook.
retrieved_docs = vectorstore.as_retriever().invoke("What is the title of the paper?")
for doc in retrieved_docs:
    print(doc.page_content)


In [None]:
# One question per piece of information we want from the paper
title_query = "What is the title of this paper?"
summary_query = "What is the summary of this paper?"
year_query = "What is the publication year of this paper?"
authors_query = "Who are the authors of this paper?"

# NOTE(review): rag_chain is only defined in a later cell of this notebook, so
# this cell fails on a fresh kernel unless that cell has been run first.
# The questions are sent one at a time, in the order listed.
title, summary, year, authors = (
    rag_chain.invoke(question)
    for question in (title_query, summary_query, year_query, authors_query)
)


In [None]:
# Display the chain's response to title_query.
title

In [16]:
# Prompt template with two placeholders: {context} is filled with the retrieved
# chunk text and {question} with the user's question.

PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""
# PROMPT_TEMPLATE = """
# You are an assistant for extracting information about research articles.
# Use the retrieved context to extract the following information:

# - Title of the article
# - Summary of the article
# - Year of publication
# - Names of the authors

# If any field is not explicitly mentioned in the context, say "Not available."

# Context:
# {context}

# Question:
# {question}

# """

In [17]:
# Join the text of the retrieved chunks, with a '---' rule between chunks

context_text = "\n\n---\n\n".join(chunk.page_content for chunk in relevant_chunks)

# Build the template object, then fill in both placeholders to get the final prompt string
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    question="What is the title of the paper?",
    context=context_text,
)

print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

Metadata.” Proceedings of the 21th ACM 
SIGKDD, 2015. 
Rout, Jitendra Kumar, et al. “Deceptive Review 
Detection Using Labeled and Unlabeled Data.” 
Multimedia Tools and Applications , vol. 76, no. 
3, Multimedia Tools and Applications, 2017, pp. 
3187–211, doi:10.1007/s11042-016-3819-y.

---

Conference on Knowledge Discovery and Data 
Mining, ACM, 2013, 
doi:10.1145/2487575.2487580. 
Li, Luyang, et al. “Document Representation and 
Feature Combination for Deceptive Spam 
Review Detection.” Neurocomputing, vol. 254, 
Elsevier B.V., 2017, pp. 1339 –51, 
doi:10.1016/j.neucom.2016.10.080. 
Luca, Michael, and Georgios Zervas. “Fake It till You 
Make It: Reputation.” Competition, and Yelp 
Review Fraud., SSRN Electronic Journal, 2016. 
Nielsen, Finn Årup. “A New ANEW: Evaluation of a 

In [18]:
# Send the manually formatted prompt string to the chat model.
llm.invoke(prompt)

AIMessage(content='The title of the paper is "Sentiment and Emotion Based Text Representation for Fake Reviews Detection."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 1206, 'total_tokens': 1226, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0aa8d3e20b', 'finish_reason': 'stop', 'logprobs': None}, id='run-62b43325-5d19-4b26-a59f-dfd96e5e7075-0', usage_metadata={'input_tokens': 1206, 'output_tokens': 20, 'total_tokens': 1226, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Using Langchain Expression Language

In [19]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    texts = [document.page_content for document in docs]
    return "\n\n".join(texts)

# The incoming question feeds both prompt inputs: it is passed to the retriever
# (whose results are flattened by format_docs) and forwarded unchanged as the question.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt_template | llm

# rag_chain.invoke("What is the title of the article?")
rag_chain.invoke("what's the title of this paper.")

AIMessage(content='The title of the paper is "Sentiment and Emotion Based Text Representation for Fake Reviews Detection."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 1200, 'total_tokens': 1220, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'stop', 'logprobs': None}, id='run-d4bfbddb-bc43-4450-beca-0c4448de8fe6-0', usage_metadata={'input_tokens': 1200, 'output_tokens': 20, 'total_tokens': 1220, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Generate Structured responses

In [22]:

    
class ExtractedInfo(BaseModel):
    """Extracted Information about the research paper"""

    # All four fields are required (no defaults).
    paper_title: str = Field(..., description="Title of the paper")
    paper_summary: str = Field(..., description="Summary of the paper")
    publication_year: int = Field(..., description="Year of publication of the paper")
    paper_authors: str = Field(..., description="Names of the authors of the paper")
    


In [26]:
# Same retrieval + prompt pipeline, but the model is asked to reply in the
# ExtractedInfo schema rather than free text.
structured_llm = llm.with_structured_output(ExtractedInfo, strict=True)
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | structured_llm
)

rag_chain.invoke("what is the title of the research paper.")
# rag_chain.invoke("what is the title, summary, publication date, authors of the research paper.")

ExtractedInfo(paper_title=AnswerWithSources(answer='Sentiment and Emotion Based Text Representation for Fake Reviews Detection', sources='Sentiment and Emotion Based Text Representation for Fake Reviews Detection Alimuddin Melleng Queen’s University Belfast amelleng01@qub.ac.uk Anna-Jurek Loughrey Queen’s University Belfast a.jurek@qub.ac.uk Deepak P Queen’s University Belfast deepaksp@acm.org', reasoning='The title is explicitly mentioned at the beginning of the provided context.'), paper_summary=AnswerWithSources(answer='Fake reviews are increasingly prevalent across the Internet and can be unethical and harmful, affecting businesses and misleading customers. The study explores the effectiveness of sentiment and emotion-based representations for detecting fake reviews.', sources='Fake reviews are increasingly prevalent across the Internet. They can be unethical and harmful. They can affect businesses and mislead customers. As opinions on the Web are increasingly relied on, the detect

In [None]:
# Rebuild the structured-output chain, then ask for every field in one question.
retrieval_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = retrieval_inputs | prompt_template | llm.with_structured_output(ExtractedInfo, strict=True)

rag_chain.invoke("what is the title, summary, publication date, authors of the research paper.")

### Transform response into a dataframe

In [27]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning"""

    # Three required string fields: the answer, its supporting text, and the rationale.
    answer: str = Field(..., description="Answer to question")
    sources: str = Field(..., description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(..., description="Explain the reasoning of the answer based on the sources")


class ExtractedInfo(BaseModel):
    """Extracted Information about the research article"""

    # Each extracted item is itself an AnswerWithSources record.
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [28]:
# Rebuild the chain so it is bound to the ExtractedInfo class currently in scope
# (the AnswerWithSources-based one). A chain built before that class was
# redefined returns plain str/int fields, and the ['answer'] lookups below then
# fail on a clean Restart & Run All.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm.with_structured_output(ExtractedInfo, strict=True)
)

# structured_response = rag_chain.invoke("Give me the title, summary, publication data, authors of the research paper")
structured_response = rag_chain.invoke("what is the title of the research paper.")
# model_dump() is the Pydantic v2 replacement for the deprecated .dict()
df = pd.DataFrame([structured_response.model_dump()])

# Transform into a table with three rows: 'answer', 'sources' and 'reasoning'.
# Each cell of df holds one field's dict; .loc avoids chained df[col][0] indexing.
answer_row = [df.loc[0, col]['answer'] for col in df.columns]
source_row = [df.loc[0, col]['sources'] for col in df.columns]
reasoning_row = [df.loc[0, col]['reasoning'] for col in df.columns]

# Create the new dataframe: one column per extracted field, one row per attribute
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer','sources','reasoning'])
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Sentiment and Emotion Based Text Representatio...,The study explores the effectiveness of sentim...,2019,"Alimuddin Melleng, Anna-Jurek Loughrey, Deepak P"
sources,Sentiment and Emotion Based Text Representatio...,In this study we explore the effectiveness of ...,Proceedings of Recent Advances in Natural Lang...,"Alimuddin Melleng Queen’s University Belfast, ..."
reasoning,The title is explicitly mentioned at the begin...,The summary is derived from the abstract where...,The publication year is indicated in the metad...,The authors' names are listed in the abstract ...


In [31]:
# Select the 'answer' row by label: integer [0] on a string-labelled Series
# relies on deprecated positional fallback and emits a FutureWarning.
structured_response_df.loc['answer', 'paper_authors']

  structured_response_df['paper_authors'][0]


'Alimuddin Melleng, Anna-Jurek Loughrey, Deepak P'