# Quote Generation/Retrieval



In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
import os
import openai

# import utils
import configparser
from pprint import pprint
from llama_index import SimpleDirectoryReader


import os
import openai


import jupyter_black

jupyter_black.load()
# Access values from the sections

import sys

sys.path.append("../app")
import conn_utils

OPENAI_API_KEY = conn_utils.get_open_ai_key("../config.ini")

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
directory = "index_store"

In [9]:
!ls ./../data

Amours.txt                 MetamorphosesI_VII.txt
Fasti.txt                  MetamorphosesVIII_XV.txt
Heroides.txt               MetamorphosesofPublius.txt
LastPoems.txt              RemediaAmoris.txt
LoversAssistant.txt


In [12]:
DATA_DIR = "./../data"
files = [
    "RemediaAmoris.txt",
    "Heroides.txt",
    "Amours.txt",
    "Fasti.txt",
    "MetamorphosesI_VII.txt",
    "MetamorphosesVIII_XV.txt",
    "MetamorphosesofPublius.txt",
    "LoversAssistant.txt",
    "LastPoems.txt",
]
docs = [f"{DATA_DIR}/{file}" for file in files]
documents = SimpleDirectoryReader(input_files=docs).load_data()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=25)
texts = text_splitter.create_documents(docs)
directory = "index_store"
vector_index = FAISS.from_documents(texts, OpenAIEmbeddings())
vector_index.save_local(directory)

vector_index = FAISS.load_local("index_store", OpenAIEmbeddings())
retriever = vector_index.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [13]:
qa_interface = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

response = qa_interface(
    """
I am a big fan of ovid. 
Please recommend memorable quotes to me.
"""
)

print(response["result"])

Sure! Here are some memorable quotes from the works of Ovid:

1. "Love is a kind of warfare." - Metamorphoses
2. "Fortune and love favor the brave." - Amores
3. "The mind can make a heaven of hell, a hell of heaven." - Metamorphoses
4. "The cause is hidden. The effect is visible to all." - Metamorphoses
5. "Dripping water hollows out stone, not through force but through persistence." - Amores
6. "Happy is the man who has broken the chains which hurt the mind, and has given up worrying once and for all." - Metamorphoses
7. "To love and be loved is to feel the sun from both sides." - Epistulae Heroidum (Heroides)
8. "Everything changes, nothing perishes." - Metamorphoses
9. "It is no use to blame the looking glass if your face is awry." - Amores
10. "Let others praise ancient times; I am glad I was born in these." - Metamorphoses

I hope you find these quotes inspiring and enjoyable!


### 2 Issues with Index retrieved/Generated Quotes:
**Goal:** To have ~100 quotes to have tweeted on a rolling basis since this is a quarter of a year and will not likely to be easily repeated.


### Probability of Repeating a Quote Exactly 2 Weeks Later

\begin{equation*}
P(\text{repeated in 2 weeks}) = 1 - \left( \frac{N - 1}{N} \right)^{\frac{14}{m}}
\end{equation*}

 Substituting in the values:

 \begin{equation*}
P(\text{repeated in 2 weeks}) = 1 - \left( \frac{99}{100} \right)^{14}
\end{equation*}

 Calculating this gives:

 \begin{equation*}
P(\text{repeated in 2 weeks}) \approx 0.135
\end{equation*}

Personally, I'd like it to be under 10% for a longer period of time (increases probability)

Tweaking the numbers a bit 
N = 200
days = 21

\begin{equation*}
P(\text{repeated in 3 weeks}) \approx  0.0980
\end{equation*}

The following should be conditional 1) That the quote generated has confidence that the quote was written by Ovid then 2) That the quote is not a variation of a quote already in DB.

#### Authorship authentication

**[Who Wrote it and Why?
Prompting Large-Language Models for Authorship Verification](https://arxiv.org/pdf/2310.08123.pdf)**

In [14]:
# LLM Authorship Attribution
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser


class CommaSeparatedListOutputParser(BaseOutputParser[List[str]]):
    """Parse the output of an LLM call to a comma-separated list."""

    def parse(self, text: str) -> List[str]:
        """Parse the output of an LLM call."""
        return text.strip().split(", ")


template = """Task: On a scale of 0 to 1, with 0 indicating low confidence
and 1 indicating high confidence, please provide a general
assessment of the likelihood that given text 
written by the same author as the provided reference. Your answer should reflect a
moderate level of strictness in scoring. Here are some
relevant variables to this problem.
1. punctuation style(e.g. hyphen, brackets, colon, comma,
parenthesis, quotation mark)
2. special characters style, capitalization style(e.g.
Continuous capitalization, capitalizing certain words)
3. acronyms and abbreviations(e.g. Usage of acronyms
such as OMG, Abbreviations without punctuation marks
such as Mr Rochester vs. Mr. Rochester,Unusual
abbreviations such as def vs. definitely)
4. writing style
5. expressions and Idioms
6. tone and mood
7. sentence structure
8. any other relevant aspect
First step: Understand the problem, extracting relevant
variables and devise a plan to solve the problem. Then,
carry out the plan and solve the problem step by step.
9. One (or both) of the texts is written by the famous Latin author "Ovid"
Finally, show the confidence score.

The following are all quotes by Ovid for reference:
'Love is a thing full of anxious fears.',
 'Now are fields of corn where Troy once stood.',
 "We're slow to believe what wounds us.",
 "The end proves the acts (were done), or the result is a test of the actions; Ovid's line 85 full translation: “The event proves well the wisdom of her [Phyllis'] course.”",
 "Let him who loves, where love success may find, Spread all his sails before the prosp'rous wind.",
 'Resist beginnings; the remedy comes too late when the disease has gained strength by long delays.',
 "Love yields to business. If you seek a way out of love, be busy; you'll be safe then.",
 'The gods behold all righteous actions.',
 'There is a god within us.',
 'The mind, conscious of rectitude, laughed to scorn the falsehood of report.',
 'Every lover is a soldier.',
 'Let the man who does not wish to be idle fall in love!',
 'Far away be that fate!',
 'They bear punishment with equanimity who have earned it.',
 "We take no pleasure in permitted joys. But what's forbidden is more keenly sought.",
 'Who is allowed to sin, sins less.' """
human_template = "{text}"

chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("human", human_template),
    ]
)
chain = chat_prompt | ChatOpenAI() | CommaSeparatedListOutputParser()
# chain.invoke({"text": ""})

In [22]:
# Extract Quotes from LLM Authorshipp Attribution
import pandas as pd
import spacy
import re

nlp = spacy.load("en_core_web_sm")
input_string = response["result"]

# Extract lines starting with a number and strip the number
lines_with_stripped_numbers = [
    re.sub(r"^\d+\.\s*", "", line.strip())
    for line in input_string.splitlines()
    if re.match(r"^\d+\.", line)
]

# Print the extracted lines
for line in lines_with_stripped_numbers:
    print(line)

"Love is a kind of warfare." - Metamorphoses
"Fortune and love favor the brave." - Amores
"The mind can make a heaven of hell, a hell of heaven." - Metamorphoses
"The cause is hidden. The effect is visible to all." - Metamorphoses
"Dripping water hollows out stone, not through force but through persistence." - Amores
"Happy is the man who has broken the chains which hurt the mind, and has given up worrying once and for all." - Metamorphoses
"To love and be loved is to feel the sun from both sides." - Epistulae Heroidum (Heroides)
"Everything changes, nothing perishes." - Metamorphoses
"It is no use to blame the looking glass if your face is awry." - Amores
"Let others praise ancient times; I am glad I was born in these." - Metamorphoses


In [21]:
from itertools import product

# Compare to quotes
generated_quotes = lines_with_stripped_numbers

# Read in existing quotes
df = pd.read_json("./../ovid_quotes.json")

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Process each quote and calculate similarity scores
similarity_data = []

for generated_quote, actual_quote in product(generated_quotes, df["Quote"]):
    # Process the quotes with spaCy
    doc_generated = nlp(generated_quote)
    doc_actual = nlp(actual_quote)

    # Calculate similarity score
    similarity_score = doc_generated.similarity(doc_actual)

    # Append data to the list
    similarity_data.append(
        {
            "generated_quotes": generated_quote,
            "actual_quotes": actual_quote,
            "similarity_score": similarity_score,
        }
    )

# Create a DataFrame from the similarity data
similarity_df = pd.DataFrame(similarity_data).sort_values(
    "similarity_score", ascending=False
)
similarity_df

  similarity_score = doc_generated.similarity(doc_actual)


Unnamed: 0,generated_quotes,actual_quotes,similarity_score
151,"""Everything changes, nothing perishes."" - Meta...","Everything changes, nothing perishes.",0.811338
74,"""The cause is hidden. The effect is visible to...","The cause is hidden, but the effect is visible...",0.795255
6,"""Love is a kind of warfare."" - Metamorphoses",Love yields to business. If you seek a way out...,0.601550
67,"""The cause is hidden. The effect is visible to...",Every lover is a soldier.,0.588966
63,"""The cause is hidden. The effect is visible to...",Love yields to business. If you seek a way out...,0.587367
...,...,...,...
20,"""Fortune and love favor the brave."" - Amores",Now are fields of corn where Troy once stood.,0.136513
134,"""Everything changes, nothing perishes."" - Meta...",Now are fields of corn where Troy once stood.,0.134465
126,"""To love and be loved is to feel the sun from ...",Far away be that fate!,0.122378
145,"""Everything changes, nothing perishes."" - Meta...",Far away be that fate!,0.119651


In [33]:
# Seems like there might be a magic number similarity score < 0.65
similarity_df[similarity_df["similarity_score"] <= 0.65]["generated_quotes"].unique()

array(['"Love is a kind of warfare." - Metamorphoses',
       '"The cause is hidden. The effect is visible to all." - Metamorphoses',
       '"The mind can make a heaven of hell, a hell of heaven." - Metamorphoses',
       '"Let others praise ancient times; I am glad I was born in these." - Metamorphoses',
       '"Everything changes, nothing perishes." - Metamorphoses',
       '"Fortune and love favor the brave." - Amores',
       '"To love and be loved is to feel the sun from both sides." - Epistulae Heroidum (Heroides)',
       '"It is no use to blame the looking glass if your face is awry." - Amores',
       '"Dripping water hollows out stone, not through force but through persistence." - Amores',
       '"Happy is the man who has broken the chains which hurt the mind, and has given up worrying once and for all." - Metamorphoses'],
      dtype=object)

In [55]:
df["Quote"].to_list()

['Love is a thing full of anxious fears.',
 'Now are fields of corn where Troy once stood.',
 "We're slow to believe what wounds us.",
 "The end proves the acts (were done), or the result is a test of the actions; Ovid's line 85 full translation: “The event proves well the wisdom of her [Phyllis'] course.”",
 "Let him who loves, where love success may find, Spread all his sails before the prosp'rous wind.",
 'Resist beginnings; the remedy comes too late when the disease has gained strength by long delays.',
 "Love yields to business. If you seek a way out of love, be busy; you'll be safe then.",
 'The gods behold all righteous actions.',
 'There is a god within us.',
 'The mind, conscious of rectitude, laughed to scorn the falsehood of report.',
 'Every lover is a soldier.',
 'Let the man who does not wish to be idle fall in love!',
 'Far away be that fate!',
 'They bear punishment with equanimity who have earned it.',
 "We take no pleasure in permitted joys. But what's forbidden is mo

In [54]:
# Print the resulting DataFrame
display(similarity_df.sort_values("similarity_score"))

Unnamed: 0,generated_quotes,actual_quotes,similarity_score
123,"""Time flies, death urges, heaven's revolving w...",Let the man who does not wish to be idle fall ...,0.017135
27,"""Fortune favors the bold."" - Metamorphoses",Let the man who does not wish to be idle fall ...,0.068201
77,"""Wherever there is a human being, there is an ...",They bear punishment with equanimity who have ...,0.082028
66,"""Wherever there is a human being, there is an ...",We're slow to believe what wounds us.,0.108871
34,"""One man's meat is another man's poison."" - Me...",We're slow to believe what wounds us.,0.122052
...,...,...,...
115,"""Time flies, death urges, heaven's revolving w...","The end proves the acts (were done), or the re...",0.597987
35,"""One man's meat is another man's poison."" - Me...","The end proves the acts (were done), or the re...",0.606245
6,"""Love is a kind of warfare."" - Amores",Love yields to business. If you seek a way out...,0.609010
51,"""The cause is hidden, but the effect is visibl...","The end proves the acts (were done), or the re...",0.624752


In [None]:
# For sanity-checking on how well a similarity_score does for retrieving a "memorable quote"

In [50]:
print("highest score comparison:")
idx = 54
print("GENERATED:", similarity_df.iloc[54]["generated_quotes"])
print("ACTUAL:", similarity_df.iloc[54]["actual_quotes"])

highest score comparison:
GENERATED: "Let others praise ancient times; I am glad I was born in these." - From Amores
ACTUAL: Love yields to business. If you seek a way out of love, be busy; you'll be safe then.


### Assessment of Quote Retrieval Accuracy

In [15]:
import re

non_ovid_generated_quotes = [
    "One man's meat is another man's poison.",
    "Fortune favors the bold.",
    "Wherever there is a human being, there is an opportunity for a kindness.",
    "Love is a kind of warfare.",
    "One man's meat is another man's poison.",
]


# Define the regex pattern
pattern = r"(\d+\.\d+)"


scores = []
strings = []
for q in non_ovid_generated_quotes:
    r = chain.invoke({"text": q})
    r = " ".join(r)
    strings.append(r)
    match = re.search(pattern, r)
    score = match[0] if match else -1
    scores.append(score)
    print(r)

pd.DataFrame(
    data={
        "questionable_quote": non_ovid_generated_quotes,
        "authorship_match_score": scores,
        "is_original": [0, 0, 0, 1, 0],
    }
)

Based on the given text "One man's meat is another man's poison," it is difficult to determine the likelihood that it was written by the same author as the quotes by Ovid. The writing style and tone of this text are different from the quotes provided. There is no evident use of acronyms or abbreviations and the sentence structure is simple and straightforward. The expression and idiom used in this text are not commonly found in Ovid's quotes. Therefore the likelihood that this text was written by the same author as Ovid is low. Confidence score: 0.2
Confidence Score: 0.4

Explanation: The given text does not exhibit many of the specific characteristics of Ovid's writing style. While the sentence structure is similar and there is a use of an expression or idiom the overall tone and mood of this quote is different from Ovid's usual style. Additionally there are no specific punctuation or capitalization styles that align with Ovid's writing. Therefore there is a moderate level of confiden

NameError: name 'pd' is not defined

### Storage of Additional Quotes

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Given documents
documents = [
    "The duck loves to eat the worm",
    "The worm doesn’t like the early bird",
    "The bird loves to get up early to get the worm",
    "The bird gets the worm from the early duck",
    "The duck and the birds are so different from each other but one thing they have in common is that they both get the worm"
]

# Given query
query = "The early bird gets the worm"

# Tokenization and preprocessing
stopwords = set(["the", "and", "to", "is", "that", "they", "are", "from", "but", "one", "in", "has", "have", "up"])
vectorizer = TfidfVectorizer(vocabulary=["bird", "duck", "worm", "early", "get", "love"])
tfidf_matrix = vectorizer.fit_transform(documents + [query])

# Compute cosine similarity
cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

# Get the top-ranked documents
top_documents_indices = cosine_similarities.argsort()[0][::-1][:2]
top_documents = [documents[i] for i in top_documents_indices]

# Print the results
print("Top-ranked documents:")
for i, document in enumerate(top_documents, start=1):
    print(f"{i}. {document}")

# Check relevance
relevant_documents = ["The bird loves to get up early to get the worm", "The bird gets the worm from the early duck"]
are_relevant = all(document in relevant_documents for document in top_documents)

print("\nAre the top-ranked documents relevant to the query?", are_relevant)


Top-ranked documents:
1. The worm doesn’t like the early bird
2. The bird gets the worm from the early duck

Are the top-ranked documents relevant to the query? False
