# Quote Generation/Retrieval

Initial quotes were scraped from one webpage. But that would lead to quite a lot of repitition and kindof a boring bot. So...

### 2 Issues with Index retrieved/Generated Quotes:
**Goal:** To have ~100 quotes to have tweeted on a rolling basis since this is a quarter of a year and will not likely to be easily repeated.

### Probability of Repeating a Quote Exactly 2 Weeks Later

\begin{equation*}
P(\text{repeated in 2 weeks}) = 1 - \left( \frac{N - 1}{N} \right)^{\frac{14}{m}}
\end{equation*}

 Substituting in the values:

 \begin{equation*}
P(\text{repeated in 2 weeks}) = 1 - \left( \frac{99}{100} \right)^{14}
\end{equation*}

 Calculating this gives:

 \begin{equation*}
P(\text{repeated in 2 weeks}) \approx 0.135
\end{equation*}

Personally, I'd like it to be under 10% for a longer period of time (increases probability)

Tweaking the numbers a bit 
N = 200
days = 21

\begin{equation*}
P(\text{repeated in 3 weeks}) \approx  0.0980
\end{equation*}

The following should be conditional 1) That the quote generated has confidence that the quote was written by Ovid then 2) That the quote is not a variation of a quote already in DB.

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
import os
import openai
import pandas as pd

from itertools import product

# import utils
import configparser
from pprint import pprint
from llama_index import SimpleDirectoryReader


import os
import openai


import jupyter_black

jupyter_black.load()
# Access values from the sections

import sys

sys.path.append("../app")
import conn_utils

OPENAI_API_KEY = conn_utils.get_open_ai_key("./../app/config.ini")

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
directory = "index_store"

pd.set_option("display.max_colwidth", None)

In [6]:
!ls ./../data

Amours.txt                 MetamorphosesVIII_XV.txt
Fasti.txt                  MetamorphosesofPublius.txt
Heroides.txt               RemediaAmoris.txt
LastPoems.txt              notes.jsonl
LoversAssistant.txt        notes_validation.jsonl
MetamorphosesI_VII.txt


## Get low sentency similarity

Instead of using a LLM. You can trust to get a close (but not too close) similarity

In [87]:
import spacy

# Read in exisitng quotes
df = pd.read_json("./../app/ovid_quotes.json")

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Define the file paths
files = [
    "RemediaAmoris.txt",
    "Heroides.txt",
    "Amours.txt",
    "Fasti.txt",
    "MetamorphosesI_VII.txt",
    "MetamorphosesVIII_XV.txt",
    "MetamorphosesofPublius.txt",
    "LoversAssistant.txt",
    "LastPoems.txt",
]

# Read in the text files and split sentences
from tqdm import tqdm

sentences = []
for file in tqdm(["Amours.txt"]):
    with open(os.path.join("./../data", file), "r") as f:
        text = f.read()
        doc = nlp(text)
        sentences.extend([sent.text for sent in doc.sents])

# Print the list of sentences
print(f"Num sentences: {len(sentences)}")
pprint(sentences[:10])

100%|██████████| 1/1 [00:16<00:00, 16.55s/it]

Num sentences: 3752
['\ufeffThe Project Gutenberg eBook of The Amores; or, Amours\n'
 '    \n'
 'This ebook is for the use of anyone anywhere in the United States and\n'
 'most other parts of the world at no cost and with almost no restrictions\n'
 'whatsoever.',
 'You may copy it, give it away or re-use it under the terms\n'
 'of the Project Gutenberg License included with this ebook or online\n'
 'at www.gutenberg.org.',
 'If you are not located in the United States,\n'
 'you will have to check the laws of the country where you are located\n'
 'before using this eBook.\n'
 '\n',
 'Title: The Amores; or, Amours\n'
 '\n'
 '\n'
 'Author: Ovid\n'
 '\n'
 'Translator: Henry T. Riley\n'
 '\n'
 'Release date: December 16, 2014 [eBook #47676]\n'
 '\n'
 'Language: English\n'
 '\n'
 'Credits: Produced by David Widger from page images generously\n'
 '        provided by the Internet Archive\n'
 '\n'
 '\n'
 '*** START OF THE PROJECT GUTENBERG EBOOK THE AMORES; OR, AMOURS ***\n'
 '\n'
 '\n'
 '\n'





In [91]:
s = "\ufeffThe Project Gutenberg eBook of The Amores; or, Amours\n"
re.sub("[^0-9a-zA-Z]+", " ", s)

' The Project Gutenberg eBook of The Amores or Amours '

In [100]:
import re


def replace_non_alphanumeric(input_string):
    return re.sub("[^0-9a-zA-Z]+", " ", input_string)


# Clean sentences
sentences = [
    replace_non_alphanumeric(input_string=s).strip()
    for s in sentences
    if len(s.replace("\n", "").strip().split()) > 2
]

sentences = [s for s in sentences if "Footnote" not in s]
sentences = [s for s in sentences if "Project Gutenberg" not in s]
sentences = [s for s in sentences if "--Ver." not in s]
sentences = [s for s in sentences if "The Amores" not in s]
print(len(sentences))
pprint(sentences[:5])

2619
['If you are not located in the United States you will have to check the laws '
 'of the country where you are located before using this eBook',
 'AN EPIGRAM ON THE AMOURS',
 'We who of late were five books',
 '001 of Naso are now but three this work our author has preferred to the '
 'former one',
 'Though it should 002 now be no pleasure to thee to read us still the labour '
 'will be less the two being removed']


In [120]:
# Save to temp disk
# Define the file path
file_path = "/Users/aus10powell/Downloads/sentences.json"

pd.DataFrame({"sentences": sentences}).to_json(file_path, index=False)

In [101]:
t = product(sentences, df["Quote"])
t = list(t)

In [8]:
#########################################################################
# POC: To see if generate a response based on a quote
# Situation: There is a general tweet
# Task: Generate a list of candidate quotes
# Action: Create a model that assigns a score to each quote
# Result: The top 1 quote is returned
##################################################################################################################################################
import spacy
import pandas as pd

# Read in exisitng quotes
df = pd.read_json("./../app/ovid_quotes.json")
sentences = df["Quote"].tolist()
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")
actual_quote = "People will do anything, no matter how absurd, in order to avoid facing their own souls. One does not become enlightened by imagining figures of light, but by making the darkness conscious."
doc_actual = nlp(actual_quote)
similarity_data = []
for s in sentences:
    doc_generated = nlp(s)
    similarity_score = doc_generated.similarity(doc_actual)
    similarity_data.append(
        {
            "new_quotes": s,
            "actual_quotes": actual_quote,
            "similarity_score": similarity_score,
        }
    )
similarity_df = pd.DataFrame(similarity_data)
similarity_df = similarity_df.sort_values(by="similarity_score", ascending=False).head(10)
similarity_df["new_quotes"].to_list()[:10]
##################################################################################################################################################

  similarity_score = doc_generated.similarity(doc_actual)


['Let the man who does not wish to be idle fall in love!',
 'To love and be loved is to feel the sun from both sides.',
 "Love yields to business. If you seek a way out of love, be busy; you'll be safe then.",
 'Happy is the man who has broken the chains which hurt the mind, and has given up worrying once and for all.',
 'Nothing is stronger than habit.',
 'Dripping water hollows out stone, not through force but through persistence.',
 'It is no use to blame the looking glass if your face is awry.',
 'Resist beginnings; the remedy comes too late when the disease has gained strength by long delays.',
 'The mind, conscious of rectitude, laughed to scorn the falsehood of report.',
 'The cause is hidden. The effect is visible to all.']

In [102]:
similarity_data = []

for generated_quote, actual_quote in tqdm(t):
    # Process the quotes with spaCy
    doc_generated = nlp(generated_quote)
    doc_actual = nlp(actual_quote)

    # Calculate similarity score
    similarity_score = doc_generated.similarity(doc_actual)

    # Append data to the list
    similarity_data.append(
        {
            "new_quotes": generated_quote,
            "actual_quotes": actual_quote,
            "similarity_score": similarity_score,
        }
    )

# Create a DataFrame from the similarity data
similarity_df = pd.DataFrame(similarity_data).sort_values(
    "similarity_score", ascending=False
)
display(similarity_df.head(3))

# Sort and clean
upper_similarity = 0.8
lower_similarity = 0.2

suitable_df = similarity_df[
    (similarity_df["similarity_score"] > lower_similarity)
    & (similarity_df["similarity_score"] < upper_similarity)
].sort_values(
    "similarity_score", ascending=True
)  # .sample(10)
print(suitable_df.shape)
# Clean out very short quotes
suitable_df = suitable_df[suitable_df["new_quotes"].str.len() > 20]
# suitable_df = suitable_df[~suitable_df["new_quotes"].str.contains("Footnote")]
# suitable_df = suitable_df[~suitable_df["new_quotes"].str.contains("--Ver.")]
print(suitable_df.shape)
suitable_df.sample(10)

  similarity_score = doc_generated.similarity(doc_actual)
100%|██████████| 89046/89046 [34:33<00:00, 42.95it/s]    


Unnamed: 0,new_quotes,actual_quotes,similarity_score
11299,Let him who wishes not to become slothful fall in love,Let the man who does not wish to be idle fall in love!,0.767422
83568,He alludes to the custom of the nearest relative closing the eyes of the dying person,The mind moves in the direction of our currently dominant thoughts.,0.762584
10536,The one watches at the door of his mistress but the other at that of his general,The mind moves in the direction of our currently dominant thoughts.,0.76001


(56813, 3)
(56016, 3)


Unnamed: 0,new_quotes,actual_quotes,similarity_score
47230,645 Bearing these things and others on which I am silent I have oft endured them find another in my stead who could put up with these things,"Let him who loves, where love success may find, Spread all his sails before the prosp'rous wind.",0.462355
14586,Why should I be punished in my affections if thy husband does decay through length of years,Love is a thing full of anxious fears.,0.362743
67507,As Assyria adjoined India the word Assyrium is here used by poetical licence as really meaning Indian,"The cause is hidden, but the effect is visible to all.",0.260701
40552,Her the impetuous stream beheld from his rapid waves and raised his hoarse mouth from the midst of his fords and thus he said Why in sorrow art thou pacing my banks Ilia the descendant of Laomedon,Fortune and love favor the brave.,0.31344
29470,that I could suddenly be changed into my own present by the arts of her of a or of the Carpathian old man,It is no use to blame the looking glass if your face is awry.,0.369159
28830,I myself though destined as I am to die a more pleasing death by love should have beheld no days had my mother slain me,"And did you, my hands, seize the horns of the mighty bull?",0.297777
7789,my hair let anger nerve your hands weak though they may be,"The end proves the acts (were done), or the result is a test of the actions; Ovid's line 85 full translation: “The event proves well the wisdom of her [Phyllis'] course.”",0.285236
82868,A certain poetic measure was called by this name but we learn from Athenaeus that it was not always confined to pathetic subjects,Every lover is a soldier.,0.28626
45634,To her said Nemesis What dost thou say,"Love yields to business. If you seek a way out of love, be busy; you'll be safe then.",0.211255
66827,Hippolytus was an example of chastity while Priapus was the very ideal of lustfulness,"The cause is hidden, but the effect is visible to all.",0.357871


In [73]:
suitable_df = suitable_df[suitable_df["new_quotes"].str.len() > 20]
suitable_df.sample(10)

Unnamed: 0,new_quotes,actual_quotes,similarity_score
19431,"the young\nLady began to change her Note, and to hope he would not forsake her\nso.\n\n","The cause is hidden, but the effect is visible to all.",0.222706
8661,Believe every Woman is to be\ncome at.,To love and be loved is to feel the sun from both sides.,0.587393
10582,"Agamemnon_, after returning safe from so many bloody Campaigns, and\nfrom the dangerous Seas which he crossed, fell at last a dreadful\nVictim to the Whore his Wife[37].\n\n",There is a god within us.,0.333108
11316,"When your Mistress is in this Humour, let _Abigail_","Happy is the man who has broken the chains which hurt the mind, and has given up worrying once and for all.",0.266805
15899,"Evie Evoe_, two very mysterious Words, and\nfull of Masonry, the God and his new-ravished Bride go together,\nbetween a Pair of sacred Sheets.\n\n","The mind can make a heaven of hell, a hell of heaven.",0.4847
12837,"It is moreover my Advice to you, to be liberal of your Promises; for\nwhat Injury can you receive by Promising?",Love is a kind of warfare.,0.338792
12419,"""I promise you, my Dear,"" says she, ""if you will but buy me this\nsingle Jewel, I will not ask another of you the Lord knows how long;\nbut I have really a present Occasion for this, and besides it is the\ncheapest Thing I ever saw.","The mind, conscious of rectitude, laughed to scorn the falsehood of report.",0.230278
3017,"It\nis well known that the [1]Rules of Art are necessary to the Conduct of\na Ship; for which reason, none but able and experienced Seamen are\npreferred to the Command of one.",To love and be loved is to feel the sun from both sides.,0.337503
25593,"Caunus_; and\nupon his rejecting her Addresses, hanged herself.",To love and be loved is to feel the sun from both sides.,0.283958
19018,"Fie upon it, General, I am ashamed to see you sit quilting among the\nGirls; a Sword becomes your Hands much better than a Needle._\n\n_",Far away be that fate!,0.286274


In [9]:
# df = pd.read_json("./../app/ovid_quotes.json")
# ## Update add new quotes to the dataframe
# new_row = {
#     "Quote": "Let anger nerve your hands weak though they may be.",
#     "Work": "Amores",
#     "Quote in Latin": "Ira vires anima manus, quamvis infirmae sint.",
# }
# df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
# df.to_json("./../app/ovid_quotes.json")
# df.shape

(37, 3)

In [None]:
df = pd.read_json("./../app/ovid_quotes.json")

DATA_DIR = "./../data"
files = [
    "RemediaAmoris.txt",
    "Heroides.txt",
    "Amours.txt",
    "Fasti.txt",
    "MetamorphosesI_VII.txt",
    "MetamorphosesVIII_XV.txt",
    "MetamorphosesofPublius.txt",
    "LoversAssistant.txt",
    "LastPoems.txt",
]
docs = [f"{DATA_DIR}/{file}" for file in files]
documents = SimpleDirectoryReader(input_files=docs).load_data()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=25)
texts = text_splitter.create_documents(docs)
directory = "index_store"
vector_index = FAISS.from_documents(texts, OpenAIEmbeddings())
vector_index.save_local(directory)

vector_index = FAISS.load_local("index_store", OpenAIEmbeddings())
retriever = vector_index.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [None]:
df = pd.read_json("./../app/ovid_quotes.json")
df.head(1)

In [None]:
qa_interface = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

response = qa_interface(
    f"""
I am a big fan of ovid. 
Please recommend 10 memorable quotes to me along with the source document they were taken from.

Do NOT include quotes I already have:
{df["Quote"].tolist()}
"""
)

print(response["result"])

#### Authorship authentication

**[Who Wrote it and Why?
Prompting Large-Language Models for Authorship Verification](https://arxiv.org/pdf/2310.08123.pdf)**

In [None]:
# LLM Authorship Attribution
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser


class CommaSeparatedListOutputParser(BaseOutputParser[List[str]]):
    """Parse the output of an LLM call to a comma-separated list."""

    def parse(self, text: str) -> List[str]:
        """Parse the output of an LLM call."""
        return text.strip().split(", ")


template = f"""Task: On a scale of 0 to 1, with 0 indicating low confidence
and 1 indicating high confidence, please provide a general
assessment of the likelihood that given text 
written by the same author as the provided reference. Your answer should reflect a
moderate level of strictness in scoring. Here are some
relevant variables to this problem.
1. punctuation style(e.g. hyphen, brackets, colon, comma,
parenthesis, quotation mark)
2. special characters style, capitalization style(e.g.
Continuous capitalization, capitalizing certain words)
3. acronyms and abbreviations(e.g. Usage of acronyms
such as OMG, Abbreviations without punctuation marks
such as Mr Rochester vs. Mr. Rochester,Unusual
abbreviations such as def vs. definitely)
4. writing style
5. expressions and Idioms
6. tone and mood
7. sentence structure
8. any other relevant aspect
First step: Understand the problem, extracting relevant
variables and devise a plan to solve the problem. Then,
carry out the plan and solve the problem step by step.
9. One (or both) of the texts is written by the famous Latin author "Ovid"
Finally, show the confidence score.

The following are all quotes by Ovid for reference:
'Love is a thing full of anxious fears.',
 'Now are fields of corn where Troy once stood.',
 "We're slow to believe what wounds us.",
 "The end proves the acts (were done), or the result is a test of the actions; Ovid's line 85 full translation: “The event proves well the wisdom of her [Phyllis'] course.”",
 "Let him who loves, where love success may find, Spread all his sails before the prosp'rous wind.",
 'Resist beginnings; the remedy comes too late when the disease has gained strength by long delays.',
 "Love yields to business. If you seek a way out of love, be busy; you'll be safe then.",
 'The gods behold all righteous actions.',
 'There is a god within us.',
 'The mind, conscious of rectitude, laughed to scorn the falsehood of report.',
 'Every lover is a soldier.',
 'Let the man who does not wish to be idle fall in love!',
 'Far away be that fate!',
 'They bear punishment with equanimity who have earned it.',
 "We take no pleasure in permitted joys. But what's forbidden is more keenly sought.",
 'Who is allowed to sin, sins less.' 
 
 """
human_template = "{text}"

chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("human", human_template),
    ]
)
chain = chat_prompt | ChatOpenAI() | CommaSeparatedListOutputParser()

In [None]:
# Extract Quotes from LLM Authorshipp Attribution
import pandas as pd
import spacy
import re

nlp = spacy.load("en_core_web_sm")
input_string = response["result"]


def get_quotes(input_string):
    """Extract quotes from a string that has been created by an LLM string prompt."""
    # Extract the lines that start with a number
    lines_with_stripped_numbers = [
        re.sub(r"^\d+\.\s*", "", line.strip())
        for line in input_string.splitlines()
        if re.match(r"^\d+\.", line)
    ]

    # Print the extracted lines
    quotes = []
    pattern2 = r'"([^"]+)"\s*-\s*(.+)'

    for line in lines_with_stripped_numbers:
        match = re.search(pattern2, line)
        q = match.group(1)
        w = match.group(2)
        quotes.append((q, w))
    return quotes


quotes = get_quotes(response["result"])
quotes

### Score New Quotes

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate


def score_quotes(quotes=None):
    """Score a list of quotes that have been generated by an LLM string prompt.

    quotes example:
    [('Dripping water hollows out stone, not through force but through persistence.',
    'Metamorphoses'),

    """
    if quotes is None:
        quotes = []

    # Define the regex pattern
    pattern = r"(\d+\.\d+)"

    # Get the quote string (not the work)
    quotes_text = [q[0] for q in quotes]
    quotes_work = [q[1] for q in quotes]

    # Extract the scores
    scores = []
    score_reasons = []
    for q in quotes_text:
        r = chain.invoke({"text": q})
        r = " ".join(r)
        score_reasons.append(r)
        match = re.search(pattern, r)
        score = match[0] if match else -1
        scores.append(score)
    return list(zip(*(scores, score_reasons, quotes_text, quotes_work)))


# Example usage
scored_quotes = score_quotes(quotes)
scored_quotes_df = pd.DataFrame(
    scored_quotes, columns=["score", "reason", "quote", "work"]
)
pd.set_option('display.max_colwidth', None)
scored_quotes_df  # ["score"]

In [None]:
scored_quotes_df["quote"].tolist()

### Context-Based Quotation Recommendation

Resource:
* https://arxiv.org/pdf/2005.08319.pdf

### Assessment of Retrieval Accuracy

The below uses current pipeline for 2 reasons:
1) To assess the hulicination affect against current pipeline
2) To assess scoring variability

In [None]:
import re

non_ovid_generated_quotes = [
    "One man's meat is another man's poison.",
    "Fortune favors the bold.",
    "Wherever there is a human being, there is an opportunity for a kindness.",
    "Love is a kind of warfare.",
    "One man's meat is another man's poison.",
    "To be loved, be lovable.",
]


# Define the regex pattern
pattern = r"(\d+\.\d+)"


scores = []
strings = []
for q in non_ovid_generated_quotes:
    r = chain.invoke({"text": q})
    r = " ".join(r)
    strings.append(r)
    match = re.search(pattern, r)
    score = match[0] if match else -1
    scores.append(score)
    print(r)

pd.DataFrame(
    data={
        "questionable_quote": non_ovid_generated_quotes,
        "authorship_match_score": scores,
        "is_original": [0, 0, 0, 1, 0, 1],
    }
)

In [None]:
scored_quotes_df[scored_quotes_df["score"].astype(float) < 0.5]["quote"].tolist()

In [None]:
from itertools import product

# Compare to quotes
generated_quotes = scored_quotes_df[scored_quotes_df["score"].astype(float) > 0.5][
    "quote"
].tolist()

# Read in existing quotes
df = pd.read_json("./../app/ovid_quotes.json")

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

# Process each quote and calculate similarity scores
similarity_data = []

for generated_quote, actual_quote in product(generated_quotes, df["Quote"]):
    # Process the quotes with spaCy
    doc_generated = nlp(generated_quote)
    doc_actual = nlp(actual_quote)

    # Calculate similarity score
    similarity_score = doc_generated.similarity(doc_actual)

    # Append data to the list
    similarity_data.append(
        {
            "generated_quotes": generated_quote,
            "actual_quotes": actual_quote,
            "similarity_score": similarity_score,
        }
    )

# Create a DataFrame from the similarity data
similarity_df = pd.DataFrame(similarity_data).sort_values(
    "similarity_score", ascending=False
)
display(similarity_df.head(2))

# Seems like there might be a magic number similarity score < 0.65
print("unique quutes")

similiar_quotes = similarity_df[similarity_df["similarity_score"] > 0.65][
    "generated_quotes"
].unique()

# Filter out the similar quotes
new_quotes = similarity_df[~similarity_df["generated_quotes"].isin(similiar_quotes)][
    "generated_quotes"
].unique()

# Get the works for the new quotes
new_quote_works = []
for q in new_quotes:
    for quote, work in quotes:
        if q == quote:
            new_quote_works.append(work)
df_new_quotes = pd.DataFrame(
    list(zip(*(new_quotes, new_quote_works))), columns=["Quote", "Work"]
)
df_new_quotes["Quote in Latin"] = None
df_new_quotes

In [None]:
similiar_quotes

In [None]:
print("highest score comparison:")
idx = 54
print("GENERATED:", similarity_df.iloc[54]["generated_quotes"])
print("ACTUAL:", similarity_df.iloc[54]["actual_quotes"])

### Storage of Additional Quotes

In [None]:

# https://colab.research.google.com/drive/1Vu8PjdzdnQAebQKFxDMc5cDdLFyF64mJ#scrollTo=20zGRG3Jlrmg

In [None]:
# After quotes works have passed the assessment bars above
df = pd.concat([df, df_new_quotes]).reset_index(drop=True)
df.to_json("./../ovid_quotes.json")

## Fine-Tuning

**Format:** Example data from OpenAI for .jsonl
```jsonl
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
            {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", 
            "content": "Paris, as if everyone doesn't know that already."}]}
```

## Dual-Encoder

In [32]:
import pandas as pd
from tensorflow.keras.layers import Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
import nltk
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate

stopwords = nltk.corpus.stopwords.words("english")

In [39]:
# Preprocess data
def preprocess_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [word for word in text if word not in stopwords]
    return " ".join(text)


similarity_df["actual_quotes"] = similarity_df["actual_quotes"].apply(preprocess_text)
similarity_df["new_quotes"] = similarity_df["new_quotes"].apply(preprocess_text)

# Define vocabulary size and embedding dimension
max_features = 10000
embedding_dim = 128
max_sequence_length = 100

# Create embedding layers
embedding_layer = Embedding(
    max_features, embedding_dim, input_length=max_sequence_length
)

In [33]:
# Define and build the model
def build_model():
    quote_input = Input(shape=(max_sequence_length,))
    new_quote_input = Input(shape=(max_sequence_length,))

    embedded_quote = embedding_layer(quote_input)
    embedded_new_quote = embedding_layer(new_quote_input)

    encoded_quote = LSTM(units=64)(embedded_quote)
    encoded_new_quote = LSTM(units=64)(embedded_new_quote)

    merged = Concatenate()([encoded_quote, encoded_new_quote])
    output = Dense(1, activation="softmax")(merged)

    model = Model(inputs=[quote_input, new_quote_input], outputs=output)
    model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return model


model = build_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 100, 128)             1280000   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 lstm (LSTM)                 (None, 64)                   49408     ['embedding[0][0]']       