In [None]:
## Project: scrape tour itineraries from explore.co.uk
# Use the scraped text to recommend a tour that matches a user's request

In [None]:
# start the llm
#> ollama run llama3

In [None]:
# import libraries

In [None]:
import json
import os
import string

import bs4
import gensim
import gensim.downloader as api
import html5lib
import nltk
import numpy
import requests
import scipy
import spacy
import transformers
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizer data needed by word_tokenize. Newer NLTK releases look up
# 'punkt_tab' rather than 'punkt', so request both; a resource that is already
# installed (or unknown to an older NLTK) is reported and skipped, not raised.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
## define functions to be used

In [None]:
# functions for calculating similarities

In [None]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity between the word sets of two texts.

    Both texts are lower-cased and split on any run of whitespace, so repeated
    spaces, tabs and newlines never produce empty tokens that would count as a
    shared "word".

    Args:
        query: The user's request as a string.
        document: The text to compare the query against.

    Returns:
        ``len(intersection) / len(union)`` of the two word sets as a float in
        [0, 1]; 0.0 when neither text contains a word.
    """
    query_words = set(query.lower().split())
    document_words = set(document.lower().split())
    union = query_words | document_words
    if not union:
        # Both texts are empty or whitespace-only: there is nothing to compare.
        return 0.0
    return len(query_words & document_words) / len(union)

In [None]:
# Shared Porter stemmer used by stem_tokens.
stemmer = nltk.stem.porter.PorterStemmer()
# str.translate table that maps every character of string.punctuation to None
# (i.e. deletes it).
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Stem every token with the module-level ``stemmer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with the stemmed form of each token, in the same order.
    """
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Lowercase ``text``, remove punctuation, tokenize it and stem the tokens.

    Args:
        text: The raw string to normalise.

    Returns:
        The list of stemmed tokens produced by ``stem_tokens``.
    """
    without_punctuation = text.lower().translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

# Shared TF-IDF vectorizer; cosine_sim re-fits it on every pair of texts.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    The module-level ``vectorizer`` is fitted on just these two texts, so the
    IDF weights reflect only this pair.

    Args:
        text1: First text (e.g. the user query).
        text2: Second text (e.g. a tour itinerary).

    Returns:
        The similarity of ``text1`` and ``text2`` as a float.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # TfidfVectorizer L2-normalises each row by default, so the dot product of
    # two rows is their cosine similarity. Use `@` and .toarray() rather than
    # `*` and `.A`: the `.A` shortcut has been removed from recent SciPy
    # releases, and `*` only means matrix product for sparse *matrices*.
    pairwise = (tfidf @ tfidf.T).toarray()
    return pairwise[0, 1]

In [None]:
def gensimy(text1, corpus_of_documents):
    """Score a query against every document with a gensim TF-IDF index.

    Unlike the pairwise similarity functions, this takes the whole corpus and
    scores all documents in a single call.

    Args:
        text1: The query string.
        corpus_of_documents: Iterable of document strings (the full corpus,
            not a single document).

    Returns:
        The result of querying the gensim similarity index with the query:
        one similarity per document, in corpus order.

    Raises:
        TypeError: If ``corpus_of_documents`` is a single string, which would
            otherwise be silently treated as a corpus of one-character
            documents.
    """
    if isinstance(corpus_of_documents, str):
        raise TypeError(
            "corpus_of_documents must be a collection of documents, "
            "not a single string"
        )

    gen_docs = [[w.lower() for w in word_tokenize(text)]
                for text in corpus_of_documents]
    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)

    # The index writes its shard files under the 'workdir/' prefix, so the
    # directory has to exist before the index is queried.
    os.makedirs('workdir', exist_ok=True)
    sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus],
                                          num_features=len(dictionary))

    query_doc = [w.lower() for w in word_tokenize(text1)]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    return sims[query_doc_tf_idf]

In [None]:
# function to return the index and text of the document most similar to the query

In [None]:
def return_response(sim_func, query, corpus):
    """Return the index and text of the document most similar to the query.

    Args:
        sim_func: Callable ``sim_func(query, document)`` that returns a single
            comparable score for ONE document (e.g. ``jaccard_similarity`` or
            ``cosine_sim``). Functions that score a whole corpus at once are
            not suitable here.
        query: The user's request as a string.
        corpus: Iterable of document strings to search.

    Returns:
        A tuple ``(tour_id, document)`` where ``tour_id`` is the position of
        the highest-scoring document in ``corpus`` (the first one on ties) and
        ``document`` is its text.

    Raises:
        ValueError: If ``corpus`` contains no documents.
    """
    # Materialise once so generators work and indexing below is safe.
    documents = list(corpus)
    if not documents:
        raise ValueError("corpus must contain at least one document")

    # Score against the arguments that were passed in, not notebook globals.
    similarities = [sim_func(query, doc) for doc in documents]
    tour_id = similarities.index(max(similarities))
    return tour_id, documents[tour_id]

In [None]:
# read in a list of urls to scrape the tour details from

In [None]:
# Tour pages whose itineraries are scraped below, one URL per line.
tour_urls = [
    'https://www.explore.co.uk/holidays/machu-picchu-trek',
    'https://www.explore.co.uk/holidays/vietnam-historic-tour',
    'https://www.explore.co.uk/holidays/cycling-holiday-morocco',
    'https://www.explore.co.uk/holidays/taste-of-japan-tokyo-kyoto-osaka',
    'https://www.explore.co.uk/holidays/india-tiger-safari',
    'https://www.explore.co.uk/holidays/family-costa-rica-highlights',
    'https://www.explore.co.uk/holidays/central-american-highlights',
    'https://www.explore.co.uk/holidays/costa-rica-wildlife-tour',
    'https://www.explore.co.uk/holidays/vietnam-adventure-tour',
    'https://www.explore.co.uk/holidays/vietnam-walking-holiday',
    'https://www.explore.co.uk/holidays/peru-amazon-extension',
]

In [None]:
# iterate through the list and get the itinerary for each tour with the title

In [None]:
# `tours` holds one page title per URL and `contents` the matching list of
# itinerary paragraphs; the two lists are index-aligned with `tour_urls`.
contents = []
tours = []
for url in tour_urls:
    # Fail fast on network problems instead of hanging or parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html5lib')

    # Use the page <title>, flattened to one line, as the tour name; fall back
    # to the URL if the page has no title.
    title = soup.title.get_text() if soup.title else url
    tours.append(title.replace('\n', ' ').replace('\t', ''))

    # Itinerary text is taken from the 'pr-i-desc' divs inside the div with
    # id 'itinerary'; a page without that container yields no rows.
    table = soup.find('div', attrs={'id': 'itinerary'})
    rows = table.find_all('div', attrs={'class': 'pr-i-desc'}) if table else []

    # Text of the first <p> of each row; rows without a <p> are skipped.
    itinerary = [row.p.get_text() for row in rows if row.p]

    # Always append (even when empty) so `contents` stays aligned with `tours`.
    contents.append(itinerary)

In [None]:
# create a list of details

In [None]:
# One document per tour: its itinerary paragraphs joined with single spaces.
corpus_of_documents = [' '.join(paragraphs) for paragraphs in contents]

In [None]:
# define input

In [None]:
# Example free-text request; used below both to retrieve a tour and inside the
# LLM prompt.
user_input = "I'd like to visit South America and visit Machu Picchu"

In [None]:
# find the relevant document

In [None]:
# Retrieve the best match using Jaccard (word-overlap) similarity.
# NOTE: the following cells assign the same two names again, so whichever
# retrieval cell ran last decides what is sent to the LLM.
relevant_tour, relevant_document = return_response(jaccard_similarity, user_input, corpus_of_documents)

In [None]:
# Retrieve the best match using TF-IDF cosine similarity (overwrites the
# result of the previous retrieval cell).
relevant_tour, relevant_document = return_response(cosine_sim, user_input, corpus_of_documents)

In [None]:
# gensimy scores the whole corpus in one call, so it cannot be handed to
# return_response (which calls its sim_func once per document). Take the
# arg-max of the returned scores directly instead.
gensim_scores = gensimy(user_input, corpus_of_documents)
relevant_tour = int(numpy.argmax(gensim_scores))
relevant_document = corpus_of_documents[relevant_tour]

In [None]:
# create prompt

In [None]:
# Prompt template for the LLM; the {relevant_document} and {user_input}
# placeholders are filled in with str.format when the request is built.
prompt = """
You are a bot that makes recommendations for holidays. You answer with facts, highlighting pros and cons.
This is the recommended holiday: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

In [None]:
# setup llama3: send the filled-in prompt to the local generate endpoint and
# keep the connection open so the answer can be read as it streams in.
url = 'http://localhost:11434/api/generate'
data = {
    "model": "llama3",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}
headers = {'Content-Type': 'application/json'}
# timeout=(connect, read) in seconds: fail quickly if the server is not
# running, but allow long pauses while the model loads and generates.
response = requests.post(url, data=json.dumps(data), headers=headers,
                         stream=True, timeout=(5, 300))
# Surface HTTP errors here rather than as a confusing failure while parsing
# the stream later.
response.raise_for_status()

In [None]:
# generate and return the response

In [None]:
# Collect the streamed answer: each non-empty line of the response body is
# decoded as a JSON object and its 'response' text is appended in order.
full_response = []
try:
    for line in response.iter_lines():
        # filter out keep-alive new lines
        if not line:
            continue
        decoded_line = json.loads(line.decode('utf-8'))
        # Report a server-side error explicitly instead of failing with a
        # KeyError on the missing 'response' field.
        if 'error' in decoded_line:
            raise RuntimeError(
                "LLM server returned an error: {}".format(decoded_line['error'])
            )
        full_response.append(decoded_line.get('response', ''))
finally:
    # Always release the streamed connection, even if parsing fails.
    response.close()

In [None]:
# Show the scraped page title of the selected tour followed by the LLM's
# complete answer.
print(tours[relevant_tour], ''.join(full_response))

In [None]:
def sim_func1(query, corpus_of_documents):
    """Return the Jaccard word-set similarity of a query to each document.

    Texts are lower-cased and split on any run of whitespace, so repeated
    spaces, tabs and newlines never create empty tokens that would count as a
    shared "word".

    Args:
        query: The user's request as a string.
        corpus_of_documents: Iterable of document strings.

    Returns:
        A list with one float in [0, 1] per document, in corpus order. A score
        is 0.0 when neither the query nor the document contains a word.
    """
    # The query's word set does not change between documents; build it once.
    query_words = set(query.lower().split())
    sims = []
    for document in corpus_of_documents:
        document_words = set(document.lower().split())
        union = query_words | document_words
        if union:
            sims.append(len(query_words & document_words) / len(union))
        else:
            # Both texts are empty: avoid dividing by zero.
            sims.append(0.0)
    return sims

In [None]:
def sim_func2(query, corpus_of_documents):
    """Score a query against every document with a gensim TF-IDF index.

    Args:
        query: The query string.
        corpus_of_documents: Iterable of document strings (the full corpus,
            not a single document).

    Returns:
        The result of querying the gensim similarity index with the query:
        one similarity per document, in corpus order.

    Raises:
        TypeError: If ``corpus_of_documents`` is a single string, which would
            otherwise be silently treated as a corpus of one-character
            documents.
    """
    if isinstance(corpus_of_documents, str):
        raise TypeError(
            "corpus_of_documents must be a collection of documents, "
            "not a single string"
        )

    gen_docs = [[w.lower() for w in word_tokenize(text)]
                for text in corpus_of_documents]
    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)

    # The index writes its shard files under the 'workdir/' prefix, so the
    # directory has to exist before the index is queried.
    os.makedirs('workdir', exist_ok=True)
    sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus],
                                          num_features=len(dictionary))

    query_doc = [w.lower() for w in word_tokenize(query)]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    return sims[query_doc_tf_idf]

In [None]:
def sim_func3(query, corpus_of_documents):
    """Return the spaCy ``Doc.similarity`` of a query to each document.

    Args:
        query: The user's request as a string.
        corpus_of_documents: Iterable of document strings.

    Returns:
        A list with one similarity score per document, in corpus order; an
        empty list for an empty corpus (the model is not loaded in that case).
    """
    documents = list(corpus_of_documents)
    if not documents:
        return []

    # NOTE(review): 'en_core_web_sm' is the small English pipeline; confirm it
    # provides vectors good enough for meaningful similarity scores.
    nlp = spacy.load('en_core_web_sm')
    # Parse the query once, under its own name, instead of re-parsing (and
    # rebinding) it on every loop iteration.
    query_doc = nlp(query)
    return [query_doc.similarity(nlp(document)) for document in documents]

In [None]:
def sim_func4(query, corpus_of_documents):
    """Return the TF-IDF cosine similarity of a query to each document.

    The vectorizer is fitted once on the query plus all documents. Text is
    lower-cased, stripped of punctuation, tokenized and stemmed before
    vectorizing.

    Args:
        query: The user's request as a string.
        corpus_of_documents: Iterable of document strings. A single string is
            also accepted and treated as one document.

    Returns:
        A list with one similarity per document, in corpus order (empty for an
        empty corpus). When a single string was passed, the one score is
        returned on its own rather than in a list.
    """
    single_document = isinstance(corpus_of_documents, str)
    documents = [corpus_of_documents] if single_document else list(corpus_of_documents)
    if not documents:
        return []

    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(tokens):
        """Stem every token with the Porter stemmer."""
        return [stemmer.stem(item) for item in tokens]

    def normalize(text):
        """Remove punctuation, lowercase, tokenize and stem."""
        return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

    # Row 0 is the query, rows 1..n are the documents. Each text must be its
    # own list entry; passing the corpus list as a single "document" fails.
    tfidf = vectorizer.fit_transform([query] + documents)
    # Rows are L2-normalised by TfidfVectorizer's default, so the dot product
    # is the cosine similarity. `@` / .toarray() avoid the `.A` shortcut that
    # recent SciPy releases removed.
    sims = (tfidf[0] @ tfidf[1:].T).toarray().ravel()
    return sims[0] if single_document else sims.tolist()

In [None]:
# Per-document Jaccard scores for the example query; the returned list is the
# cell's output.
sim_func1(user_input, corpus_of_documents)

In [None]:
# Per-document gensim TF-IDF similarities for the example query.
sim_func2(user_input, corpus_of_documents)

In [None]:
# Per-document spaCy Doc.similarity scores for the example query.
sim_func3(user_input, corpus_of_documents)

In [None]:
# Take the second tour's joined itinerary text (index 1) as a sample document.
document = corpus_of_documents[1]

In [None]:
# `nlp` is otherwise only a local variable inside sim_func3, so load the spaCy
# pipeline here before parsing the sample document.
nlp = spacy.load('en_core_web_sm')
test1 = nlp(document)

In [None]:
# close llama3
#> /bye