Notes:
* Web scraping is not allowed for Lonely Planet
* Implement parallel processing?

# 0. Import Libraries and Dependencies

In [1]:
#!pip install transformers

In [2]:
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
import wikipedia
import docx
import pandas as pd

import gspread
from oauth2client.service_account import ServiceAccountCredentials
import json
import gspread_dataframe as gd

from googlesearch import search

import multiprocessing as mp

  from .autonotebook import tqdm as notebook_tqdm


# 1. Setting up the Model

In [3]:
# Initiate list of landmarks
landmark_list = ['Habbous', 'Morocco Mall', 'Old Medina', 'La Corniche', 'Casa Port Train Station', 'Ain Diab', 'Central Marketplace', 'Tamaris Aquaparc', 'Anfaplace Mall', 'Tachfine Center', 'Marina Shopping Center', 'Derb Ghallef', 'City Hall' , 'Casablanca Grand Theater']

city = 'Casablanca'

# Initiate basic summary pipeline
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


# 2A. Get TripAdvisor Text

In [4]:
def find_TA_url(landmark, city):
    ''' 
    Run Google search to try and find TripAdvisor URL for the landmark
    '''
    query = 'TripAdvisor ' + landmark + ' ' + city # Set query text to search based on input name

    my_results_list = []
    # Check the first 25 search results for url from tripadvisor.com (so it is in English)
    for i in search(query, lang = 'en', tld = 'com', start = 1, stop = 25):
        if 'tripadvisor.com' in i:
            my_results_list.append(i)
    
    if len(my_results_list) > 0:
        top_result = my_results_list[0] # Assume first match is what we are looking for
    else:
        print(f"No TripAdvisor site found for {landmark}")
        top_result = False
    
    return top_result, my_results_list

In [5]:
def get_TA_text (url):
    ''' 
    Get TripAdvisor description based on a provided URL
    '''
    headers = {'User-Agent': "Mozilla/5.0"} # Use alternative header so that TA doesn't block the scrape request
    r = requests.get(url, headers = headers, timeout = 10) 
    soup = BeautifulSoup(r.text, 'html.parser') # Parse entire webpage
    
    if ('Hotel_Review' in url):
        results = soup.find_all("div", class_="fIrGe _T") # Find class associated with location description
        text = [result.text for result in results] # Pull only the text (no html information)
        TA_description = text[0] # Text contained in first list element
    elif ('Attraction_Review' in url):
        results = soup.find_all("div", class_="fIrGe _T bgMZj")
        text = [result.text for result in results] # Pull only the text (no html information)
        TA_description = text[1] # Text contained in first list element
    else: # Sometimes the top option will be a page with only reviews (sometimes TripAdvisor has no About section)
        results = soup.find_all("div", class_="entry")
        text = [result.text for result in results] # Pull only the text (no html information)
        TA_description = ' '.join(text) # Combine the text from all reviews
    
    return TA_description

# 2B. Get Wiki Text

In [102]:
def check_wiki(landmark):
    ''' 
    See possible search results on Wiki
    '''
    wiki_search = wikipedia.search(landmark + city) # Want to make sure we're looking at the right city
    
    if wiki_search[0] == city:
        top_wiki_result = wiki_search[1].title().replace(" ", "") # To have search query work, need to capitalize all first letters and remove spaces
    else:
        top_wiki_result = wiki_search[0].title().replace(" ", "")

    if top_wiki_result != landmark.title().replace(" ", ""): # We need an exact match to be sure we have the right page
        top_wiki_result = None
        print('No exact Wiki match')

    return top_wiki_result, wiki_search

In [103]:
city = 'Casablanca'

check_wiki(landmark_list[0])

No exact Wiki match


(None, ['Hubous'])

In [118]:
for landmark in landmark_list:
    res = wikipedia.search(landmark + city)
    print(landmark + ":")
    print(res)

Habbous:
['Hubous']
Morocco Mall:
['Morocco Mall', 'Casablanca', 'Fnac', 'Marrakesh', 'Salwa Idrissi Akhannouch', 'Belvedere (Casablanca)', 'Economy of Morocco', 'Davide Padoa', 'Next (cigarette)', 'Écoles Belges au Maroc']
Old Medina:
['Casablanca', 'Medina quarter', 'Architecture of Casablanca', 'Tunis', 'Tourism in Morocco', 'Hubous', 'Marrakesh', 'United Nations Square (Casablanca)', 'Rabat', 'Fez, Morocco']
La Corniche:
['Casablanca', 'List of Art Deco architecture in Africa', 'Casablanca Metro', 'Mohammedia', 'Salwa Idrissi Akhannouch', 'Ain Diab', 'Anfa Circuit', 'Zinedine Zidane', 'Marseille', 'Georges Burou']
Casa Port Train Station:
['Casablanca', 'Casa-Port Railway Terminal', 'Casa-Voyageurs Railway Station', 'Casablanca Tramway', 'Oasis railway station', 'ONCF', 'Al Bidaoui', 'Mohammed V International Airport', 'History of Casablanca', 'Train Navette Rapide']
Ain Diab:
['Ain-Diab Circuit', 'Ain Diab', 'Casablanca', 'Casablanca Tramway', 'Moroccan Grand Prix', 'List of Formu

In [8]:
def get_wiki_text(landmark):
    '''
    Grab summary text from the wiki page of the landmark
    '''
    wiki = wikipedia.page(landmark) # Grab the page associated with the landmark
    wiki_description = wiki.summary # Extract the plain text content of the page
    wiki_description = wiki_description.replace("\n", "") # Clean the text

    return wiki_description

We need to chunk the text as the basic summary model we chose can only take a maximum of 500 words as input. Thus, we chunk the text into a list.

In [9]:
def break_into_blocks(text):
    ''' 
    Break text into list of 'blocks' each less than 500 words long
    '''
    max_chunk = 500 # Set max chunk threshold

    # Make end of each sentence consistent
    text = text.replace('.', '.<eos>')
    text = text.replace('?', '?<eos>')
    text = text.replace('!', '!<eos>')

    # Split text and break into chunks where each chunk not more than 500 words
    sentences = text.split('<eos>')
    current_chunk = 0 
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1: 
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            chunks.append(sentence.split(' '))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])

    return chunks

# 4. Summarize Text

In [10]:
def summarize(text, short, long):
    '''
    Summarize input text to create both a short and long summary
    '''
    short_res = summarizer(text, max_length=(short + 25), min_length=(short - 25), do_sample=False)
    short_summary = ' '.join([summ['summary_text'] for summ in short_res]) # Grab relevant text from the dictionary and join chunks

    long_res = summarizer(text, max_length=(long + 25), min_length=(long - 25), do_sample=False)
    long_summary = ' '.join([summ['summary_text'] for summ in long_res]) # Grab relevant text from the dictionary and join chunks

    return short_summary, long_summary

# 5. Bringing it all Together

In [53]:
# Create lists to store the resulting summaries
short_summaries = []
long_summaries = []

def create_summaries(landmark, city, short, long):
    ''' 
    Performing the full summary generation for a given landmark
    '''
    # Find TripAdvisor and Wiki descriptions of the landmark, if available
    TA_url = find_TA_url(landmark, city)[0]
    if TA_url != False:
        TA_text = get_TA_text(TA_url)
    else:
        TA_text = ""
    wiki_search = check_wiki(landmark)[0]
    wiki_text = get_wiki_text(wiki_search)
    
    # Combine descriptions and summarize 
    combined_text = wiki_text + TA_text
    summaries = summarize(combined_text, short, long)
    
    print(f'Summary text for {landmark} is: {summaries[1]}') # Check the long summary output

    # Append results to lists
    short_summaries.append(summaries[0])
    long_summaries.append(summaries[1])

In [56]:
# TESTING

landmark = landmark_list[3]
city = 'Casablanca'

TA_url = find_TA_url(landmark, city)[0]
if TA_url != False:
    TA_text = get_TA_text(TA_url)
else:
    TA_text = ""
wiki_search = check_wiki(landmark)[0]
wiki_text = get_wiki_text(wiki_search)

# Combine descriptions and summarize 
combined_text = wiki_text + TA_text
summaries = summarize(combined_text, 100, 200)

print(f'Summary text for {landmark} is: {summaries[1]}') # Check the long summary output

Summary text for La Corniche is:  The HR manager handled a small accident that happened to me from A to Z...and I love and respect people who are so dedicated and passionate worker. This is a list of buildings that are examples of Art Deco in Africa: Best restaurant, bars and disco in Morocco by far. They treat you like a king even if it's your first time. And that's the best experience you can live in Morocco, cause other restaurants are quiet arrogant with new...clients. The lady manager of the french restaurant is very rude, she doesn’t know what is customer service . La Corniche By Palmeraie is the best restaurant, barman, real Gentlemen. Shoutoutout to the Barman. The restaurant is beautiful and the barman is a real Gentleman. It's a great place to go to the bar. It’s


In [60]:
check_wiki(landmark_list[3])

('ListOfArtDecoArchitectureInAfrica',
 ['Casablanca',
  'List of Art Deco architecture in Africa',
  'Casablanca Metro',
  'Mohammedia',
  'Salwa Idrissi Akhannouch',
  'Ain Diab',
  'Anfa Circuit',
  'Zinedine Zidane',
  'Marseille',
  'Georges Burou'])

In [54]:
# Create summaries for all the landmarks in the list
for landmark in landmark_list:
    print(landmark)
    create_summaries(landmark, city, 100, 200)

Habbous
No TripAdvisor site found for Habbous


Your max_length is set to 225, but you input_length is only 134. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)


Summary text for Habbous is:  The Hubous is one of the older neighborhoods of Casablanca, Morocco . Its development dates back to 1916, in the early stages of the French protectorate . The neighborhood is a cultural and religious center for Morocco and for Morocco . It hosts the Moroccan Ministry of Islamic Affairs as well as bookstores of important Moroccan and Arabic publishing houses . The many traditional and historic buildings also make the Hubous a popular tourist destination, with many traditional buildings also making the neighborhood a tourist destination . It is located in the Moroccan capital, the capital of the Moroccan city of the capital, Casablancas . It was also home to the Moroccan ministry of Islamic affairs and the Moroccan National Institute of Education and Research Institute of Islamic Studies, which was established in the 1930s and founded by the government of the National Geographic Institute of Morocco, which is now based in the 1950s .
Morocco Mall
No TripAdvi

Your max_length is set to 125, but you input_length is only 90. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 225, but you input_length is only 90. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


Summary text for Morocco Mall is:  Morocco Mall is the largest shopping mall in Africa with 190 000m² of floor space in Casablanca, Morocco . Morocco Mall opened on December 1, 2011 . The mall was designed by Davide Padoa of Design International, a global architecture boutique with its headquarters in London . The project site coordination was led by Miguel Fernandes and Catia Zizzi . The Moroccan Mall is located in Morocco's capital city, the capital of the capital city of the Moroccan capital, the city of Marrakerakeel . It is the world's largest mall with 190,000 sq ft of floor-to-floor space in Africa, the largest mall in the world with a total of 190 000 sq sq ft space in the continent . It opened in December 2011. The mall opened in October 2011, the first of its first year of construction in Morocco in the region .
Old Medina
No TripAdvisor site found for Old Medina


Your max_length is set to 225, but you input_length is only 138. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)


Summary text for Old Medina is:  A medina quarter is typically walled, with many narrow and maze-like streets . The word "medina" itself simply means "city" or "town" in modern-day Arabic . It is cognate with the Aramaic-Hebrew word (also called medina) referring to a city or populated area . The medina is a distinct historical city section found in a number of North African cities, and in Malta, and North American cities . A Medina quarter (Arabic:    ) is found in many North African and American cities, often with narrow streets and narrow, maze--like mounds of narrow and narrow streets . A medinama quarter is often walled and walled with narrow, narrow walls and molesoles in the middle of its own streets and is often surrounded by a maze of streets .
La Corniche
Summary text for La Corniche is:  The HR manager handled a small accident that happened to me from A to Z...and I love and respect people who are so dedicated and passionate worker. This is a list of buildings that are examp

Token indices sequence length is longer than the specified maximum sequence length for this model (1173 > 1024). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

In [None]:
# Check the results
print(short_summaries)
print('\n')
print(long_summaries)

# 6. Output to Google Sheets

In [None]:
def gsheet_connect(file_name):
    '''
    Connect to desired Google Sheet
    '''
    JSON_keyfile_path = "/Users/Chris/Desktop/Landmarks/LandmarksKey.json"
    scopes = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']
    
    credentials = ServiceAccountCredentials.from_json_keyfile_name(JSON_keyfile_path, scopes) #access the json key you downloaded earlier 
    file = gspread.authorize(credentials) # authenticate the JSON key with gspread
    gsheet = file.open(file_name)  #open sheet

    print(f"Current worksheets in '{file_name}': " + str(gsheet.worksheets())) # Test print of all worksheets
    return gsheet

gsheet = gsheet_connect('Landmarks Output')

In [None]:
# Toggle to desired sheet
ws = gsheet.sheet1

# Create desired df
landmark_df = ('Landmark': landmark_list,
               'Wiki Short Summary': wiki_short_list,
               'Wiki Long Summary': wiki_long_list,
               'TA Short Summary': TA_short_list,
               'TA Long Summary': TA_long_list,
               'Combined Short Summary': combined_short_list,
               'Combined Long Summary': combined_long_list
               )

# Update sheet with df
ws.update([landmark_df.columns.values.tolist()] + landmark_df.values.tolist())