# Scrap functions

## 1. Scraping content from the given url

In [18]:
import requests
from bs4 import BeautifulSoup

########################################################
#### This function scrap the contents in the url
#### input: url
#### output: content (type: str)
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_content(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.text, 'html.parser')
#print(soup)
        # Extract the desired information
        # Example: Extracting all paragraphs from the webpage
        paragraphs = soup.find_all('p')

        content = ""
        # Print or process the extracted information
#        for paragraph in paragraphs:
#            content += " " + paragraph.text

###### The below is to only return a shorter content. For the full content, use the commented commands above
        for i in range(min(len(paragraphs), 10)):
### For the whole content:
#        for i in range(len(paragraphs)):
            content += " " + paragraphs[i].text
        return content
    else:
        print("Failed to retrieve content. Status code:", response.status_code)
        return None
    
scrape_content('https://www.msn.com/en-ca/lifestyle/other/experts-urge-people-to-fish-and-eat-crab-species-putting-entire-fishing-industry-at-risk-an-animal-of-unacceptable-intelligence/ar-BB1na7wc?ocid=BingNewsSearch')

''

## 2. Auto scrap of the top 20 search results from different search engines
Search Engines that we use:
* 2-1. Google (news tab)
* 2-2. Yahoo (news tab)
* 2-3. Bing (news tab)
* 2-4. Maritime Executive (content scraping is not authorized.- producing 403 errors)

What we scrap:
* Title
* Link
* Date


###  2-1. Google search in the news tab (working)

In [41]:
import requests
from bs4 import BeautifulSoup

########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from google news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_google_news(query):
    # Construct the Google News URL with the query
    url = f"https://www.google.com/search?q={query}&tbm=nws"
    print(url)
#    url = f"https://news.google.com/search?q={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='Gx5Zad fP1Qef xpd EtOod pkphOe')
    date_elements = soup.find_all('span', class_='r0bn4c rQMQod')


    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 5)):
        title = search_results[i].find('a').text
        link = search_results[i].find('a')['href']
        #link = 'https://news.google.com' + link[1:]
        date = date_elements[i].text   
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap

#### 2-1-1. Testing Google scrap algorithm
#### Getting the queries from the excel file

In [24]:
import pandas as pd

# read the excel file
excel_data = pd.read_excel('PIMS Sample Prompts.xlsx')

queries = []
for index, row in excel_data.iterrows():
    # Process each row
    queries.append(row['Prompt'])
    
queries = queries[:2]

#### Testing with queries

In [46]:
### Test with differnet queries
query = input("Enter your search query: ")

# When we want to use the queries from the excel file:
#for query in queries:
#    top_results = scrape_google_news(query)
#    for index, result in enumerate(top_results, start=1):
#        print(f"{index}. {result['title']}")
#        link = "https://news.google.com" + result['link'][1:]
#        print(link)
#        print()
#        print(scrape_content(link))


top_results = scrape_google_news(query)

for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    link = result['link'][7:]
    print(link)
    print(result['date'])
    print(scrape_content(link))
    print()    

Enter your search query: fishing
https://www.google.com/search?q=fishing&tbm=nws
1. Inside the Slimy, Smelly, Secretive World of Glass-Eel FishingThe New YorkerEach spring, hundreds of millions of baby eels swarm the waterways of coastal Maine. Soaring global demand incited an era of jackpot payouts...11 hours ago
https://www.newyorker.com/magazine/2024/06/24/inside-the-slimy-smelly-secretive-world-of-glass-eel-fishing&sa=U&ved=2ahUKEwj_tLrLyuOGAxXRCjQIHT7eDMUQxfQBegQIBBAC&usg=AOvVaw06NpDYqEjrhSW4jGt_oa5F
11 hours ago
 Find anything you save across the site in your account  By Paige Williams The Sargasso Sea, a warm, calm expanse of the North Atlantic Ocean, is bordered not by land but by four strong currents—a gyre. Vast mats of prickly brown seaweed float so thickly on the windless surface that Christopher Columbus worried about his ships getting stuck. The biodiverse sanctuary within and beneath the sargassum produces Anguilla rostrata, the American eel. Each female lays some eight 

Failed to retrieve content. Status code: 404
None

5. Man walking his dog stumbles across unusual pits on beach, leading to unexpected discoveryFox NewsBritish officials recently announced the discovery of centuries-old fishing bait tanks that were cut into beachrock. posing a historical and...18 hours ago
https://www.foxnews.com/lifestyle/man-walking-his-dog-stumbles-unusual-pits-beach-leading-unexpected-discovery&sa=U&ved=2ahUKEwj_tLrLyuOGAxXRCjQIHT7eDMUQxfQBegQICBAC&usg=AOvVaw2ZefnrxPOwcNkaJT0GRmgs
18 hours ago
Failed to retrieve content. Status code: 404
None



### 2-2. Yahoo search in the news tab (working)
This is very similar to Google one. I think we can easily produce similar functions for other search engines!

In [None]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Yahoo news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form
########################################################

def scrape_yahoo_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://news.search.yahoo.com/search?p={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='NewsArticle')
    date_elements = soup.find_all('span', class_='fc-2nd s-time mr-8')

    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 5)):
        title = search_results[i].find('h4').text
        link = search_results[i].find('a')['href']
        date = date_elements[i].text[2:]
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap





In [None]:
# Example usage
query = input("Enter your search query: ")
top_results = scrape_yahoo_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print(result['date'])
    ### NOT print(scrape_content(link))
    print(scrape_content(result['link']))
    print()


###### Clotilde's data frame function (copied from Global code)

In [21]:
search_engines = {
    'Google': scrape_google_news,
    'Yahoo': scrape_yahoo_news,
}

data = []
for query in queries:
    for engine_name, scrape_function in search_engines.items():
        print(f"Scraping {engine_name} for query: {query}")
        results = scrape_function(query)
        
        if results:
            for result in results[:len(results)]:  # Limiting to the top 20 results per engine
                data.append({
                    'Query': query,
                    'Engine': engine_name,
                    'Title': result['title'],
                    'Link': result['link'],
                    'Content': scrape_content(result['link'])
                })
                print(data[len(data)-1])

        else:
            data.append({
                'Query': query,
                'Engine': engine_name,
                'Title': 'No results'
            })
            print(data[len(data)])

        
        time.sleep(1)  # Adding a delay to be respectful of website policies

#### Data frame:
df = pd.DataFrame(data)



Scraping Google for query: Vessel caught misreporting catch amount
{'Query': 'Vessel caught misreporting catch amount', 'Engine': 'Google', 'Title': 'Sea Shepherd Global', 'Link': 'https://news.google.com/articles/CBMiQWh0dHBzOi8vd3d3LnNlYXNoZXBoZXJkZ2xvYmFsLm9yZy9sYXRlc3QtbmV3cy9jb21iYXQtaXV1LWZpc2hpbmcv0gEA?hl=en-CA&gl=CA&ceid=CA%3Aen', 'Content': ' Wednesday, 05 Jun, 2024 Sea Shepherd Global stands at the forefront of the fight against Illegal, Unreported, and Unregulated (IUU) fishing, deploying innovative strategies and international collaborations to protect marine biodiversity.\xa0 Illegal, Unreported, and Unregulated (IUU) fishing refers to fishing activities that do not comply with national, regional, or international fisheries conservation and management laws and regulations. These activities are conducted by vessels in various ways, including:'}
{'Query': 'Vessel caught misreporting catch amount', 'Engine': 'Google', 'Title': 'New rules tighten controls on EU…', 'Link': 'htt

NameError: name 'time' is not defined

### 2-3. Microsoft Bing search in news tab

In [19]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Microsoft Bing news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form like Google
########################################################

def scrape_bing_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://www.bing.com/news/search?q={query}"
    print(url)
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='news-card')
    date_elements = soup.find_all('span', tabindex="0")
    
    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 20)):
        title = search_results[i].find('a', class_ = 'title').text
        link = search_results[i].find('a', class_ = 'title')['href']
        date = date_elements[i]['aria-label']
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap



# Example usage
query = input("Enter your search query: ")
top_results = scrape_bing_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print(result['date'])
    print(scrape_content(result['link']))
    print()


Enter your search query: fishing
https://www.bing.com/news/search?q=fishing
1. Experts urge people to fish and eat crab species putting entire fishing industry at risk: 'An animal of unacceptable intelligence'
https://www.msn.com/en-ca/lifestyle/other/experts-urge-people-to-fish-and-eat-crab-species-putting-entire-fishing-industry-at-risk-an-animal-of-unacceptable-intelligence/ar-BB1na7wc?ocid=BingNewsSearch
1 hour ago


2. Video shows the Houthis attack a merchant ship with a naval drone seemingly disguised as a slow fishing boat
https://www.msn.com/en-us/news/world/video-shows-the-houthis-attack-a-merchant-ship-with-a-naval-drone-seemingly-disguised-as-a-slow-fishing-boat/ar-BB1ooEEY?ocid=BingNewsSearch
1 hour ago


3. Commercial lobster sector concerned about out-of-season fishing in St. Marys Bay
https://www.msn.com/en-ca/news/other/commercial-lobster-sector-concerned-about-out-of-season-fishing-in-st-marys-bay/ar-BB1oav0d?ocid=BingNewsSearch
4 days ago


4. Free Fishing For Father

### 2-4 Maritime Executive search
The content scraping is not authorized (403 error). Need to set up an API or header.

In [None]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Microsoft Bing news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form like Google
########################################################

def scrape_maritime_executive(query):
    # Construct the Yahoo News URL with the query
    url = f"https://www.maritime-executive.com/search?key={query}"
    print(url)
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)
    # Find all the search result elements
    search_results = soup.find_all('div', class_='desc body no-padding-xs')
#    date_elements = soup.find_all('span', tabindex="0")
    
    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 20)):
        title = search_results[i].find('a', class_ = "font-firasans").text
        title = title.strip()
        link = search_results[i].find('a', class_ = "font-firasans")['href']
        date = 0
 #       date = search_results[i].find('p', class_ = "summary")
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap



# Example usage
query = input("Enter your search query: ")
top_results = scrape_maritime_executive(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
#    print(result['date'])
    print(scrape_content(result['link']))
    print()
