# Scrap functions

## 1. Scraping content from the given url

In [9]:
import requests
from bs4 import BeautifulSoup

########################################################
#### This function scrap the contents in the url
#### input: url
#### output: content (type: str)
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_content(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the desired information
        # Example: Extracting all paragraphs from the webpage
        paragraphs = soup.find_all('p')
        
        content = ""
        # Print or process the extracted information
#        for paragraph in paragraphs:
#            content += " " + paragraph.text

###### The below is to only return a shorter content. For the full content, use the commented commands above
        for i in range(min(len(paragraphs), 3)):
### For the whole content:
#        for i in range(len(paragraphs)):
            content += " " + paragraphs[i].text
        return content
    else:
        print("Failed to retrieve content. Status code:", response.status_code)
        return None

## 2. Auto scrap of the top 20 search results from different search engines
What we scrap:
* Title
* Link
* Date


###  2-1. Google News (working)

In [16]:
########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from google news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_google_news(query):
    # Construct the Google News URL with the query
    url = f"https://news.google.com/search?q={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='IL9Cne')
    date_elements = soup.find_all('time', class_='hvbAAd')


    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 5)):
        title = search_results[i].find('a', class_ = 'JtKRv').text
        link = search_results[i].find('a')['href']
        link = 'https://news.google.com' + link[1:]
        date = date_elements[i]['datetime'][:10]    
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap

#### 2-1-1. Testing Google scrap algorithm
#### Getting the queries from the excel file

In [17]:
import pandas as pd

# read the excel file
excel_data = pd.read_excel('PIMS Sample Prompts.xlsx')

queries = []
for index, row in excel_data.iterrows():
    # Process each row
    queries.append(row['Prompt'])
    
queries = queries[:2]

#### Testing with queries

In [None]:
### Test with differnet queries
query = input("Enter your search query: ")

# When we want to use the queries from the excel file:
#for query in queries:
#    top_results = scrape_google_news(query)
#    for index, result in enumerate(top_results, start=1):
#        print(f"{index}. {result['title']}")
#        link = "https://news.google.com" + result['link'][1:]
#        print(link)
#        print()
#        print(scrape_content(link))


top_results = scrape_google_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    link = result['link']
#    link = "https://news.google.com" + result['link'][1:]
    print(link)
    print(result['date'])
    print(scrape_content(link))
    print()    

### 2-2. Yahoo News (working)
This is very similar to Google one. I think we can easily produce similar functions for other search engines!

In [18]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Yahoo news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form
########################################################

def scrape_yahoo_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://news.search.yahoo.com/search?p={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='NewsArticle')
    date_elements = soup.find_all('span', class_='fc-2nd s-time mr-8')

    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 5)):
        title = search_results[i].find('h4').text
        link = search_results[i].find('a')['href']
        date = date_elements[i].text[2:]
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap





In [None]:
# Example usage
query = input("Enter your search query: ")
top_results = scrape_yahoo_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print(result['date'])
    print(scrape_content(link))
    print()


In [21]:
search_engines = {
    'Google': scrape_google_news,
    'Yahoo': scrape_yahoo_news,
}

data = []
for query in queries:
    for engine_name, scrape_function in search_engines.items():
        print(f"Scraping {engine_name} for query: {query}")
        results = scrape_function(query)
        
        if results:
            for result in results[:len(results)]:  # Limiting to the top 20 results per engine
                data.append({
                    'Query': query,
                    'Engine': engine_name,
                    'Title': result['title'],
                    'Link': result['link'],
                    'Content': scrape_content(result['link'])
                })
                print(data[len(data)-1])

        else:
            data.append({
                'Query': query,
                'Engine': engine_name,
                'Title': 'No results'
            })
            print(data[len(data)])

        
        time.sleep(1)  # Adding a delay to be respectful of website policies

#### Data frame:
df = pd.DataFrame(data)



Scraping Google for query: Vessel caught misreporting catch amount
{'Query': 'Vessel caught misreporting catch amount', 'Engine': 'Google', 'Title': 'Sea Shepherd Global', 'Link': 'https://news.google.com/articles/CBMiQWh0dHBzOi8vd3d3LnNlYXNoZXBoZXJkZ2xvYmFsLm9yZy9sYXRlc3QtbmV3cy9jb21iYXQtaXV1LWZpc2hpbmcv0gEA?hl=en-CA&gl=CA&ceid=CA%3Aen', 'Content': ' Wednesday, 05 Jun, 2024 Sea Shepherd Global stands at the forefront of the fight against Illegal, Unreported, and Unregulated (IUU) fishing, deploying innovative strategies and international collaborations to protect marine biodiversity.\xa0 Illegal, Unreported, and Unregulated (IUU) fishing refers to fishing activities that do not comply with national, regional, or international fisheries conservation and management laws and regulations. These activities are conducted by vessels in various ways, including:'}
{'Query': 'Vessel caught misreporting catch amount', 'Engine': 'Google', 'Title': 'New rules tighten controls on EU…', 'Link': 'htt

NameError: name 'time' is not defined

### 2-3. Microsoft Bing News

In [None]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Microsoft Bing news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form like Google
########################################################

def scrape_bing_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://www.bing.com/news/search?q={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='news-card')

    # Extract the title and link of each search result
    scrap = []
    for i in range(min(len(search_results), 20)):
        title = search_results[i].find('h4').text
        link = search_results[i].find('a')['href']
        date = date_elements[i].text[2:]
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap



# Example usage
query = input("Enter your search query: ")
top_results = scrape_bing_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print(result['date'])
    print()
