### Necessary scraping functions

In [None]:
import requests
from bs4 import BeautifulSoup

########################################################
#### This function scrapes the text content of the page at the given url
#### input: url
#### output: content (type: str), or None if the request does not return status code 200
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_content(url, max_paragraphs=3, timeout=10):
    """Download a web page and return the text of its leading ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        max_paragraphs: Maximum number of ``<p>`` elements to include, counted
            from the top of the document. Defaults to 3, which yields a short
            excerpt; pass ``None`` to return the text of every paragraph.
            Expected to be non-negative (it is used as a slice bound).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A string in which each selected paragraph's text is preceded by a
        single space (so a non-empty result starts with a space), or an empty
        string when the page has no ``<p>`` elements. Returns ``None`` when
        the response status code is not 200.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            on connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and signalled to the caller as None.
    if response.status_code != 200:
        print("Failed to retrieve content. Status code:", response.status_code)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')

    # A slice bound of None keeps every paragraph, so the same expression
    # serves both the short excerpt and the full-content case.
    selected = paragraphs[:max_paragraphs]

    # Keep the " " prefix before every paragraph so the returned string is
    # identical to what the incremental concatenation used to produce.
    return "".join(" " + paragraph.text for paragraph in selected)
    
########################################################
#### This function scrapes the titles, links, and dates of the first 20 articles from Google News for the given query
#### input: query
#### output: a list of dicts, each with the keys 'title', 'link', and 'date'
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_google_news(query, max_results=20, timeout=10):
    """Search Google News for *query* and collect title, link and date per hit.

    Args:
        query: Search terms. They are passed to ``requests`` as a query
            parameter, so spaces and reserved characters such as ``&`` or
            ``#`` are percent-encoded rather than corrupting the URL.
        max_results: Upper bound on the number of search-result elements that
            are examined. Defaults to 20.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and ``'date'``.
        ``'link'`` is the ``href`` of the first anchor in the result element,
        exactly as it appears in the page. ``'date'`` is the first 10
        characters of the matching ``<time>`` element's ``datetime``
        attribute (presumably ``YYYY-MM-DD`` -- confirm against the live
        page), or ``None`` when no such element or attribute exists.
        The response status code is not checked; a page without matching
        elements simply produces an empty list.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            on connection problems or when the timeout expires.
    """
    # Let requests build the query string so the search terms are encoded.
    response = requests.get(
        "https://news.google.com/search",
        params={'q': query},
        timeout=timeout,
    )

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): these class names are taken from the page markup at the
    # time of writing and will silently match nothing if Google changes them.
    search_results = soup.find_all('div', class_='IL9Cne')
    date_elements = soup.find_all('time', class_='hvbAAd')

    scrap = []
    for i, result in enumerate(search_results[:max_results]):
        title_anchor = result.find('a', class_='JtKRv')
        if title_anchor is None:
            # Result element without a title anchor: skip it instead of
            # crashing with AttributeError on ``None.text``.
            continue

        # The title anchor exists, so the element has at least one <a>.
        link = result.find('a').get('href')

        # Dates are paired with results by position; there may be fewer
        # <time> elements than results, so guard the lookup.
        date = None
        if i < len(date_elements):
            timestamp = date_elements[i].get('datetime')
            if timestamp is not None:
                date = timestamp[:10]

        scrap.append({'title': title_anchor.text, 'link': link, 'date': date})
    return scrap

import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrapes the titles, links, and dates of the first 20 articles from Yahoo News for the given query
#### input: query
#### output: a list of dicts, each with the keys 'title', 'link', and 'date'
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo News are in a different form.
# They are only given as relative times such as "1 month ago" or "3 hours ago", not in YYYY-MM-DD form
########################################################

def scrape_yahoo_news(query, max_results=20, timeout=10):
    """Search Yahoo News for *query* and collect title, link and date per hit.

    Args:
        query: Search terms. They are passed to ``requests`` as a query
            parameter, so spaces and reserved characters such as ``&`` or
            ``#`` are percent-encoded rather than corrupting the URL.
        max_results: Upper bound on the number of search-result elements that
            are examined. Defaults to 20.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and ``'date'``.
        ``'title'`` is the text of the result's ``<h4>``, ``'link'`` the
        ``href`` of its first anchor. ``'date'`` is the text of the matching
        time ``<span>`` with its first two characters removed (presumably a
        leading separator -- confirm against the live page); it is a relative
        phrase such as "3 hours ago", not a calendar date, and is ``None``
        when no matching span exists.
        The response status code is not checked; a page without matching
        elements simply produces an empty list.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            on connection problems or when the timeout expires.
    """
    # Let requests build the query string so the search terms are encoded.
    response = requests.get(
        "https://news.search.yahoo.com/search",
        params={'p': query},
        timeout=timeout,
    )

    soup = BeautifulSoup(response.text, 'html.parser')

    search_results = soup.find_all('div', class_='NewsArticle')
    date_elements = soup.find_all('span', class_='fc-2nd s-time mr-8')

    scrap = []
    for i, result in enumerate(search_results[:max_results]):
        heading = result.find('h4')
        anchor = result.find('a')
        if heading is None or anchor is None:
            # Incomplete result element: skip it instead of crashing on
            # ``None.text`` or ``None['href']``.
            continue

        # Dates are paired with results by position; there may be fewer
        # time spans than results, so guard the lookup.
        date = date_elements[i].text[2:] if i < len(date_elements) else None

        scrap.append({
            'title': heading.text,
            'link': anchor.get('href'),
            'date': date,
        })
    return scrap


### Read queries from the excel file

In [None]:
import pandas as pd

# Load the workbook of sample prompts; the 'Prompt' column holds one search
# query per row.
excel_data = pd.read_excel('PIMS Sample Prompts.xlsx')

# Take the whole column in one step instead of looping over rows with
# iterrows(); the result is the same list, in row order.
queries = excel_data['Prompt'].tolist()

### Read preselected articles

In [None]:
import pandas as pd

# Load the preselected article URLs; each row has a 'url' and a 'relevance'
# column.
url_data = pd.read_excel('sample_url.xlsx')

# A row is relevant only when its 'relevance' value equals 1. Every other
# value (including a missing one) falls into the irrelevant list, which is
# what the row-by-row if/else did. The mask replaces the iterrows() loop.
is_relevant = url_data['relevance'] == 1
relevant_list = url_data.loc[is_relevant, 'url'].tolist()
irrelevant_list = url_data.loc[~is_relevant, 'url'].tolist()

#### What we want to do:
We want to train a model on the preselected articles so that we can evaluate how informative a new article is. For the evaluation, we want to extract the following information:

* What was the violation
    - What kind of fish
    - misreported / underreported
* Where the incident happened
* When the incident happened
* Names of people involved
* Type of vessels involved
* 
* 
* (anything else?)

I'll start with the first three.

* How can a model answer this question?
* How should we score each question?

#### What we want to do is to evaluate how much information the article has