# Scrap algorithms

## 1. Scraping content from the given url

In [100]:
import requests
from bs4 import BeautifulSoup

########################################################
#### This function scrap the contents in the url
#### input: url
#### output: content (type: str)
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_content(url):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the desired information
        # Example: Extracting all paragraphs from the webpage
        paragraphs = soup.find_all('p')
        
        content = ""
        # Print or process the extracted information
#        for paragraph in paragraphs:
#            content += " " + paragraph.text

###### The below is to only return a shorter content. For the full content, use the commented commands above
        for i in range(min(len(paragraphs), 3)):
            content += " " + paragraphs[i].text
        return content
    else:
        print("Failed to retrieve content. Status code:", response.status_code)
        return None

## 2. Auto scrap of the top n search results from different search engines
What we scrap:
* Title
* Link
* Date


In [101]:
# n can be changed to test small examples
n = 20
# For some search engines, it gives less than 20 articles in one page. In that case, it only scraps the first page for now.

###  2-1. Google News (working)

In [102]:
########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from google news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
########################################################

def scrape_google_news(query):
    # Construct the Google News URL with the query
    url = f"https://news.google.com/search?q={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='IL9Cne')
    date_elements = soup.find_all('time', class_='hvbAAd')


    # Extract the title and link of each search result
    scrap = []
    for i in range(20):
        title = search_results[i].find('a', class_ = 'JtKRv').text
        link = search_results[i].find('a')['href']
        date = date_elements[i]['datetime'][:10]    
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap

#### 2-1-1. Testing Google scrap algorithm
#### Getting the queries from the excel file

In [None]:
import pandas as pd

# read the excel file
excel_data = pd.read_excel('PIMS Sample Prompts.xlsx')

queries = []
for index, row in excel_data.iterrows():
    # Process each row
    queries.append(row['Prompt'])

#### Testing with queries

In [103]:
### Test with differnet queries
query = input("Enter your search query: ")

# When we want to use the queries from the excel file:
#for query in queries:
#    top_results = scrape_google_news(query)
#    for index, result in enumerate(top_results, start=1):
#        print(f"{index}. {result['title']}")
#        link = "https://news.google.com" + result['link'][1:]
#        print(link)
#        print()
#        print(scrape_content(link))


top_results = scrape_google_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    link = "https://news.google.com" + result['link'][1:]
    print(link)
    print(result['date'])
    print(scrape_content(link))
    print()    

Enter your search query: fish
1. Freaky Oregon Fish Is Bigger Than Shaquille O'Neal - Videos from The Weather Channel
https://news.google.com/articles/CBMiZWh0dHBzOi8vd2VhdGhlci5jb20vbmF0dXJlL3dpbGQtYW5pbWFscy92aWRlby9yYXJlbHktaWRlbnRpZmllZC1uZXdseS1kaXNjb3ZlcmVkLWZpc2gtZm91bmQtaW4tb3JlZ29u0gEA?hl=en-CA&gl=CA&ceid=CA%3Aen
2024-06-11
 June 13, 2024 This type of sunfish was only recently discovered to even exist. Now, scientists say one of the largest examples of the rarely identified species has washed up on an Oregon beach. Now Playing

2. Rare 7-foot fish washed ashore on Oregon's coast garners worldwide attention
https://news.google.com/articles/CBMiXWh0dHBzOi8vYWJjbmV3cy5nby5jb20vV2VpcmQvd2lyZVN0b3J5L3JhcmUtNy1mb290LWZpc2gtd2FzaGVkLWFzaG9yZS1vcmVnb25zLWNvYXN0LTExMDkzODgyMdIBYWh0dHBzOi8vYWJjbmV3cy5nby5jb20vYW1wL1dlaXJkL3dpcmVTdG9yeS9yYXJlLTctZm9vdC1maXNoLXdhc2hlZC1hc2hvcmUtb3JlZ29ucy1jb2FzdC0xMTA5Mzg4MjE?hl=en-CA&gl=CA&ceid=CA%3Aen
2024-06-07
 A massive rare fish thought to only live

 Newsletters Newsletters 

13. Sault Tribe holds first ever open house at Walleye Fish Hatchery
https://news.google.com/articles/CBMiZ2h0dHBzOi8vd3d3LnNvb2xlYWRlci5jb20vYm9sZC9zYXVsdC10cmliZS1ob2xkcy1maXJzdC1ldmVyLW9wZW4taG91c2UtYXQtd2FsbGV5ZS1maXNoLWhhdGNoZXJ5LTkwNzg3MznSAQA?hl=en-CA&gl=CA&ceid=CA%3Aen
2024-06-13
 Sign In Register For the first time in its history, the Sault Ste. Marie Tribe of Chippewa Indians opened its walleye hatchery to the public in an open house Wednesday evening.

14. Free family fishing this weekend for Father's Day in Ontario
https://news.google.com/articles/CBMiigFodHRwczovL3d3dy50b3JvbnRvLmNvbS9uZXdzL2ZyZWUtZmFtaWx5LWZpc2hpbmctdGhpcy13ZWVrZW5kLWZvci1mYXRoZXJzLWRheS1pbi1vbnRhcmlvL2FydGljbGVfNDUyMWJjNjYtNDdhNy01YTkyLTljZDItZGFmNGUzZGM1NmJlLmh0bWzSAQA?hl=en-CA&gl=CA&ceid=CA%3Aen
2024-06-13
  This Father's Day weekend, the province is hosting a free family fishing event June 15 and 16. If you don't have big plans this weekend, take time to drop a line, with an

### 2-2. Yahoo News (working)
This is very similar to Google one. I think we can easily produce similar functions for other search engines!

In [105]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Yahoo news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form
########################################################

def scrape_yahoo_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://news.search.yahoo.com/search?p={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='NewsArticle')
    date_elements = soup.find_all('span', class_='fc-2nd s-time mr-8')

    # Extract the title and link of each search result
    scrap = []
    for i in range(len(search_results)):
        title = search_results[i].find('h4').text
        link = search_results[i].find('a')['href']
        date = date_elements[i].text[2:]
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap



# Example usage
query = input("Enter your search query: ")
top_results = scrape_yahoo_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print(result['date'])
    print(scrape_content(link))
    print()


Enter your search query: fish
1. Watch: ‘Jubilee’ Draws Millions of Fish into Gulf Coast Shallows, Where They’re Ripe for Gigging
https://www.yahoo.com/news/watch-jubilee-draws-millions-fish-165758884.html?fr=sycsrp_catchall
2 hours ago
 The future of fishing is more than robot fisher-people and smart refineries. In the vast, blue expanse of our planet's oceans, a revolution is quietly unfolding, poised to redefine the ancient practice of fishing for the modern era. This revolution, powered by artificial intelligence (AI), is not just transforming how we harvest the seas; it's ensuring we do so sustainably, preserving our marine ecosystems for generations to come. As we delve into this fascinating journey, we uncover the innovative ways in which AI is becoming the cornerstone of sustainable fishing, offering a beacon of hope for the future of our oceans. 

2. 21-year-old Georgia woman breaks decades-old fishing record
https://sports.yahoo.com/21-old-georgia-woman-breaks-184022927.html?

### 2-3. Microsoft Bing News

In [None]:
import requests
from bs4 import BeautifulSoup


########################################################
#### This function scrap the titles, links, and dates of the first 20 articles from Microsoft Bing news with the given query
#### input: query
#### output: the list of the title and link
####
#### required libraries:
# import requests
# from bs4 import BeautifulSoup
#
# Note: The dates in Yahoo news is in different form.
# It only gives info in the form of "1 month ago", "3 hours ago", not YYYY-MM-DD form like Google
########################################################

def scrape_bing_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://www.bing.com/news/search?q={query}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='news-card')

    # Extract the title and link of each search result
    scrap = []
    for i in range(len(search_results)):
        title = search_results[i].find('h4').text
        link = search_results[i].find('a')['href']
        date = date_elements[i].text[2:]
        scrap.append({'title': title, 'link': link, 'date': date})   
    return scrap



# Example usage
query = input("Enter your search query: ")
top_results = scrape_bing_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print(result['date'])
    print()


### Evaluation functions

In [None]:
##Clotilde's function for relevance evaluation
# This function searches for the number of keywords in the given text (var: results)

def evaluate_relevance(results, keywords):
    relevance_scores = []
    for result in results:
        title = result["title"].lower()  # Convertir le titre en minuscule pour une comparaison insensible à la casse
        score = sum(1 for word in keywords if word in title)  # Compter combien de mots-clés apparaissent dans le titre
        relevance_scores.append(score)
    return relevance_scores
####

### From the original file. ---- no need to read below

### From GPT

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_search_results(query):
    url = f"https://www.google.com/search?q={query}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }  # User-Agent header to mimic a browser
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = []
        for result in soup.find_all('div', class_='tF2Cxc'):
            title = result.find('h3').text
            link = result.find('a')['href']
            search_results.append({'title': title, 'link': link})
        return search_results
    else:
        print("Failed to fetch search results.")
        return None

##Clotilde's function for relevance evaluation
def evaluate_relevance(results, keywords):
    relevance_scores = []
    for result in results:
        title = result["title"].lower()  # Convertir le titre en minuscule pour une comparaison insensible à la casse
        score = sum(1 for word in keywords if word in title)  # Compter combien de mots-clés apparaissent dans le titre
        relevance_scores.append(score)
    return relevance_scores
####

# Example usage:
query = input("Enter your search query: ")
results = scrape_search_results(query)

if results:
    for i, result in enumerate(results, start=1):
        print(f"{i}. {result['title']}")
        print(f"   Link: {result['link']}")
        print()

evaluate_relevance(results, ['ocean', 'sea', 'crime'])

### Modified (in progress)

In [10]:
import requests
from bs4 import BeautifulSoup


def clean_text(text):
    # Text to lowercase
    text = text.lower()
    # Remove special characters using regular expression
    cleaned_text = re.sub(r'[^-a-zA-Z0-9\s]', '', text)
    return cleaned_text

def text_to_word(soup):
    content = soup.find_all("div", class_ = 'entry-content')
#    print(content)
    if content is not []:
        for paragraph in content:
            print(paragraph)
            text = paragraph.get_text(separator='\n')
            print(text)
            text = clean_text(text)
            #text_word = text.split()
            return text.split()    
    else:
        return None

def scrape_search_results(query):
    url = f"https://news.google.com/search?q={query}&hl=en-CA&gl=CA&ceid=CA%3Aen"
    
#    url = f"https://www.google.com/search?q={query}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }  # User-Agent header to mimic a browser
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = []
        for result in soup.find_all('div', class_='tF2Cxc'):
            title = result.find('h3').text
            link = result.find('a')['href']
            search_results.append({'title': title, 'link': link})
        return search_results
    else:
        print("Failed to fetch search results.")
        return None

    
def evaluate_relevance(results, keywords):
    relevance_scores = []
    for result in results:
        title = result["title"].lower()  # Convertir le titre en minuscule pour une comparaison insensible à la casse
        score = sum(1 for word in keywords if word in title)  # Compter combien de mots-clés apparaissent dans le titre
        relevance_scores.append(score)
    return relevance_scores

    
    
def scrape_content(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        word_text = text_to_word(soup)
        content = soup.find_all("div", class_ = 'entry-content')
        content = soup.get_text()
        return word_text
    else:
        print(f"Failed to fetch content from {url}.")
        return None
    

# Example usage:
query = input("Enter your search query: ")
results = scrape_search_results(query)

if results:
    for i, result in enumerate(results[:5], start=1):
        print(f"{i}. {result['title']}")
        print(f"   Link: {result['link']}")
        print("   Content:")
        content = scrape_content(result['link'])
        if content:
            print(content[:500])  # Print the first 500 characters of the content
        print()


Enter your search query: vessel fish crime


In [None]:
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    # Text to lowercase
    text = text.lower()
    # Remove special characters using regular expression
    cleaned_text = re.sub(r'[^-a-zA-Z0-9\s]', '', text)
    return cleaned_text



text = 'Hi my name is: sumin", this-is to test text cleaning!'
clean_text(text)

In [32]:
import requests
from bs4 import BeautifulSoup

def scrape_yahoo_news(query):
    # Construct the Yahoo News URL with the query
    url = f"https://news.google.com/search?q={query}"
    print(url)
    # Send a GET request to the URL
    response = requests.get(url)
    print(response)
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the search result elements
    search_results = soup.find_all('div', class_='NewsArticle')

    # Extract the title and link of each search result
    results = []
    for result in search_results[:10]:  # Scraping top 10 results
        title = result.find('h4').text
        link = result.find('a')['href']
        results.append({'title': title, 'link': link})

    return results

# Example usage
query = input("Enter your search query: ")
top_results = scrape_yahoo_news(query)
for index, result in enumerate(top_results, start=1):
    print(f"{index}. {result['title']}")
    print(result['link'])
    print()


Enter your search query: fish
https://news.google.com/search?q=fish
<Response [200]>
[]
