## Scraping


In this Jupyter Notebook, we embark on a comprehensive exploration of web scraping, with a specific focus on the Reuters news site. We've developed a function designed to conduct targeted web scraping based on selected keywords. For our study, we chose the keyword "Israel Hamas" to focus our efforts on gathering articles related to the Israel-Hamas/Palestine conflict. When this keyword is passed to our function, it triggers a search of the Reuters website and systematically retrieves the articles that match this subject. For our project, we created a sample Gmail account, "advancedcustomeranalytics@gmail.com," specifically for registering a user on the Reuters website. This account facilitated our web scraping process, allowing us to access and collect the required data. In the end, we successfully scraped a total of 2,644 articles from Reuters, which provided us with a substantial dataset for our analysis.

### Importing Necessary Libraries

In [3]:
import csv
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Initialize Scraping Function

In [4]:
def reuters_search(keyword: str,
                   email: str,
                   password: str,
                   max_articles: int = 3000) -> list:
    """
    Log in to Reuters, search for the given keyword and scrape the results.

    The function drives a Chrome browser through Selenium: it signs in,
    submits the keyword through the site search, pages through the result
    list, and then opens every result to extract its body text.

    Parameters:
    - keyword (str): The keyword or search term to search for on Reuters.
    - email (str): Your email address for logging into the Reuters website.
    - password (str): Your password for logging into the Reuters website.
    - max_articles (int): Upper bound on the number of search results to
      collect and scrape. Defaults to 3000.

    Returns:
    - list of dict: One dictionary per scraped article with the keys
      'title', 'url', 'date', 'source', 'text'. An empty list is returned
      when the login fails.

    Raises:
    - selenium.common.exceptions.TimeoutException: if the search button or
      the search input does not appear after a successful login. The browser
      is closed before the exception propagates.
    """
    browser = webdriver.Chrome()
    try:
        if not _reuters_sign_in(browser, email, password):
            return []

        # The consent banner may show up again after the post-login redirect.
        _reuters_accept_cookies(browser, timeout=2)

        # After redirection, open the search bar and submit the keyword.
        search_button = WebDriverWait(browser, 2).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@data-testid="Button"][@aria-label="Open search bar"]'))
        )
        search_button.click()

        search_input = WebDriverWait(browser, 2).until(
            EC.visibility_of_element_located((By.XPATH, '//input[@type="search"]'))
        )
        search_input.send_keys(keyword)
        search_input.send_keys(Keys.RETURN)

        # Give the result page a moment to render before parsing it.
        time.sleep(1)

        results = _reuters_collect_results(browser, max_articles)
        print(len(results))

        # Scraped records go into their own list: appending to the list that
        # is being iterated would never terminate.
        scraped_articles = []
        for result in results:
            _reuters_accept_cookies(browser, timeout=2)

            print(result['url'])
            scraped_articles.append({
                'title': result['title'],
                'url': result['url'],
                'date': result['date'],
                'source': 'Reuters',
                'text': _reuters_fetch_article_text(browser, result['url']),
            })

            # Small pause between article requests.
            time.sleep(1)

        return scraped_articles
    finally:
        # Always release the browser, including when a wait above times out.
        browser.quit()


def _reuters_accept_cookies(browser, timeout):
    """Click the "Allow All" consent button if it becomes clickable within ``timeout`` seconds."""
    try:
        WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Allow All")]'))
        ).click()
    except WebDriverException:
        # Best effort: no banner (timeout) or a failed click must not stop the scrape.
        pass


def _reuters_sign_in(browser, email, password):
    """Submit the Reuters sign-in form; return True once the browser is redirected away from it."""
    login_url = 'https://www.reuters.com/account/sign-in/'
    try:
        browser.get(login_url)

        # Wait for the login elements to be visible and interactable
        WebDriverWait(browser, 2).until(EC.visibility_of_element_located((By.NAME, 'email')))
        WebDriverWait(browser, 2).until(EC.visibility_of_element_located((By.NAME, 'password')))

        # Dismiss the consent banner if present; its absence must not abort the login.
        _reuters_accept_cookies(browser, timeout=1)

        # Input the username and password into the form
        browser.find_element(By.NAME, 'email').send_keys(email)
        browser.find_element(By.NAME, 'password').send_keys(password)

        sign_in_button = WebDriverWait(browser, 2).until(
            EC.presence_of_element_located((By.XPATH, '//*[@data-testid="Text"][contains(text(), "Sign in")]'))
        )

        # Remember the URL *before* clicking so a fast redirect is still
        # detected as a change.
        url_before_click = browser.current_url
        sign_in_button.click()

        # Wait for the URL to change, which indicates a redirect
        WebDriverWait(browser, 2).until(EC.url_changes(url_before_click))

        print("Redirection after login successful")
        return True
    except WebDriverException as e:
        print(f"An error occurred: {e}")
        return False


def _reuters_collect_results(browser, max_articles):
    """Page through the search results and return up to ``max_articles`` unique title/url/date records."""
    results = []
    seen_urls = set()  # O(1) duplicate detection, keyed by article URL
    pages_without_new_results = 0

    while len(results) < max_articles:
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        found_new = False
        for item in soup.find_all('li', attrs={'class': 'search-results__item__2oqiX'}):
            a_element = item.find('a')
            if not a_element or 'href' not in a_element.attrs:
                print("continue--------")
                continue

            article_link = 'https://www.reuters.com' + a_element['href']
            if article_link in seen_urls:
                continue
            seen_urls.add(article_link)

            time_element = item.find('time')
            results.append({
                'title': a_element.text.strip(),
                'url': article_link,
                'date': time_element.text if time_element else 'Date not found',
            })
            found_new = True

            if len(results) >= max_articles:
                return results

        # Guard against paging forever when "next" keeps yielding known items.
        pages_without_new_results = 0 if found_new else pages_without_new_results + 1
        if pages_without_new_results >= 3:
            print("No new search results on consecutive pages; stopping pagination.")
            break

        # Find the next page button and click it, or stop if it doesn't exist.
        try:
            next_button = WebDriverWait(browser, 5).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@data-testid="Button"][contains(@aria-label, "Next stories")]'))
            )
            browser.execute_script("arguments[0].click();", next_button)

            # Wait for the page to load the new content.
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.XPATH, '//li[@class="search-results__item__2oqiX"]'))
            )
            time.sleep(2)
        except WebDriverException as e:
            print(f"No more pages to visit or an error occurred: {e}")
            break

    return results


def _reuters_fetch_article_text(browser, article_link):
    """Open ``article_link`` and return its paragraph text, or 'Text not found' when unavailable."""
    try:
        browser.get(article_link)
        try:
            WebDriverWait(browser, 2).until(
                EC.presence_of_element_located((By.TAG_NAME, "article"))
            )
        except TimeoutException:
            # Parse whatever has rendered so far.
            pass
        article_html = browser.page_source
    except WebDriverException as e:
        # Do not parse the previously loaded page as if it were this article.
        print(f"Could not load {article_link}: {e}")
        return 'Text not found'

    article_soup = BeautifulSoup(article_html, 'html.parser')
    text_container = article_soup.find('div', attrs={'class': 'article-body__content__17Yit'})
    if text_container is None:
        return 'Text not found'
    return ' '.join(p.get_text() for p in text_container.find_all('p'))

### Connect to Reuters and Perform Scraping

In [None]:
if __name__ == '__main__':
    # Run the Reuters search for the study keyword and store the scraped
    # articles in a semicolon-delimited CSV file.

    keyword = 'israel hamas'
    # NOTE(review): "password" looks like a placeholder -- supply the real
    # account password before running.
    articles = reuters_search(keyword,"advancedcustomeranalytics@gmail.com","password")

    csv_file_name = 'reuters_search_news_israel.csv'
    columns = ['title', 'url', 'date', 'source', 'text']
    skipped_rows = 0

    with open(csv_file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(columns)

        for article in articles:
            try:
                # Flatten line breaks so every article stays on one CSV row.
                text = article.get('text', '').replace('\n', ' ').replace('\r', '')
                writer.writerow([
                    article.get('title', ''),
                    article.get('url', ''),
                    article.get('date', ''),
                    article.get('source', ''),
                    text
                ])
            except (AttributeError, TypeError, csv.Error) as e:
                # Skip malformed records, but report them instead of
                # dropping them silently.
                skipped_rows += 1
                print(f"Skipping article that could not be written: {e}")

    if skipped_rows:
        print(f"{skipped_rows} article(s) were skipped while writing {csv_file_name}")