In [1]:
import os
from getpass import getpass

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def initialize_driver():
    """Create a Chrome WebDriver that announces a fixed desktop user agent.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    handed to Selenium via a ``Service`` object.

    Returns:
        The newly started ``webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    # Present the browser as a regular desktop Chrome on Linux.
    options.add_argument(
        "--user-agent=" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    )
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=options)

In [3]:
def accept_cookies(driver):
    """Dismiss the cookie-consent banner by clicking its accept link.

    Waits up to 20 seconds for the anchor carrying the
    ``data-n-messaging-accept-cookies`` attribute to become clickable; the
    wait raises if the link never becomes clickable in that time.

    Args:
        driver: Selenium WebDriver currently showing the page with the banner.
    """
    accept_link_locator = (By.XPATH, "//a[@data-n-messaging-accept-cookies]")
    wait = WebDriverWait(driver, 20)
    wait.until(EC.element_to_be_clickable(accept_link_locator)).click()

In [4]:
def sign_in(driver, email, password):
    """Sign in through the site's SSO flow with the given credentials.

    The flow has two stages:

    1. On the site itself: click "Sign In", type the email into the
       ``enter-email`` field and press the ``enter-email-next`` button.
    2. Follow the link pointing at ``sso.ft.com`` and, on the page it opens,
       fill ``userNameInput`` / ``passwordInput`` and click ``submitButton``.

    Every element is obtained through an explicit wait of up to 20 seconds,
    so a slowly rendering form does not cause a spurious lookup failure.

    Args:
        driver: Selenium WebDriver positioned on a page showing the "Sign In" link.
        email: Account email; typed both on the first form and on the SSO form.
        password: Account password; typed on the SSO form only.

    Raises:
        Whatever ``WebDriverWait.until`` raises when an expected element does
        not appear or become clickable within the timeout.
    """
    wait = WebDriverWait(driver, 20)

    # Stage 1: open the sign-in form and submit the email address.
    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Sign In"))).click()

    email_input = wait.until(EC.presence_of_element_located((By.ID, "enter-email")))
    email_input.clear()  # drop any pre-filled value before typing
    email_input.send_keys(email)

    wait.until(EC.element_to_be_clickable((By.ID, "enter-email-next"))).click()

    # Stage 2: hand over to the SSO provider.
    wait.until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, 'sso.ft.com')]"))
    ).click()

    sso_email_input = wait.until(
        EC.presence_of_element_located((By.ID, "userNameInput"))
    )
    sso_email_input.send_keys(email)

    # Waited for explicitly (rather than a bare find_element) so the lookup
    # cannot race the rendering of the SSO form.
    password_input = wait.until(
        EC.presence_of_element_located((By.ID, "passwordInput"))
    )
    password_input.send_keys(password)

    wait.until(EC.element_to_be_clickable((By.ID, "submitButton"))).click()

    


In [5]:
def search_topic(driver, topic):
    """Open the header search box, type ``topic`` and submit the search.

    Args:
        driver: Selenium WebDriver on a page that shows the site header.
        topic: Search phrase to submit.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the search icon, input or
        submit button does not reach the awaited state within 20 seconds.
    """
    wait = WebDriverWait(driver, 20)

    search_toggle = wait.until(
        EC.presence_of_element_located(
            (By.XPATH, "//a[contains(@class, 'o-header__top-icon-link--search')]")
        )
    )
    # The icon is scrolled into view and clicked through JavaScript, so DOM
    # presence is all that is required of it.
    driver.execute_script("arguments[0].scrollIntoView();", search_toggle)
    driver.execute_script("arguments[0].click();", search_toggle)

    # clear()/send_keys() need an interactable element, so wait until the
    # input is actually displayed rather than merely present in the DOM.
    search_input = wait.until(
        EC.visibility_of_element_located(
            (By.XPATH, "//input[@id='o-header-search-term-primary']")
        )
    )
    search_input.clear()
    search_input.send_keys(topic)

    # A native click likewise requires a displayed, enabled element.
    submit_button = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, "//button[@class='o-header__search-submit']")
        )
    )
    submit_button.click()

In [6]:

def extract_content(soup, class_name):
    """Return the text of the first element with ``class_name``, or "Unknown".

    Args:
        soup: Object exposing ``find(class_=...)`` (e.g. a parsed page).
        class_name: CSS class to look up.

    Returns:
        ``get_text()`` of the match, or the string "Unknown" when ``find``
        returns a falsy value.
    """
    match = soup.find(class_=class_name)
    if not match:
        return "Unknown"
    return match.get_text()

In [72]:
def scrape_article_urls(driver, word='netflix'):
    """Collect article links from paginated search results whose headline mentions ``word``.

    Starting from the results page currently loaded in ``driver``, every
    teaser (``o-teaser__content``) is inspected; the ``href`` of the headline
    link is kept when the headline contains ``word``. The match is
    case-insensitive on both sides. Pagination continues through the
    ``.search-pagination__next-page`` control.

    The walk stops, returning everything gathered so far, when:
      * no teaser shows up within 5 seconds (empty result set or error page),
      * the page contains the site's "more than 1000 results" notice, or
      * there is no next-page control left to click.

    Args:
        driver: Selenium WebDriver showing the first page of search results.
        word: Text that must occur in a headline for its link to be kept.

    Returns:
        list[str]: Matching article URLs in page order (duplicates are kept).
    """
    keyword = word.lower()
    limit_message = "Sorry, FT.com does not serve more than 1000 results"
    article_urls = []

    while True:
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "o-teaser__content"))
            )
        except TimeoutException:
            # Nothing to read on this page; keep what earlier pages produced
            # instead of losing it to an exception.
            break

        for teaser in driver.find_elements(By.CLASS_NAME, "o-teaser__content"):
            # find_elements returns an empty list (instead of raising) for
            # teasers that lack a heading or a link, so they are just skipped.
            headings = teaser.find_elements(By.CLASS_NAME, "o-teaser__heading")
            if not headings:
                continue
            heading = headings[0]
            if keyword not in heading.text.lower():
                continue
            links = heading.find_elements(By.TAG_NAME, "a")
            href = links[0].get_attribute("href") if links else None
            if href:
                article_urls.append(href)

        if limit_message in driver.page_source:
            break

        next_page_links = driver.find_elements(
            By.CSS_SELECTOR, ".search-pagination__next-page"
        )
        if not next_page_links:
            # Last page of a result set smaller than the site's cap.
            break
        next_page_links[0].click()

    return article_urls

In [57]:
def _extract_article_fields(soup, url, error_log):
    """Build one result row (author, heading, date, time, text) from a parsed article page."""
    author_link = soup.find("a", class_="n-content-tag--author")
    author_name = author_link.text if author_link is not None else "Unknown Author"

    heading_element = soup.find("h1", class_="o-topper__headline")
    heading = heading_element.text if heading_element is not None else "Unknown Heading"

    # Both parts default to the plain sentinel "Unknown"; they are only
    # replaced when a well-formed "<date>T<time>" value is available.
    date, time_of_day = "Unknown", "Unknown"
    timestamp_element = soup.find("time", class_="article-info__timestamp")
    if timestamp_element is not None:
        try:
            date_part, time_part = timestamp_element['datetime'].split('T')
        except (KeyError, ValueError) as timestamp_err:
            # Missing attribute, or a value without exactly one 'T' separator.
            error_log.append({"url": url, "field": "Timestamp", "error": str(timestamp_err)})
        else:
            # Drop the last five characters of the time part, as before
            # (presumably a ".mmmZ" suffix - confirm against live pages).
            date, time_of_day = date_part, time_part[:-5]

    article_content_element = soup.find("div", class_="article__content-body")
    if article_content_element is None:
        article_text = "Error extracting article text"
        error_log.append(
            {"url": url, "field": "Article", "error": "article body element not found"}
        )
    else:
        # One paragraph per line, each terminated by a newline.
        article_text = "".join(
            paragraph.get_text() + "\n"
            for paragraph in article_content_element.find_all("p")
        )

    return {
        "url": url,
        "author": author_name,
        "heading": heading,
        "date": date,
        "time": time_of_day,
        "article_text": article_text,
    }


def scrape_data_to_dataframe(driver, netflix_articles):
    """Visit each article URL and collect its metadata and text into a DataFrame.

    For every URL the page is loaded with ``driver.get`` and parsed with
    BeautifulSoup. Missing pieces are replaced by sentinel strings rather than
    dropping the row: "Unknown Author", "Unknown Heading", "Unknown" for both
    date and time, and "Error extracting article text" for the body. A URL
    whose page cannot be loaded or parsed at all contributes no row.

    No ``implicitly_wait`` call is made: that setting only changes the
    timeout of element lookups and does not delay reading ``page_source``.

    Args:
        driver: Selenium WebDriver used to load the pages.
        netflix_articles: Iterable of article URLs.

    Returns:
        pandas.DataFrame with columns ``url``, ``author``, ``heading``,
        ``date``, ``time`` and ``article_text`` (no columns when nothing was
        scraped). The problems met on the way are available as a list of
        ``{"url", "field", "error"}`` dicts under ``df.attrs["error_log"]``.
    """
    scraped_data = []
    error_log = []

    for url in netflix_articles:
        try:
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            scraped_data.append(_extract_article_fields(soup, url, error_log))
        except Exception as page_err:
            # Best effort per URL: record the failure and carry on with the rest.
            error_log.append({"url": url, "field": "Page", "error": str(page_err)})

    df_new = pd.DataFrame(scraped_data)
    # Expose the log instead of discarding it; the return type stays a DataFrame.
    df_new.attrs["error_log"] = error_log
    return df_new


In [66]:
def sort_dataframe(df_new):
    """Drop incomplete rows and order the remaining articles newest first.

    A row is kept only when its author and heading are not the
    "Unknown Author" / "Unknown Heading" sentinels and its ``date`` parses as
    ``YYYY-MM-DD``. Validating the date (instead of comparing against one
    literal) rejects every placeholder spelling, e.g. "Unknown",
    "Unknown Date" or the "Unknown " left over from splitting a placeholder
    timestamp. Known placeholder ``time`` values are rejected as well.

    Args:
        df_new: DataFrame with at least ``author``, ``heading``, ``date`` and
            ``time`` columns; an empty frame (even one without columns) is
            accepted.

    Returns:
        A new DataFrame sorted by ``date`` and then ``time``, both descending.
        An empty input yields an empty copy.
    """
    if df_new.empty:
        # pd.DataFrame([]) has no columns at all; indexing it would raise KeyError.
        return df_new.copy()

    has_valid_date = pd.to_datetime(
        df_new['date'], format='%Y-%m-%d', errors='coerce'
    ).notna()
    # 'ime' is what remains of a split-and-sliced "Unknown Timestamp" placeholder.
    has_valid_time = ~df_new['time'].isin(['ime', 'Unknown', 'Unknown Time'])

    complete_rows = (
        (df_new['author'] != 'Unknown Author')
        & (df_new['heading'] != 'Unknown Heading')
        & has_valid_date
        & has_valid_time
    )

    # ISO dates and HH:MM:SS times order chronologically as plain strings;
    # time breaks ties between articles published on the same day.
    return df_new[complete_rows].sort_values(by=['date', 'time'], ascending=False)


In [71]:
def initialize_and_sign_in(webpage,email_address, password):
    """Start a browser, open ``webpage``, accept cookies and sign in.

    Args:
        webpage: URL to load first; it must show the cookie banner and the
            "Sign In" link the later steps look for.
        email_address: Account email passed on to ``sign_in``.
        password: Account password passed on to ``sign_in``.

    Returns:
        The signed-in WebDriver. The caller owns it and should call
        ``quit()`` when done.

    Raises:
        Any exception from loading the page, accepting cookies or signing in.
        The browser is closed before the exception propagates.
    """
    driver = initialize_driver()
    try:
        driver.get(webpage)
        accept_cookies(driver)
        sign_in(driver, email_address, password)
    except BaseException:
        # The caller never receives the handle on failure (including an
        # interrupted cell), so close the browser here to avoid orphaning it.
        driver.quit()
        raise
    return driver


In [None]:
if __name__ == "__main__":
    # Credentials are read from the FT_EMAIL / FT_PASSWORD environment
    # variables, falling back to an interactive prompt, so that no secret is
    # stored in the notebook or its saved outputs.
    email_address = os.environ.get("FT_EMAIL") or input("FT account email: ")
    password = os.environ.get("FT_PASSWORD") or getpass("FT account password: ")

    # initialize_and_sign_in takes the landing page as its first argument.
    # NOTE(review): the FT home page is assumed here because the sign-in flow
    # targets sso.ft.com - confirm this is the intended start URL.
    ft_home_url = "https://www.ft.com/"

    driver = initialize_and_sign_in(ft_home_url, email_address, password)

In [68]:
if __name__ == "__main__":
    # Requires `driver` from the sign-in cell: a signed-in WebDriver session.

    # NOTE(review): "quaterly" is kept exactly as written because it is sent
    # to the site search and used in the output file name - confirm whether
    # "quarterly" was intended.
    topics = ["Netflix", "Netflix Inc", "Netflix quaterly result", "Netflix financial result"]
    dfs = []  # one cleaned, sorted DataFrame per topic

    for topic in topics:
        search_topic(driver, topic)
        article_urls = scrape_article_urls(driver)
        print(f"{topic} is ok for url")
        df = scrape_data_to_dataframe(driver, article_urls)
        print(f"{topic} is ok for scraped df")

        df_sorted = sort_dataframe(df)
        print(f"{topic} is ok for scraped df sort")

        # Save the cleaned and sorted dataframe for this topic
        file_name = f'ft_articles_{topic}.csv'
        df_sorted.to_csv(file_name, index=False)

        dfs.append(df_sorted)

    # A topic that produced no rows yields a frame without a 'url' column;
    # if that is true of every topic, drop_duplicates(subset=['url']) would
    # raise KeyError, so only frames that hold data are merged.
    non_empty_frames = [frame for frame in dfs if not frame.empty]
    if non_empty_frames:
        # The same article can surface under several topics; keep it once.
        merged_df = pd.concat(non_empty_frames, ignore_index=True).drop_duplicates(subset=['url'])

        merged_file_name = 'ft_articles_merged.csv'
        merged_df.to_csv(merged_file_name, index=False)
    else:
        merged_df = pd.DataFrame()
        print("No articles were scraped; merged file not written")

    # NOTE(review): the browser session is left open here; call driver.quit()
    # once scraping is finished to release the Chrome process.

    
    

 

one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
Netflix is ok for url
Netflix is ok for scraped df
Netflix is ok for scraped df sort
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
Netflix Inc is ok for url
Netflix Inc is ok for scraped df
Netflix Inc is ok for scraped df sort
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page
one page