In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Initialize the Chrome driver
driver = webdriver.Chrome()

# URL of the website
url = "https://exoplanets.nasa.gov/discovery/exoplanet-catalog/"

# Navigate to the website
driver.get(url)

# Initialize a DataFrame to store the data
df = pd.DataFrame(columns=['Name', 'Distance', 'Mass', 'Discovery Date', 'Stellar Magnitude'])

try:
    # Wait for the dropdown menu to load and select the third option
    dropdown = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="per_page"]')))
    Select(dropdown).select_by_index(2)  # Indices start at 0, so the third option is at index 2
    time.sleep(2)  # Wait for the page to reload

    # While the 'Next' button is enabled, keep scraping the table and clicking 'Next'
    while True:
        # Wait for the section to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="primary_column"]/section')))

        # Get the current page number
        current_page_number = driver.find_element(By.XPATH, '//*[@id="primary_column"]/div[1]/div[2]/div[1]/div/nav/div/input').get_attribute('value')

        # Get the data from each row
        names = [name.text for name in driver.find_elements(By.XPATH, '//li[@class="display_name"]')]
        distances = [dist.text for dist in driver.find_elements(By.XPATH, '//li[@class="st_dist"]')]
        masses = [mass.text for mass in driver.find_elements(By.XPATH, '//li[@class="mass_display"]')]
        discovery_dates = [date.text for date in driver.find_elements(By.XPATH, '//li[@class="discovery_date"]')]
        stellar_magnitudes = [mag.text for mag in driver.find_elements(By.XPATH, '//li[@class="st_optmag"]')]

        for name, distance, mass, discovery_date, stellar_magnitude in zip(names, distances, masses, discovery_dates, stellar_magnitudes):
            df_temp = pd.DataFrame({'Name': [name], 'Distance': [distance], 'Mass': [mass], 'Discovery Date': [discovery_date], 'Stellar Magnitude': [stellar_magnitude]})
            df = pd.concat([df, df_temp], ignore_index=True)

        # Try to click the 'Next' button
        try:
            next_button = driver.find_element(By.XPATH, '//*[@id="primary_column"]/div[1]/div[2]/div[1]/div/nav/span[2]/a')
            next_button.click()
            time.sleep(4)  # Wait for the next page to load

            # If the current page number is the same as before, break the loop
            if driver.find_element(By.XPATH, '//*[@id="primary_column"]/div[1]/div[2]/div[1]/div/nav/div/input').get_attribute('value') == current_page_number:
                break
        except:
            break  # If the 'Next' button is not found, we're on the last page

finally:
    # Close the driver
    driver.quit()

# Print the DataFrame
print(df)


                      Name Distance            Mass Discovery Date  \
0     11 Comae Berenices b      304   19.4 Jupiters           2007   
1       11 Ursae Minoris b      409  14.74 Jupiters           2009   
2          14 Andromedae b      246    4.8 Jupiters           2008   
3            14 Herculis b       58  8.053 Jupiters           2002   
4             16 Cygni B b       69   1.78 Jupiters           1996   
...                    ...      ...             ...            ...   
5458                XO-7 b      764  0.709 Jupiters           2019   
5459              YSES 2 b      357    6.3 Jupiters           2021   
5460             YZ Ceti b       12      0.7 Earths           2017   
5461             YZ Ceti c       12     1.14 Earths           2017   
5462             YZ Ceti d       12     1.09 Earths           2017   

     Stellar Magnitude  
0              4.72307  
1                5.013  
2              5.23133  
3              6.61935  
4                6.215  
...      

In [3]:
df

Unnamed: 0,Name,Distance,Mass,Discovery Date,Stellar Magnitude
0,11 Comae Berenices b,304,19.4 Jupiters,2007,4.72307
1,11 Ursae Minoris b,409,14.74 Jupiters,2009,5.013
2,14 Andromedae b,246,4.8 Jupiters,2008,5.23133
3,14 Herculis b,58,8.053 Jupiters,2002,6.61935
4,16 Cygni B b,69,1.78 Jupiters,1996,6.215
...,...,...,...,...,...
5458,XO-7 b,764,0.709 Jupiters,2019,10.521
5459,YSES 2 b,357,6.3 Jupiters,2021,10.885
5460,YZ Ceti b,12,0.7 Earths,2017,12.074
5461,YZ Ceti c,12,1.14 Earths,2017,12.074


In [4]:
df.to_csv("/Users/agastya/Documents/GitHub/Data_science_in_practice/web_scraping_assignment/exoplanet_data.csv", index=False)

In [5]:
df.describe()

Unnamed: 0,Name,Distance,Mass,Discovery Date,Stellar Magnitude
count,5463,5463.0,5463,5463,5463.0
unique,5463,2611.0,2091,31,2941.0
top,11 Comae Berenices b,,Unknown,2016,
freq,1,21.0,24,1517,208.0


In [6]:
# Check for null values
print(df.isnull().sum())

Name                 0
Distance             0
Mass                 0
Discovery Date       0
Stellar Magnitude    0
dtype: int64
