In [188]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Function to scrape data from a single page
def scrape_page(driver):
    data = []
    table = driver.find_element(By.CLASS_NAME, 'rk-table')
    rows = table.find_elements(By.TAG_NAME, 'tr')
    
    for row in rows[1:]:
        cells = row.find_elements(By.TAG_NAME, 'td')
        rank = cells[0].text.strip()
        institution = cells[1].text.strip()
        total_score = cells[3].text.strip()
        q1 = cells[4].text.strip()
        
        data.append((rank, institution, total_score, q1))
    
    return data
    

# Main function to scrape data from all pages
def scrape_all_pages():
    base_url = 'https://www.shanghairanking.com/rankings/gras/2022/RS0202'
    all_data = []
    
    # Configure Chrome WebDriver options
    options = Options()
    options.headless = True  # Run Chrome WebDriver in headless mode
    
    # Create a new Chrome WebDriver instance
    driver = webdriver.Chrome()
    url = base_url
    driver.get(url)
    for page in range(1,18):
        for i in range(1,6):
        
            # Wait for the table to be visible
            table_locator = (By.CLASS_NAME, 'rk-table')
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located(table_locator))
            
             # Wait for the CNCI dropdown to be clickable and visible
            sort_slot = driver.find_element(By.XPATH, '(//div[@class="rk-sort-slot"])[2]')
            #driver.execute_script("arguments[0].scrollIntoView();", sort_slot)
            #actions = ActionChains(driver)
            #actions.move_to_element(sort_slot).perform()
            dropdown = sort_slot.find_element(By.CLASS_NAME, "rank-select")
            dropdown.click()
            # Move the mouse to the CNCI dropdown and click it
            option = dropdown.find_element(By.CSS_SELECTOR, f'.options li:nth-child({i})')
            option.click()
        
            print(f"Scraping page {page}, dropdown option {i}...")
            page_data = scrape_page(driver)
            all_data.extend(page_data)
            
            
           
        
        
        # Check if the next page link exists
        next_button = driver.find_element(By.CLASS_NAME, 'ant-pagination-next')
        if 'disabled' in next_button.get_attribute('class'):
            break
        
        # Go to the next page
        next_button.click()
        page += 1
        time.sleep(10)
        driver.execute_script("window.scrollTo(0, 0);")
    
    
    # Quit the WebDriver instance
    driver.quit()
    
    return all_data

# Call the main function to scrape data from all pages
results = scrape_all_pages()


  options.headless = True  # Run Chrome WebDriver in headless mode


Scraping page 1, dropdown option 1...
Scraping page 1, dropdown option 2...
Scraping page 1, dropdown option 3...
Scraping page 1, dropdown option 4...
Scraping page 1, dropdown option 5...
Scraping page 2, dropdown option 1...
Scraping page 2, dropdown option 2...
Scraping page 2, dropdown option 3...
Scraping page 2, dropdown option 4...
Scraping page 2, dropdown option 5...
Scraping page 3, dropdown option 1...
Scraping page 3, dropdown option 2...
Scraping page 3, dropdown option 3...
Scraping page 3, dropdown option 4...
Scraping page 3, dropdown option 5...
Scraping page 4, dropdown option 1...
Scraping page 4, dropdown option 2...
Scraping page 4, dropdown option 3...
Scraping page 4, dropdown option 4...
Scraping page 4, dropdown option 5...
Scraping page 5, dropdown option 1...
Scraping page 5, dropdown option 2...
Scraping page 5, dropdown option 3...
Scraping page 5, dropdown option 4...
Scraping page 5, dropdown option 5...
Scraping page 6, dropdown option 1...
Scraping pag

In [193]:
import pandas as pd
df = pd.DataFrame(results)
df = df.rename(columns = {0:'Rank',1:'University_Name',2:'Total_Score',3:'Drop_Down'})

pivot_table = df.pivot_table(index=['Rank', 'University_Name', 'Total_Score'], columns=df.groupby(['Rank', 'University_Name', 'Total_Score']).cumcount()+1, values='Drop_Down', aggfunc='first')

# Rename the columns
column_names = {
    1: 'Q1',
    2: 'CNCI',
    3: 'IC',
    4: 'TOP',
    5: 'AWARD'
}
pivot_table = pivot_table.rename(columns=column_names).reset_index().sort_values('Total_Score', ascending = False)

pivot_table

Unnamed: 0,Rank,University_Name,Total_Score,Q1,CNCI,IC,TOP,AWARD
0,1,"University of California, Berkeley",329.8,42.8,85.1,76.7,100.0,86.6
111,2,Massachusetts Institute of Technology (MIT),328.5,50.9,81.8,76.6,93.8,86.6
222,3,Stanford University,310.9,41.3,85.8,72.1,69.3,100.0
333,4,Georgia Institute of Technology,275.6,51.2,80.3,70.3,80.0,50.0
444,5,Tsinghua University,258.4,100.0,79.6,62.0,66.3,0.0
...,...,...,...,...,...,...,...,...
171,201-300,Tel Aviv University,,27.9,68.3,68.4,20.0,0.0
170,201-300,Technion-Israel Institute of Technology,,28.1,68.2,73.5,28.3,0.0
169,201-300,Technical University of Denmark,,40.2,75.8,84.1,0.0,0.0
168,201-300,Technical University Darmstadt,,22.1,60.9,71.0,34.6,0.0
