In [4]:
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [5]:
# Listing page that the driver opens first; both the single-page and the
# multi-page sections below start from this address.
url = "https://www.nepalitwa.com/sanskrit"

## Extraction of Links from a Single Page

### Setting Up the Driver

In [None]:
# Start Chrome. First try a regular (visible) session using the driver binary
# returned by `ChromeDriverManager().install()`; if that fails for any reason,
# fall back to a headless session with sandbox/shm flags disabled.
try:
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options())
except Exception as exc:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report why the first
    # attempt failed instead of hiding it.
    print(f"Could not start a visible Chrome session ({exc!r}); retrying headless.")
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    print("Running in headless mode.")

In [None]:
# Navigate the browser session to the listing page stored in `url`.
driver.get(url)

In [8]:
# Accumulator for the href values collected from this single page.
links = []

### Finding the next button

In [9]:
# Look up every element carrying the "nextpostslink" class and warn when the
# page does not contain exactly one of them.
next_button = driver.find_elements(by=By.CLASS_NAME, value="nextpostslink")
if not len(next_button) == 1:
    print(f"Some Problem with link: {url}")

In [10]:
# Display the list of matched WebElement objects for inspection.
next_button

[<selenium.webdriver.remote.webelement.WebElement (session="d437c9f4858694fc9952ab10cafed75c", element="f.D685ABD887BC3B72D83726C43AFB5B77.d.B437E889300460DF76C60D551B1F35C6.e.68")>]

### Link Extraction

#### Shortening the search space by specifying the div-class

In [11]:
# Restrict the search to the elements with class "more-item"; an empty result
# is reported against the listing URL.
divs = driver.find_elements(by=By.CLASS_NAME, value="more-item")
if not divs:
    print(f"Some Problem with the link: {url}")

#### Extracting the anchor tags from the divs and then links from anchor tags

In [12]:
# Take the first anchor inside each item div and record its href in `links`.
for div in divs:
    # `find_element` raises NoSuchElementException when nothing matches and
    # never returns None, so a None check cannot skip a div without an anchor.
    # `find_elements` returns an empty list instead, which makes the skip work.
    anchors = div.find_elements(By.TAG_NAME, "a")
    if not anchors:
        print("Some problem with the link. Continuing...")
        continue
    links.append(anchors[0].get_attribute("href"))

In [14]:
# End the whole WebDriver session. `close()` only closes the current window and
# leaves the session (and its chromedriver process) alive; `quit()` closes every
# window and shuts the driver down.
driver.quit()

In [16]:
# One row per collected link, in a single column named "url".
df = pd.DataFrame(links, columns=["url"])

In [17]:
# `df` is constructed directly from `links`, so appending every link again with
# `df.loc[len(df)] = ...` would store each URL twice. Build the frame in one
# step instead (idempotent on re-run) and check the row count.
df = pd.DataFrame(links, columns=["url"])
assert len(df) == len(links), "expected exactly one row per collected link"

In [20]:
# Preview the first rows of the single-page link table.
df.head()

Unnamed: 0,url
0,https://www.nepalitwa.com/detail/12243
1,https://www.nepalitwa.com/detail/12216
2,https://www.nepalitwa.com/detail/11814
3,https://www.nepalitwa.com/detail/11813
4,https://www.nepalitwa.com/detail/11779


## Multipage Link Extraction

In [38]:
# Accumulator for the href values collected across all listing pages.
all_links = []

### Setting up the Driver

In [23]:
# Start a fresh Chrome session for the multi-page crawl. First try a regular
# (visible) session using the driver binary returned by
# `ChromeDriverManager().install()`; if that fails for any reason, fall back to
# a headless session with sandbox/shm flags disabled.
try:
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options())
except Exception as exc:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report why the first
    # attempt failed instead of hiding it.
    print(f"Could not start a visible Chrome session ({exc!r}); retrying headless.")
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    print("Running in headless mode.")

In [37]:
# Navigate the new browser session to the first listing page.
driver.get(url)

### Going through all the Pages and Extracting all the links

In [39]:
# Walk the paginated listing: on each page collect the first anchor's href from
# every "more-item" div into `all_links`, then follow the "nextpostslink"
# element until there is none left to follow. The driver is shut down in a
# `finally` block so the browser is released even if a lookup or click raises.
try:
    while True:
        divs = driver.find_elements(By.CLASS_NAME, "more-item")
        if not divs:
            # Nothing changes between iterations at this point, so `continue`
            # would re-check the same empty page forever. Stop instead.
            print(f"Some Problem with the link: {url}")
            break

        for div in divs:
            # `find_element` raises NoSuchElementException rather than
            # returning None; `find_elements` yields an empty list, so a div
            # without an anchor is skipped instead of aborting the crawl.
            anchors = div.find_elements(By.TAG_NAME, "a")
            if not anchors:
                print("Some problem with the link. Continuing...")
                continue
            all_links.append(anchors[0].get_attribute("href"))

        next_button = driver.find_elements(By.CLASS_NAME, "nextpostslink")
        if len(next_button) == 0:
            # No pagination link at all: this was the last page.
            break
        elif len(next_button) == 2:
            # Two matches: the second one is the link that gets followed.
            next_button[1].click()
        elif next_button[0].get_attribute("rel") == "next":
            next_button[0].click()
        else:
            # A match that is not marked rel="next" is not followed.
            break
finally:
    # `quit()` ends the session and the chromedriver process; `close()` would
    # only close the current window.
    driver.quit()

In [36]:
# Number of hrefs collected across all pages.
len(all_links)

2169

In [40]:
# One row per collected link, in a single column named "url".
df = pd.DataFrame(all_links, columns=["url"])

In [41]:
# `df` is constructed directly from `all_links`, so appending every link again
# with `df.loc[len(df)] = ...` would store each URL twice (and do so one row at
# a time). Build the frame in one step instead and check the row count.
df = pd.DataFrame(all_links, columns=["url"])
assert len(df) == len(all_links), "expected exactly one row per collected link"

In [42]:
# Preview the first rows of the multi-page link table.
df.head()

Unnamed: 0,url
0,https://www.nepalitwa.com/detail/12243
1,https://www.nepalitwa.com/detail/12216
2,https://www.nepalitwa.com/detail/11814
3,https://www.nepalitwa.com/detail/11813
4,https://www.nepalitwa.com/detail/11779


In [None]:
# Write the link table to CSV without the index column.
# NOTE(review): the target is an absolute path at the filesystem root;
# presumably a project-relative "Links/" directory was intended. Confirm.
output_path = Path("/Links/nepalitwa_links.csv")
# `to_csv` does not create missing directories, so make sure the parent exists
# before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)