In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start the Chrome session used for all page loads in this notebook.
# The three flags are applied in the same order as before.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)
dr = webdriver.Chrome(options=options)

In [4]:
# Base address of the site; relative hrefs are appended to it when loading pages.
url = "https://synonyms.su"

In [5]:
# Load the site's landing page. Uses the shared `url` constant instead of repeating
# the literal, so the base address is defined in exactly one place.
dr.get(url)

In [6]:
# Check that the loaded page exposes elements with the 'letter-list' class
# (the matched elements are shown as the cell output).
dr.find_elements(By.CLASS_NAME, 'letter-list')

[<selenium.webdriver.remote.webelement.WebElement (session="6015fbedac249b5b33ccc19cb2eb164b", element="f.3CC5B9F91D8D78A1B195316696938B8D.d.9D76487C4E9BBB0C1669FF77C6D1A991.e.36")>]

In [7]:
# Parse the HTML of the page currently loaded in the driver with the lxml parser.
soup = BeautifulSoup(markup=dr.page_source, features='lxml')

In [8]:
# Will hold the href of every letter link found on the landing page.
letter_refs = list()

In [9]:
# Collect the href of every link inside the 'letter-list' block.
# A fresh list is assigned (rather than appending to the existing one) so that
# re-running this cell does not duplicate every entry.
letter_refs = [
    letter_link.get('href')
    for letter_link in soup.find('div', class_='letter-list').find_all('a')
]

# Candidate source sites. NOTE(review): not referenced by any other visible cell —
# confirm whether it is still needed.
dictionaries = ['https://synonymonline.ru/', 'https://synonyms.su/']

In [10]:
# Maps each letter href to the number of listing pages that letter has.
letter_pages_count = dict()

In [11]:
# Visit each letter page and record how many listing pages it has.
for ref in letter_refs:
    dr.get(url + ref)
    soup1 = BeautifulSoup(dr.page_source, 'lxml')
    pagination_nav = soup1.find('nav', class_='pagination')
    if pagination_nav is None:
        # No pagination element on the page: the letter is stored as one page.
        letter_pages_count[ref] = 1
        continue
    # The text of the last pagination link is taken as the page count.
    last_page_link = pagination_nav.find_all('a')[-1]
    letter_pages_count[ref] = int(last_page_link.contents[0])

In [12]:
# Display the page count recorded for each letter href.
letter_pages_count

{'/a': 14,
 '/b': 16,
 '/v': 23,
 '/g': 12,
 '/d': 14,
 '/e': 2,
 '/zh': 3,
 '/z': 16,
 '/i': 10,
 '/y': 1,
 '/k': 20,
 '/l': 8,
 '/m': 16,
 '/n': 21,
 '/o': 25,
 '/p': 59,
 '/r': 18,
 '/s': 30,
 '/t': 13,
 '/u': 10,
 '/f': 7,
 '/h': 5,
 '/ts': 3,
 '/ch': 4,
 '/sh': 6,
 '/sch': 1,
 '/11': 1,
 '/y1': 1,
 '/22': 1,
 '/e1': 6,
 '/yu': 1,
 '/ya': 2}

In [13]:
# Column names of the per-page DataFrame: the word, its synonym list, and the list length.
columns = [
    'word',
    'synonyms',
    'synonyms_count',
]

In [15]:
def get_synonyms_list(driver, url, ref_to_synonyms):
    """Open a word's page and return the synonyms listed in its synonyms table.

    Args:
        driver: Browser driver; only ``get()`` and ``page_source`` are used.
        url: Base address that ``ref_to_synonyms`` is appended to.
        ref_to_synonyms: Path of the word's page, concatenated to ``url`` as-is.

    Returns:
        list: Synonym texts in table order. Empty when the page has no element
        with the ``synonyms-table`` class.
    """
    driver.get(url + ref_to_synonyms)
    # Parse the page loaded by the driver that was passed in. The previous version
    # read the notebook-global `dr` here, ignoring the `driver` argument.
    bsoup = BeautifulSoup(driver.page_source, 'lxml')
    syn_table = bsoup.find(class_='synonyms-table')
    if syn_table is None:
        return []

    # Fall back to the table itself when it has no <tbody> wrapper.
    table_body = syn_table.find('tbody')
    rows_root = table_body if table_body is not None else syn_table

    syn_list = []
    for syn_item in rows_root.find_all('tr'):
        link = syn_item.find('a')
        if link is not None:
            syn_list.append(link.text)
            continue
        # Rows without a link keep the synonym in a <span> inside the second cell.
        cells = syn_item.find_all('td')
        span = cells[1].find('span') if len(cells) > 1 else None
        if span is not None:
            syn_list.append(span.text)
        # Rows with neither a link nor that span (e.g. header rows) are skipped
        # instead of aborting the whole scrape.
    return syn_list

In [16]:
# Show which letter href sits at index 7 (the one scraped in the next cell).
letter_refs[7]

'/z'

In [19]:
# Scrape every listing page of one letter and write each page's words,
# synonym lists and synonym counts to its own JSON file.
ref = letter_refs[7]
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences in a normal string literal.
output_dir = r'D:\AtomicHackData'
for page in tqdm(range(1, letter_pages_count[ref]+1)):
    # Rows are collected per page and flushed to disk at the end of each page.
    letter_page_df_list_format = []
    # The first listing page has no numeric suffix; later ones are '<letter>/<page>'.
    ref1 = ref if page == 1 else ref + '/' + str(page)
    dr.get(url+ref1)
    soup1 = BeautifulSoup(dr.page_source, 'lxml')
    word_list = soup1.find('div', class_='wordlist-column synonyms').find_all('a')
    for word in tqdm(word_list):
        # str() yields a plain string, so the stored word does not keep a
        # reference to the page's parse tree.
        the_word = str(word.contents[0])
        ref_to_synonyms = word.get('href')
        synonyms_list = get_synonyms_list(dr, url, ref_to_synonyms)
        synonyms_count = len(synonyms_list)
        letter_page_df_list_format.append([the_word, synonyms_list, synonyms_count])
    print(f'page {page} done')
    letter_df = pd.DataFrame(data=letter_page_df_list_format, columns=columns)
    # Same file name as before: synonyms_<letter>_<page>.json inside output_dir.
    letter_df.to_json(rf'{output_dir}\synonyms_{ref[1:]}_{page}.json', force_ascii=False)

100%|██████████| 1000/1000 [42:43<00:00,  2.56s/it]
  6%|▋         | 1/16 [42:44<10:41:08, 2564.58s/it]

page 1 done


100%|██████████| 1000/1000 [41:59<00:00,  2.52s/it]
 12%|█▎        | 2/16 [1:24:44<9:52:17, 2538.39s/it]

page 2 done


100%|██████████| 1000/1000 [41:47<00:00,  2.51s/it]
 19%|█▉        | 3/16 [2:06:32<9:06:56, 2524.37s/it]

page 3 done


100%|██████████| 1000/1000 [40:57<00:00,  2.46s/it]
 25%|██▌       | 4/16 [2:47:30<8:19:37, 2498.13s/it]

page 4 done


100%|██████████| 1000/1000 [41:28<00:00,  2.49s/it]
 31%|███▏      | 5/16 [3:28:59<7:37:24, 2494.94s/it]

page 5 done


100%|██████████| 1000/1000 [42:22<00:00,  2.54s/it]
 38%|███▊      | 6/16 [4:11:22<6:58:34, 2511.43s/it]

page 6 done


100%|██████████| 1000/1000 [45:23<00:00,  2.72s/it]
 44%|████▍     | 7/16 [4:56:46<6:27:07, 2580.81s/it]

page 7 done


100%|██████████| 1000/1000 [46:54<00:00,  2.81s/it]
 50%|█████     | 8/16 [5:43:41<5:54:03, 2655.46s/it]

page 8 done


100%|██████████| 1000/1000 [46:20<00:00,  2.78s/it]
 56%|█████▋    | 9/16 [6:30:02<5:14:21, 2694.54s/it]

page 9 done


100%|██████████| 1000/1000 [48:36<00:00,  2.92s/it]
 62%|██████▎   | 10/16 [7:18:39<4:36:19, 2763.27s/it]

page 10 done


100%|██████████| 1000/1000 [47:41<00:00,  2.86s/it]
 69%|██████▉   | 11/16 [8:06:21<3:52:46, 2793.36s/it]

page 11 done


100%|██████████| 1000/1000 [50:19<00:00,  3.02s/it]
 75%|███████▌  | 12/16 [8:56:41<3:10:49, 2862.46s/it]

page 12 done


100%|██████████| 1000/1000 [47:22<00:00,  2.84s/it]
 81%|████████▏ | 13/16 [9:44:04<2:22:49, 2856.53s/it]

page 13 done


100%|██████████| 1000/1000 [48:23<00:00,  2.90s/it]
 88%|████████▊ | 14/16 [10:32:28<1:35:41, 2870.95s/it]

page 14 done


100%|██████████| 1000/1000 [45:23<00:00,  2.72s/it]
 94%|█████████▍| 15/16 [11:17:52<47:06, 2826.67s/it]  

page 15 done


100%|██████████| 651/651 [28:30<00:00,  2.63s/it]
100%|██████████| 16/16 [11:46:23<00:00, 2648.97s/it]

page 16 done



