From d942d69b430a7951a7e31f61b49812343a476c9a Mon Sep 17 00:00:00 2001 From: Nguyen Quoc Viet Date: Sat, 18 May 2024 01:24:48 +0800 Subject: [PATCH 1/2] create input loop and remove unneeded flow --- linkedin_scraper/person.py | 70 -------------------------------------- samples/scrape_person.py | 34 +++++++++++++++--- 2 files changed, 30 insertions(+), 74 deletions(-) diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py index 1c297a5..a0f3157 100644 --- a/linkedin_scraper/person.py +++ b/linkedin_scraper/person.py @@ -249,7 +249,6 @@ def get_name_and_location(self): self.name = top_panel.find_element(By.TAG_NAME, "h1").text self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text - def get_about(self): try: about = self.driver.find_element(By.ID,"about").find_element(By.XPATH,"..").find_element(By.CLASS_NAME,"display-flex").text @@ -293,75 +292,6 @@ def scrape_logged_in(self, close_on_complete=True): self.get_educations() driver.get(self.linkedin_url) - - # get interest - try: - - _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until( - EC.presence_of_element_located( - ( - By.XPATH, - "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']", - ) - ) - ) - interestContainer = driver.find_element(By.XPATH, - "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']" - ) - for interestElement in interestContainer.find_elements(By.XPATH, - "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']" - ): - interest = Interest( - interestElement.find_element(By.TAG_NAME, "h3").text.strip() - ) - self.add_interest(interest) - except: - pass - - # get accomplishment - try: - _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until( - EC.presence_of_element_located( - ( - By.XPATH, - "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']", - ) - ) - ) - acc = driver.find_element(By.XPATH, - "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']" - ) - for block in acc.find_elements(By.XPATH, - "//div[@class='pv-accomplishments-block__content break-words']" - ): - category = block.find_element(By.TAG_NAME, "h3") - for title in block.find_element(By.TAG_NAME, - "ul" - ).find_elements(By.TAG_NAME, "li"): - accomplishment = Accomplishment(category.text, title.text) - self.add_accomplishment(accomplishment) - except: - pass - - # get connections - try: - driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/") - _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until( - EC.presence_of_element_located((By.CLASS_NAME, "mn-connections")) - ) - connections = driver.find_element(By.CLASS_NAME, "mn-connections") - if connections is not None: - for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"): - anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link") - url = anchor.get_attribute("href") - name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip() - occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip() - - contact = Contact(name=name, occupation=occupation, url=url) - self.add_contact(contact) - except: - connections = None - if close_on_complete: driver.quit() diff --git a/samples/scrape_person.py b/samples/scrape_person.py index 7d4e93f..8578d2a 100644 --- a/samples/scrape_person.py +++ b/samples/scrape_person.py @@ -1,9 +1,35 @@ import os -from linkedin_scraper import Person, actions +from linkedin_scraper import Person, actions, Company from selenium import webdriver -driver = webdriver.Chrome("./chromedriver") + +driver = webdriver.Chrome() email = os.getenv("LINKEDIN_USER") password = os.getenv("LINKEDIN_PASSWORD") -actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal -person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver) +actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal +user_input = [] +urls = [] +while True: + user_input = input("Enter a comma-separated list of strings: ") + if user_input == "exit": + break + urls = user_input.split(",") + results = [] + for url in urls: + print(f'scraping {url}') + person = Person(url, driver=driver, close_on_complete=False) + company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False) + results.append((person, company)) + + print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size') + for person, company in results: + experience = person.experiences[0] + print(f'"{person.name}", ' + f'"{person.location}", ' + f'"{experience.position_title}", ' + f'"{experience.institution_name}", ' + f'"{experience.linkedin_url}", ' + f'"{company.industry}", ' + f'"{company.website}", ' + f'"{company.company_size}", ' + ) From 31612bf9f9080536060d929b4ca4fe1e3397f66c Mon Sep 17 00:00:00 2001 From: Nguyen Quoc Viet Date: Sun, 19 May 2024 18:42:07 +0800 Subject: [PATCH 2/2] create loop program, log when error, and fix scraper --- linkedin_scraper/company.py | 6 +++--- linkedin_scraper/person.py | 23 +++++++++++++++++++---- samples/scrape_person.py | 3 ++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py index 77900eb..9293597 100644 --- a/linkedin_scraper/company.py +++ b/linkedin_scraper/company.py @@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): driver.get(self.linkedin_url) - _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]'))) + _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]'))) navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ") - self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip() + self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip() # Click About Tab or View All Link try: @@ -360,6 +360,6 @@ def __repr__(self): _output['affiliated_companies'] = self.affiliated_companies _output['employees'] = self.employees _output['headcount'] = self.headcount - + return json.dumps(_output).replace('\n', '') diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py index a0f3157..9c83217 100644 --- a/linkedin_scraper/person.py +++ b/linkedin_scraper/person.py @@ -1,3 +1,5 @@ +import time + import requests from selenium import webdriver from selenium.webdriver.common.by import By @@ -115,11 +117,13 @@ def get_experiences(self): self.scroll_to_bottom() main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main) for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"): - position = position.find_element(By.XPATH, "//div[@data-view-name='profile-component-entity']") + position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']") company_logo_elem, position_details = position.find_elements(By.XPATH, "*") # company elem company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href") + if not company_linkedin_url: + continue # position details position_details_list = position_details.find_elements(By.XPATH,"*") @@ -143,15 +147,26 @@ def get_experiences(self): company = outer_positions[0].find_element(By.TAG_NAME,"span").text work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text location = outer_positions[2].find_element(By.TAG_NAME,"span").text + else: + position_title = "" + company = outer_positions[0].find_element(By.TAG_NAME,"span").text + work_times = "" + location = "" + times = work_times.split("·")[0].strip() if work_times else "" duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None from_date = " ".join(times.split(" ")[:2]) if times else "" to_date = " ".join(times.split(" ")[3:]) if times else "" - - if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")) > 1: - descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li") + if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")): + inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container") + .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*") + .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item")) + else: + inner_positions = [] + if len(inner_positions) > 1: + descriptions = inner_positions for description in descriptions: res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*") position_title_elem = res[0] if len(res) > 0 else None diff --git a/samples/scrape_person.py b/samples/scrape_person.py index 8578d2a..c34f70e 100644 --- a/samples/scrape_person.py +++ b/samples/scrape_person.py @@ -10,7 +10,7 @@ user_input = [] urls = [] while True: - user_input = input("Enter a comma-separated list of strings: ") + user_input = input("Enter a comma-separated list of linkedin urls: ") if user_input == "exit": break urls = user_input.split(",") @@ -21,6 +21,7 @@ company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False) results.append((person, company)) + print('RESULTS:') print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size') for person, company in results: experience = person.experiences[0]