From d942d69b430a7951a7e31f61b49812343a476c9a Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet <nguyenqviet98@gmail.com>
Date: Sat, 18 May 2024 01:24:48 +0800
Subject: [PATCH 1/2] create input loop and remove unneeded flow

---
 linkedin_scraper/person.py | 70 --------------------------------------
 samples/scrape_person.py   | 34 +++++++++++++++---
 2 files changed, 30 insertions(+), 74 deletions(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index 1c297a5..a0f3157 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -249,7 +249,6 @@ def get_name_and_location(self):
         self.name = top_panel.find_element(By.TAG_NAME, "h1").text
         self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text
 
-
     def get_about(self):
         try:
             about = self.driver.find_element(By.ID,"about").find_element(By.XPATH,"..").find_element(By.CLASS_NAME,"display-flex").text
@@ -293,75 +292,6 @@ def scrape_logged_in(self, close_on_complete=True):
         self.get_educations()
 
         driver.get(self.linkedin_url)
-
-        # get interest
-        try:
-
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            interestContainer = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for interestElement in interestContainer.find_elements(By.XPATH,
-                "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
-            ):
-                interest = Interest(
-                    interestElement.find_element(By.TAG_NAME, "h3").text.strip()
-                )
-                self.add_interest(interest)
-        except:
-            pass
-
-        # get accomplishment
-        try:
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            acc = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for block in acc.find_elements(By.XPATH,
-                "//div[@class='pv-accomplishments-block__content break-words']"
-            ):
-                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME,
-                    "ul"
-                ).find_elements(By.TAG_NAME, "li"):
-                    accomplishment = Accomplishment(category.text, title.text)
-                    self.add_accomplishment(accomplishment)
-        except:
-            pass
-
-        # get connections
-        try:
-            driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
-            )
-            connections = driver.find_element(By.CLASS_NAME, "mn-connections")
-            if connections is not None:
-                for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
-                    anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
-                    url = anchor.get_attribute("href")
-                    name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
-                    occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
-                    contact = Contact(name=name, occupation=occupation, url=url)
-                    self.add_contact(contact)
-        except:
-            connections = None
-
         if close_on_complete:
             driver.quit()
 
diff --git a/samples/scrape_person.py b/samples/scrape_person.py
index 7d4e93f..8578d2a 100644
--- a/samples/scrape_person.py
+++ b/samples/scrape_person.py
@@ -1,9 +1,35 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()
 
 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password)  # if email and password aren't given, it'll prompt in the terminal
+user_input = []
+urls = []
+while True:
+    user_input = input("Enter a comma-separated list of strings: ")
+    if user_input == "exit":
+        break
+    urls = user_input.split(",")
+    results = []
+    for url in urls:
+        print(f'scraping {url}')
+        person = Person(url,  driver=driver, close_on_complete=False)
+        company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+        results.append((person, company))
+
+    print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+    for person, company in results:
+        experience = person.experiences[0]
+        # Emit a real CSV row: comma-only separators, no trailing comma, and
+        # doubled embedded quotes, so the 8 fields line up with the header.
+        row = (person.name, person.location,
+               experience.position_title,
+               experience.institution_name,
+               experience.linkedin_url,
+               company.industry, company.website,
+               company.company_size)
+        print(','.join('"{}"'.format(str(field).replace('"', '""')) for field in row))

From 31612bf9f9080536060d929b4ca4fe1e3397f66c Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet <nguyenqviet98@gmail.com>
Date: Sun, 19 May 2024 18:42:07 +0800
Subject: [PATCH 2/2] Prompt for LinkedIn URLs, skip experiences without a company link, and fix scraper selectors

---
 linkedin_scraper/company.py |  6 +++---
 linkedin_scraper/person.py  | 23 +++++++++++++++++++----
 samples/scrape_person.py    |  3 ++-
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py
index 77900eb..9293597 100644
--- a/linkedin_scraper/company.py
+++ b/linkedin_scraper/company.py
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
 
         driver.get(self.linkedin_url)
 
-        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))
 
         navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")
 
-        self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+        self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()
 
         # Click About Tab or View All Link
         try:
@@ -360,6 +360,6 @@ def __repr__(self):
         _output['affiliated_companies'] = self.affiliated_companies
         _output['employees'] = self.employees
         _output['headcount'] = self.headcount
-        
+
         return json.dumps(_output).replace('\n', '')
 
diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index a0f3157..9c83217 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -115,11 +117,13 @@ def get_experiences(self):
         self.scroll_to_bottom()
         main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
         for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
-            position = position.find_element(By.XPATH, "//div[@data-view-name='profile-component-entity']")
+            position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
             company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
 
             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+            if not company_linkedin_url:
+                continue
 
             # position details
             position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@ def get_experiences(self):
                     company = outer_positions[0].find_element(By.TAG_NAME,"span").text
                     work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
                     location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+            else:
+                position_title = ""
+                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+                work_times = ""
+                location = ""
+
 
             times = work_times.split("·")[0].strip() if work_times else ""
             duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
 
             from_date = " ".join(times.split(" ")[:2]) if times else ""
             to_date = " ".join(times.split(" ")[3:]) if times else ""
-
-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")
+            # Nested roles sit in an inner "pvs-list__container" (a class, not an attribute); find_elements gives [] if absent.
+            inner_positions = []
+            if position_summary_text and position_summary_text.find_elements(By.CLASS_NAME, "pvs-list__container"):
+                inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+                                  .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+                                  .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+            if len(inner_positions) > 1:
+                descriptions = inner_positions
                 for description in descriptions:
                     res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                     position_title_elem = res[0] if len(res) > 0 else None
diff --git a/samples/scrape_person.py b/samples/scrape_person.py
index 8578d2a..c34f70e 100644
--- a/samples/scrape_person.py
+++ b/samples/scrape_person.py
@@ -10,7 +10,7 @@
 user_input = []
 urls = []
 while True:
-    user_input = input("Enter a comma-separated list of strings: ")
+    user_input = input("Enter a comma-separated list of linkedin urls: ")
     if user_input == "exit":
         break
     urls = user_input.split(",")
@@ -21,6 +21,7 @@
         company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
         results.append((person, company))
 
+    print('RESULTS:')
     print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
     for person, company in results:
         experience = person.experiences[0]