In [1]:
import datetime
import getpass
import os
import random
import re
import time
from collections import defaultdict

import bs4
import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException, ElementNotInteractableException, TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [2]:
# Load the list of companies that drives the scrape. Later cells read the
# 'officer_linkedin', 'officer' and 'index' columns of each row.
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine that has this exact file location.
companies = pd.read_csv('/Users/ZiyuChen/assignment/linkedin-web-scraper/companies.csv')

In [3]:
def set_driver():
    """Create and return a Chrome WebDriver configured for scraping.

    The browser is started maximized, with the 'enable-automation' switch
    excluded and with the "none" page-load strategy so that ``driver.get``
    does not block until the page has finished loading.

    Returns:
        The newly started ``webdriver.Chrome`` instance.
    """
    # Define Chrome options to open the window in maximized mode
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")

    # Anti-web-scraping: do not start Chrome with the automation switch
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    # No need to wait for the page to finish loading.
    # Work on a copy so the shared DesiredCapabilities.CHROME dict is not
    # modified for every later caller, and hand the capabilities to the driver
    # explicitly; previously the dict was edited but never passed on.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities["pageLoadStrategy"] = "none"

    # Initialize the Chrome webdriver
    driver = webdriver.Chrome(
        options=options,
        desired_capabilities=capabilities,
        executable_path='/Users/ZiyuChen/Desktop/chromedriver',
    )

    return driver

In [4]:
def load_page(url, targets=None, pause_time=1, timeout=60, initial_wait=10):
    """Open ``url`` in the global ``driver`` and scroll until targets appear.

    After an initial wait the page is scrolled down in 100-pixel steps
    (wrapping back to the top when the offset reaches 5000) until every
    requested element can be found or ``timeout`` seconds have passed since
    the navigation started. The page is then scrolled back to the top and the
    elapsed time is printed; the message is the same whether the targets were
    found or the timeout was hit.

    Args:
        url: Address to load.
        targets: Optional dict with an ``'id'`` and/or ``'class'`` key, each
            mapping to a list of element ids / class names that must all be
            present. With no targets the loop ends after the first scroll.
        pause_time: Seconds to sleep after each scroll step.
        timeout: Maximum number of seconds to keep scrolling.
        initial_wait: Seconds to sleep right after requesting the page.
    """
    # A None default avoids sharing one mutable dict between calls.
    targets = targets or {}

    driver.get(url)

    # Record the starting time
    start = datetime.datetime.now()
    time.sleep(initial_wait)
    scroll_y = 0
    while True:
        # Scroll down bit by bit
        driver.execute_script("window.scrollTo(0, %d);" % scroll_y)

        # wait to load page
        time.sleep(pause_time)

        if (datetime.datetime.now() - start).total_seconds() > timeout:
            break

        # Stop as soon as every requested element exists on the page
        try:
            for target in targets.get('id', ()):
                driver.find_element_by_id(target)
            for target in targets.get('class', ()):
                driver.find_element_by_class_name(target)
            break
        except NoSuchElementException:
            scroll_y += 100
            if scroll_y == 5000:
                scroll_y = 0

    driver.execute_script("window.scrollTo(0, 0);")

    # Record the end time, then calculate and print the total time
    delta = datetime.datetime.now() - start
    print("Took {} to fully load {}".format(delta, url))

In [5]:
# Column-oriented accumulators filled by get_officer_data(): each maps a
# column name to the list of values collected so far.
experience_table, education_table, profile_table = (
    defaultdict(list) for _ in range(3)
)

In [42]:
def login(email, password, wait_after_login=180):
    """Sign in to LinkedIn through the global ``driver``.

    Loads the sign-in page, fills in the e-mail and password fields, clicks
    the sign-in button and then sleeps.

    Args:
        email: Account e-mail typed into the ``username`` field.
        password: Account password typed into the ``password`` field.
        wait_after_login: Seconds to sleep after clicking sign-in.
            NOTE(review): presumably this leaves time to clear a manual
            verification step; confirm before lowering it.
    """
    # Load Linked-in's signin page:
    load_page('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin', {'id':['username', 'password'],'class':['login__form_action_container','btn__primary--large']})

    # Clear and fill the e-mail field first, then the password field
    for field_id, value in (('username', email), ('password', password)):
        field = driver.find_element_by_id(field_id)
        field.clear()
        field.send_keys(value)

    # Select the signin button inside the form's action container and press it
    form_actions = driver.find_element_by_class_name('login__form_action_container ')
    signin_button = form_actions.find_element_by_class_name('btn__primary--large')
    signin_button.click()
    time.sleep(wait_after_login)
    
def get_officer_data(company):
    """Scrape one officer's profile and append the results to the global tables.

    Loads the officer's LinkedIn page, expands the collapsed sections, collects
    the experience and education entries and, when the profile shows a photo
    whose source is an http(s) URL, opens the photo detail page to read the
    image address. Rows are appended to ``experience_table``,
    ``education_table`` and ``profile_table`` only after all scraping steps
    have finished.

    Args:
        company: Row-like mapping providing ``'officer_linkedin'`` (profile
            URL), ``'index'`` (stored as ``founder_id``) and ``'officer'``
            (stored as ``founder_name``).
    """
    # Load the founder's linkedin page
    load_page(company['officer_linkedin'], {'id':['experience-section', 'education-section']}, 0.1)
    time.sleep(5)
    # Press the see more button until all work experience and education are displayed
    press_all_show_more()
    press_all_show_more()
    print('Finished pressing all show more buttons')
    # Collect all work experience
    experience_list = get_officer_experience_list(company['index'], company['officer'])
    # Collect all educational experience
    education_list = get_officer_education_list(company['index'], company['officer'])

    # Get the founder's profile photo url.
    profile_url = None
    try:
        thumbnail_src = driver.find_element_by_class_name('pv-top-card__photo').get_attribute("src")
        # get_attribute returns None when the element has no src attribute, so
        # check for a value before calling startswith on it.
        if thumbnail_src and thumbnail_src.startswith('http'):
            # Load the founder's photo detail page
            url = company['officer_linkedin'].strip()
            if not url.endswith('/'):
                url += '/'
            url += 'detail/photo/'
            load_page(url, {'class':['pv-member-photo-modal__profile-image']})
            profile_url = get_officer_photo()
    except NoSuchElementException:
        profile_url = None

    # The list helpers may return None, which is treated like "no entries".
    for experience in experience_list or []:
        for key, value in experience.items():
            experience_table[key].append(value)
    for education in education_list or []:
        for key, value in education.items():
            education_table[key].append(value)

    profile_table['founder_id'].append(company['index'])
    profile_table['founder_name'].append(company['officer'])
    profile_table['url'].append(profile_url)

def press_all_show_more(pause_time=1, max_intercepts=30):
    """Click every inline "see more" button on the current page.

    Repeatedly looks up the first ``pv-profile-section__see-more-inline``
    button and clicks it. The loop ends when no such button is found, when the
    button is not interactable, or when the click has been intercepted
    ``max_intercepts`` times in a row (previously an overlay that never went
    away kept the loop spinning forever).

    Args:
        pause_time: Seconds to sleep between finding a button and clicking it.
        max_intercepts: Number of consecutive intercepted clicks tolerated
            before giving up.
    """
    intercepts = 0
    while True:
        try:
            see_more_button = driver.find_element_by_xpath('//button[@class="pv-profile-section__see-more-inline"]')
            time.sleep(pause_time)
            see_more_button.click()
            # A successful click resets the run of intercepted attempts
            intercepts = 0
        except ElementClickInterceptedException:
            intercepts += 1
            if intercepts >= max_intercepts:
                print('Gave up on an intercepted show more button')
                break
        except (ElementNotInteractableException, NoSuchElementException):
            break

def get_officer_experience_list(founder_id, founder_name, max_retries=5):
    """Collect every experience record shown on the current profile page.

    Args:
        founder_id: Identifier copied into each record.
        founder_name: Name copied into each record.
        max_retries: How many times to start over when the page's elements go
            stale while they are being read.

    Returns:
        A flat list of experience dicts (one per role), or ``None`` when a
        required element such as the ``experience-section`` cannot be found.

    Raises:
        StaleElementReferenceException: If the elements are still stale after
            ``max_retries`` retries. This replaces the earlier unbounded
            recursion, which could only end in a RecursionError.
    """
    retries_left = max(max_retries, 0)
    while True:
        try:
            section = driver.find_element_by_id('experience-section')
            items = section.find_elements_by_class_name('pv-profile-section__list-item')
            experience_list = []
            for experience_el in items:
                # One list item may expand into several role records
                experience_list += get_officer_experience(experience_el, founder_id, founder_name)
            return experience_list
        except NoSuchElementException:
            print('Cannot access officer experience')
            return None
        except StaleElementReferenceException:
            if retries_left == 0:
                raise
            retries_left -= 1
    

def _experience_text(element, class_names, label=None):
    """Return the text found by following ``class_names`` below ``element``.

    Returns ``None`` when any element along the chain is missing. When
    ``label`` is given, that label text is removed and the result stripped.
    """
    try:
        for class_name in class_names:
            element = element.find_element_by_class_name(class_name)
        text = element.text
    except NoSuchElementException:
        return None
    if label is None:
        return text
    return text.replace(label, '').strip()


def _experience_dates_and_place(element):
    """Read the time, duration and location fields found below ``element``."""
    return {
        'job_time': _experience_text(element, ('pv-entity__date-range',), 'Dates Employed'),
        'job_duration': _experience_text(element, ('pv-entity__bullet-item-v2',)),
        'job_location': _experience_text(element, ('pv-entity__location',), 'Location'),
    }


def get_officer_experience(experience_el, founder_id, founder_name):
    """Turn one experience list item into a list of role records.

    Two page layouts are handled. If the item contains a position group and a
    company summary, one record is produced per role in the group, all sharing
    the company name. Otherwise the item is read as a single role from its
    summary section.

    Args:
        experience_el: Element of one ``pv-profile-section__list-item``.
        founder_id: Identifier copied into each record.
        founder_name: Name copied into each record.

    Returns:
        A list of dicts with the keys ``founder_id``, ``founder_name``,
        ``job_company``, ``job_title``, ``job_time``, ``job_duration`` and
        ``job_location``; fields that are not on the page are ``None``.

    Raises:
        NoSuchElementException: If the item matches neither layout, i.e. it has
            no grouped-role structure and no summary section.
    """
    try:
        role_els = experience_el.find_element_by_class_name('pv-entity__position-group').find_elements_by_class_name('pv-entity__position-group-role-item')
        # Get company (shared by every role in the group)
        company = experience_el.find_element_by_class_name('pv-entity__company-summary-info').find_element_by_class_name('t-16').text.replace('Company Name', '').strip()
    except NoSuchElementException:
        role_els = None

    if role_els is not None:
        records = []
        for role_el in role_els:
            record = {
                'founder_id': founder_id,
                'founder_name': founder_name,
                'job_company': company,
                'job_title': _experience_text(
                    role_el,
                    ('pv-entity__summary-info--background-section', 't-bold'),
                    'Title',
                ),
            }
            record.update(_experience_dates_and_place(role_el))
            records.append(record)
        return records

    # Single-role layout: everything hangs off the summary section
    summary_el = experience_el.find_element_by_class_name('pv-entity__summary-info--background-section')
    record = {
        'founder_id': founder_id,
        'founder_name': founder_name,
        'job_company': _experience_text(summary_el, ('pv-entity__secondary-title',)),
        'job_title': _experience_text(summary_el, ('t-16',)),
    }
    record.update(_experience_dates_and_place(summary_el))
    return [record]

def get_officer_education_list(founder_id, founder_name, max_retries=5):
    """Collect every education record shown on the current profile page.

    Args:
        founder_id: Identifier copied into each record.
        founder_name: Name copied into each record.
        max_retries: How many times to start over when the page's elements go
            stale while they are being read.

    Returns:
        A list with one education dict per list item, or ``None`` when the
        ``education-section`` cannot be found.

    Raises:
        StaleElementReferenceException: If the elements are still stale after
            ``max_retries`` retries. This replaces the earlier unbounded
            recursion, which could only end in a RecursionError.
    """
    retries_left = max(max_retries, 0)
    while True:
        try:
            section = driver.find_element_by_id('education-section')
            items = section.find_elements_by_class_name('pv-profile-section__list-item')
            return [
                get_officer_education(education_el, founder_id, founder_name)
                for education_el in items
            ]
        except NoSuchElementException:
            print('Cannot access officer education')
            return None
        except StaleElementReferenceException:
            if retries_left == 0:
                raise
            retries_left -= 1


def get_officer_education(education_el, founder_id, founder_name):
    """Build the education record for one education list item.

    Each field is looked up by class name below ``education_el``; a field whose
    element is missing stays ``None``.

    Returns:
        A dict with the keys ``founder_id``, ``founder_name``,
        ``education_school``, ``education_level`` and ``education_time``.
    """
    # (record key, class name to look up, label text removed from the value;
    #  None means the text is stored exactly as read)
    field_specs = (
        ('education_school', 'pv-entity__school-name', None),
        ('education_level', 'pv-entity__secondary-title', 'Field Of Study'),
        ('education_time', 'pv-entity__dates', "Dates attended or expected graduation"),
    )

    record = dict(
        founder_id=founder_id,
        founder_name=founder_name,
        education_school=None,
        education_level=None,
        education_time=None,
    )

    for key, class_name, label in field_specs:
        try:
            raw_text = education_el.find_element_by_class_name(class_name).text
        except NoSuchElementException:
            continue
        record[key] = raw_text if label is None else raw_text.replace(label, '').strip()

    return record
    
def get_officer_photo():
    """Return the ``src`` of the photo-modal image on the current page.

    Returns ``None`` when the ``pv-member-photo-modal__profile-image`` element
    is not present.
    """
    try:
        photo_el = driver.find_element_by_class_name('pv-member-photo-modal__profile-image')
        return photo_el.get_attribute("src")
    except NoSuchElementException:
        return None

In [None]:
# Range of rows in `companies` handled by this run, and how often a single
# profile is retried after a stale-element or timeout error before skipping it.
START_INDEX = 1000
END_INDEX = 1500
MAX_RETRIES = 5


def _profile_is_unavailable():
    """Return True when the current page carries a 'profile unavailable' marker."""
    for marker in ('profile-unavailable', 'profile-not-found__content'):
        try:
            driver.find_element_by_class_name(marker)
            return True
        except NoSuchElementException:
            continue
    return False


driver = set_driver()

# login() requires both credentials. Read them from the environment and fall
# back to an interactive prompt so they never have to be stored in the notebook.
login(
    os.environ.get('LINKEDIN_EMAIL') or input('LinkedIn email: '),
    os.environ.get('LINKEDIN_PASSWORD') or getpass.getpass('LinkedIn password: '),
)

i = START_INDEX
attempts = 0
# Never index past the end of the company list
stop = min(END_INDEX, len(companies))
while i < stop:
    company = companies.iloc[i]
    try:
        profile_link = company['officer_linkedin']
        if isinstance(profile_link, str) and profile_link.startswith('http'):
            get_officer_data(company)
            print('Attempted to collect No. ' + str(i))
        else:
            print('No. ' + str(i) + ' is not available')
    except (StaleElementReferenceException, TimeoutException):
        if _profile_is_unavailable():
            # Nothing to scrape for this officer; move on to the next row
            pass
        elif attempts < MAX_RETRIES:
            # Transient failure: try the same row again
            attempts += 1
            continue
        else:
            print('Giving up on No. ' + str(i) + ' after ' + str(MAX_RETRIES) + ' retries')
    i += 1
    attempts = 0

In [56]:
# Turn the three column-wise accumulators into DataFrames (profile first,
# then education, then experience).
profile_df, education_df, experience_df = (
    pd.DataFrame(table)
    for table in (profile_table, education_table, experience_table)
)

In [57]:
# Write each result table to its own CSV file.
for csv_path, frame in (
    ('/Users/ZiyuChen/assignment/linkedin-web-scraper/profile4.csv', profile_df),
    ('/Users/ZiyuChen/assignment/linkedin-web-scraper/education4.csv', education_df),
    ('/Users/ZiyuChen/assignment/linkedin-web-scraper/experience4.csv', experience_df),
):
    frame.to_csv(csv_path)

In [53]:
# for key in profile_table.keys():
#     profile_table[key].pop()