In [1]:
###########################################
# Imports
###########################################

import time
import pandas as pd
from parsel import Selector
from bs4 import BeautifulSoup
import os, time, random, json, re
from csv import writer
from time import sleep
import tkinter as tk

#Selenium imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

In [2]:
###########################################
# Logging In
###########################################

# First step: log in to LinkedIn with the account credentials
def login():
    """Open the LinkedIn login page and submit the account credentials.

    The e-mail and password are read from the ``LINKEDIN_USERNAME`` and
    ``LINKEDIN_PASSWORD`` environment variables so that real credentials never
    have to be written into the notebook. When a variable is not set, the
    original placeholder string is typed instead.

    Relies on the module-level ``driver`` and ``random_time_list_1``.
    """
    linkedin_user = os.environ.get('LINKEDIN_USERNAME', 'yourEmails')
    linkedin_password = os.environ.get('LINKEDIN_PASSWORD', 'yourPassword')

    driver.get('https://www.linkedin.com/login')
    # Random pause before interacting with the form.
    time.sleep(random.choice(random_time_list_1))
    username = driver.find_element(By.ID, 'username')
    username.send_keys(linkedin_user)
    password = driver.find_element(By.ID, 'password')
    password.send_keys(linkedin_password)
    time.sleep(random.choice(random_time_list_1))
    # Submitting through the password field submits the enclosing form.
    password.submit()

In [3]:
###########################################
# Researching IT professionals on LinkedIn
###########################################

def search(query):
    """Type ``query`` into the global search bar and open the first result filter.

    Both steps are retried until they succeed. Only ``Exception`` subclasses are
    retried, so ``KeyboardInterrupt`` / ``SystemExit`` still stop the loops (a
    bare ``except:`` would swallow them and make the cell impossible to
    interrupt).

    Parameters
    ----------
    query : str
        Text sent to the search field, followed by RETURN.

    Returns
    -------
    str
        ``driver.current_url`` after the filter button has been clicked.

    Relies on the module-level ``driver``.
    """
    search_box_xpath = '//*[@id="global-nav-typeahead"]/input'
    # First button of the filter bar -- presumably the "People" filter; verify
    # against the live page if LinkedIn changes its layout.
    first_filter_xpath = '//*[@id="search-reusables__filters-bar"]/ul/li[1]/button'

    time.sleep(2)
    wait = WebDriverWait(driver, 4)

    # Step 1: locate the search bar, type the query and submit it.
    while True:
        try:
            search_field = wait.until(
                EC.presence_of_element_located((By.XPATH, search_box_xpath))
            )
            time.sleep(0.2)
            search_field.send_keys(query)
            search_field.send_keys(Keys.RETURN)
            break
        except Exception:
            # Element not ready yet (timeout, stale element, ...): retry shortly.
            time.sleep(1)

    # Step 2: click the first filter button on the results page.
    while True:
        try:
            people = wait.until(
                EC.presence_of_element_located((By.XPATH, first_filter_xpath))
            )
            people.click()
            time.sleep(0.2)
            break
        except Exception:
            time.sleep(1)

    time.sleep(2)
    return driver.current_url

In [4]:
##################################
# Extract experiences
##################################

def extract_experiences(experience_tags):
    """Build one record per experience entry found in a profile's experience section.

    Parameters
    ----------
    experience_tags : iterable
        BeautifulSoup tags (the company-logo anchors); for each one the fourth
        child of its grand-parent is parsed as the experience container.

    Returns
    -------
    list of dict
        Records with the keys ``job_title``, ``company``, ``job_type``,
        ``job_duration`` and ``order`` (1-based position of the tag in
        ``experience_tags``). Entries whose markup cannot be parsed are skipped.

    Relies on the module-level ``r_expression`` to pull the visible text out of
    each ``<span>``.
    """
    experience_list = []

    order = 1
    for exp_tag in experience_tags:
        experience_div = exp_tag.parent.parent
        experience_div = [*experience_div.children][3]

        try:
            if experience_div.div.a is not None:
                # Multiple job positions within the same company
                multi_job = experience_div.find_all("div", class_="display-flex align-items-center")
                company = multi_job[0].span.span
                job_title = multi_job[1].span.span
                job_type = experience_div.find("span", class_="t-14 t-normal").span

                job_title = re.search(r_expression, str(job_title)).group(0)
                company = re.search(r_expression, str(company)).group(0)
                job_type = re.search(r_expression, str(job_type)).group(0)

                # Keep the full "<type> · <duration>" text here: truncating it
                # before building the record made the '·' checks below always
                # false, so job_type was always None and the employment type
                # ended up in job_duration.
                has_separator = '·' in job_type

                experience_list.append(
                    {
                        "job_title": job_title,
                        "company": company,
                        "job_type": job_type.split('·')[0].strip() if has_separator else None,
                        "job_duration": job_type.split('·')[1].strip() if has_separator else job_type,
                        "order": order
                    }
                )

        except Exception as e:
            # Best effort: report the problem and still try the single-position layout.
            print(e)
        try:
            if experience_div.div.div is not None:
                # Single job position
                job_title = experience_div.find("div", class_="display-flex align-items-center mr1 t-bold").span

                company = experience_div.find("span", class_="t-14 t-normal").span
                duration = experience_div.find("span", class_="t-14 t-normal t-black--light").span

                job_title = re.search(r_expression, str(job_title)).group(0)
                company = re.search(r_expression, str(company)).group(0)
                duration = re.search(r_expression, str(duration)).group(0)

                experience_list.append(
                    {
                        "job_title": job_title.strip(),
                        "company": company.split('·')[0].strip() if '·' in company else company,
                        "job_type": company.split('·')[1].strip() if '·' in company else None,
                        "job_duration": duration.split('·')[-1].strip(),
                        "order": order
                    }
                )
        except Exception:
            # Layout did not match the single-position markup: skip silently.
            pass
        order += 1

    return experience_list

In [5]:
##################################
# Extract other achievements
##################################

def extract_achievements(head_section, category):
    """Collect title/institute records from one profile section.

    Parameters
    ----------
    head_section : BeautifulSoup tag
        Section whose matching ``<a>`` and ``<div>`` wrappers are scanned.
    category : str
        When equal to ``"education"`` the first line is stored as the institute
        and the second as the title; for any other value the two are swapped.

    Returns
    -------
    list of dict
        Records with the keys ``title``, ``institute`` and ``order`` (1-based
        position of the wrapper among the matched tags). A text that cannot be
        extracted is stored as ``None``.

    Relies on the module-level ``r_expression``.
    """
    def _visible_text(node):
        # Visible text of the node's markup, or None when the pattern does not match.
        found = re.search(r_expression, str(node))
        return found.group(0) if found else None

    wrappers = head_section.find_all(
        "a", class_="optional-action-target-wrapper display-flex flex-column full-width"
    )
    wrappers += head_section.find_all("div", class_="display-flex flex-column full-width")

    is_education = category == "education"
    records = []
    for position, wrapper in enumerate(wrappers, start=1):
        first_line_node = wrapper.div.span
        second_line_wrapper = wrapper.find("span", class_="t-14 t-normal")
        second_line_node = second_line_wrapper.span if second_line_wrapper else None

        second_line = _visible_text(second_line_node)
        first_line = _visible_text(first_line_node)

        if is_education:
            record = {"title": second_line, "institute": first_line, "order": position}
        else:
            # For the other sections the two lines appear in the opposite order.
            record = {"title": first_line, "institute": second_line, "order": position}
        records.append(record)

    return records


In [6]:
#################################################################
# Scraping the profiles of IT professionals and their projects
#################################################################

def _collect_profile_urls(link_elements, undesirable_key_words):
    """Return the distinct LinkedIn URLs of the links, minus those containing an undesirable keyword."""
    profile_urls = []
    for link in link_elements:
        url = link.get_attribute('href')
        # Links without an href yield None; skip them instead of crashing on str methods.
        if not url or url in profile_urls:
            continue
        if 'linkedin' not in url:
            continue
        if any(key_word in url for key_word in undesirable_key_words):
            continue
        profile_urls.append(url)
    return profile_urls


def _tag_records(records, profile_url):
    """Return copies of ``records`` with a ``profile`` key set to ``profile_url``."""
    return [{**record, "profile": profile_url} for record in records]


def scrape_profiles(output_dir='/home/yassine/Desktop/End of studies Project/Scraped data'):
    """Scrape every profile linked from the current results page and append the data to CSV files.

    For each profile link on the page currently shown by ``driver`` the profile
    is opened and scrolled, its top card (name, location, designation) is
    parsed and, when the last comma-separated part of the location is one of
    the desirable countries, its experience, education, certification and
    course sections are extracted.

    Parameters
    ----------
    output_dir : str, optional
        Directory receiving ``scraped_profiles.csv``, ``scraped_experiences.csv``,
        ``scraped_education.csv``, ``scraped_certification.csv`` and
        ``scraped_courses.csv``. Rows are appended without a header. Defaults to
        the original hard-coded location; it is created when missing.

    Notes
    -----
    Rows are collected in plain lists and turned into DataFrames once, with an
    explicit column order. Because the CSV files are appended without a header,
    the column order must not depend on which profile happened to be scraped
    first (concatenating onto an empty frame could reorder the columns).

    Relies on the module-level ``driver``, ``random_time_list_1``,
    ``random_time_list_2``, ``extract_experiences`` and ``extract_achievements``.
    """
    undesirable_key_words = ['feed','mynetwork','jobs','messaging','notifications','premium','products', 'sales', 'story', 'company', 'results', 'people']
    desirable_countries = ['Canada','United States', 'Belgium', 'Bulgaria', 'Croatia', 'Czech Republic', 'Denmark',
                           'Germany', 'Estonia', 'Greece', 'Spain', 'France', 'Ireland', 'Italy', 'Cyprus', 'Latvia',
                           'Lithuania', 'Luxembourg', 'Hungary', 'Malta', 'The Netherlands', 'Austria', 'Poland',
                           'Portugal', 'Romania', 'Slovenia', 'Slovakia', 'Finland' , 'Sweden' , 'Tunisia']

    profile_columns = ["profile", "name", "location", "designation"]
    experience_columns = ["job_title", "company", "job_type", "job_duration", "order", "profile"]
    achievement_columns = ["title", "institute", "order", "profile"]
    # section id -> (category passed to extract_achievements, progress message)
    achievement_sections = {
        "education": ("education", "found education"),
        "licenses_and_certifications": ("certification", "found certifications"),
        "courses": ("courses", "found courses"),
    }

    profile_rows = []
    experience_rows = []
    achievement_rows = {"education": [], "certification": [], "courses": []}
    final_profiles_list = []

    # Collect every URL first: navigating away invalidates the link elements.
    links = driver.find_elements(By.CLASS_NAME, 'app-aware-link')
    all_profile_URL = _collect_profile_urls(links, undesirable_key_words)

    for profile in all_profile_URL:
        driver.get(profile)
        # Scroll down step by step before reading the page source.
        for _ in range(10):
            driver.execute_script("window.scrollBy(0, arguments[0]);", 600)
            time.sleep(random.choice(random_time_list_2))

        soup = BeautifulSoup(driver.page_source, "lxml")
        time.sleep(random.choice(random_time_list_1))

        profile_url = driver.current_url
        profile_cards = soup.find_all("section", {"class": "artdeco-card ember-view pv-top-card"})
        if not profile_cards or profile_url in final_profiles_list:
            continue
        final_profiles_list.append(profile_url)

        # Extract basic info
        profile_card = profile_cards[0]
        try:
            name = profile_card.find("div", class_="pv-text-details__left-panel").div.h1.string.strip()
            location = profile_card.find("div", class_="pv-text-details__left-panel mt2").span.string.strip()
            designation = profile_card.find("div", class_="text-body-medium break-words").string.strip()
        except AttributeError as error:
            # A missing element must not abort the whole page and lose the rows collected so far.
            print("Skipping " + profile_url + ": could not parse the top card (" + str(error) + ")")
            continue

        country = location.split(',')[-1].strip()
        if country not in desirable_countries:
            continue

        profile_rows.append(
            {"profile": profile_url, "name": name, "location": location, "designation": designation}
        )

        sections = soup.find_all("section", {"class": "artdeco-card ember-view relative break-words pb3 mt2"})
        for section in sections:
            anchor = section.div
            # Sections without a leading div, or whose div has no id, are ignored.
            section_id = anchor.get("id") if anchor is not None else None

            if section_id == "experience":
                print("found experience")
                experiences = section.find_all("a", {"data-field": "experience_company_logo", "class": "optional-action-target-wrapper display-flex"})
                if len(experiences):
                    experience_rows.extend(_tag_records(extract_experiences(experiences), profile_url))
            elif section_id in achievement_sections:
                category, message = achievement_sections[section_id]
                print(message)
                achievement_rows[category].extend(
                    _tag_records(extract_achievements(section, category), profile_url)
                )

        time.sleep(random.choice(random_time_list_1))

    df_profile = pd.DataFrame(profile_rows, columns=profile_columns)
    df_experiences = pd.DataFrame(experience_rows, columns=experience_columns)
    df_education = pd.DataFrame(achievement_rows["education"], columns=achievement_columns)
    df_certification = pd.DataFrame(achievement_rows["certification"], columns=achievement_columns)
    df_courses = pd.DataFrame(achievement_rows["courses"], columns=achievement_columns)

    # Show what was collected, then append it to the datasets.
    display(df_profile)
    display(df_experiences)
    display(df_education)
    display(df_certification)
    display(df_courses)
    print("Writing dataframes to csv")
    os.makedirs(output_dir, exist_ok=True)
    outputs = [
        (df_profile, 'scraped_profiles.csv'),
        (df_experiences, 'scraped_experiences.csv'),
        (df_education, 'scraped_education.csv'),
        (df_certification, 'scraped_certification.csv'),
        (df_courses, 'scraped_courses.csv'),
    ]
    for frame, file_name in outputs:
        frame.to_csv(os.path.join(output_dir, file_name), mode='a', index=False, header=False)

In [7]:
##################################
# Navigating the next page
##################################

def navigate_next_page(page_number,page_URL):
    """Reload ``page_URL`` and click the pagination "next" button.

    The click is retried, scrolling 600 px further down after every failed
    attempt, until it succeeds. Only ``Exception`` subclasses are retried so
    that ``KeyboardInterrupt`` / ``SystemExit`` can still stop the loop (a bare
    ``except:`` would swallow them).

    Parameters
    ----------
    page_number : int
        Not used by this function; kept so existing callers keep working.
    page_URL : str
        URL of the results page to start from.

    Returns
    -------
    str
        ``driver.current_url`` after the button has been clicked.

    Relies on the module-level ``driver``, ``random_time_list_1`` and
    ``random_time_list_2``.
    """
    next_button_css = "button.artdeco-pagination__button.artdeco-pagination__button--next"

    driver.get(page_URL)
    time.sleep(random.choice(random_time_list_1))
    wait = WebDriverWait(driver, 4)
    while True:
        try:
            next_btn = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, next_button_css)))
            # Reading this property scrolls the button into view as a side effect.
            next_btn.location_once_scrolled_into_view
            time.sleep(random.choice(random_time_list_2))
            next_btn.click()
            break
        except Exception:
            # Button not found or not clickable yet: scroll further and retry.
            driver.execute_script("window.scrollBy(0, arguments[0]);", 600)
    time.sleep(random.choice(random_time_list_1))

    print('Going to next page')

    return driver.current_url
   

In [8]:
##################################
# Core program
##################################
from selenium.webdriver.firefox.options import Options

def core_prog():
    """Log in, run the search, then scrape the result pages one after another.

    After each page is scraped the "next" button is followed; the page numbers
    passed to ``navigate_next_page`` and printed run from 2 to 10.
    """
    # Login to LinkedIn
    login()
    time.sleep(5)

    # Search for IT professionals
    page_URL = search(' Software engineer AND :location=europe')

    for page_number in range(2, 11):
        time.sleep(random.choice(random_time_list_1))
        scrape_profiles()
        time.sleep(random.choice(random_time_list_1))

        # Move on to the following results page and continue from its URL.
        following_page_URL = navigate_next_page(page_number, page_URL)
        print('Navigating to page :' + str(page_number))
        page_URL = following_page_URL
        time.sleep(random.choice(random_time_list_1))
        

In [9]:
##################################
# Main program
##################################
# Pattern that captures the visible text following a "->" in a tag's markup.
r_expression = r"(?<=-\>)[0-9a-zA-Z ,éÉèàçÀâäêëîïôöùûüÿ':·-]+"
# Pause lengths (seconds) picked at random between page-level actions.
random_time_list_1 = [ 4, 5, 6, 7, 8, 9 ]
# Shorter pause lengths (seconds) picked at random between scroll steps and clicks.
random_time_list_2 = [ 0.3, 0.6, 0.9, 1.2, 1.5,  1.8 ]

# Set up the web driver

firefox_binary_path = "/usr/bin/firefox"

# Use the Firefox options class explicitly: the bare name `Options` is imported
# twice in this notebook (Chrome in the first cell, Firefox later on), so which
# class it refers to depends on the order in which the cells were executed.
options = webdriver.FirefoxOptions()
options.binary_location = firefox_binary_path

# Set the service with the executable path
geckodriver_path = "/usr/local/bin/geckodriver"  # Replace with the actual path of geckodriver
service = webdriver.firefox.service.Service(geckodriver_path)

# Create the Firefox WebDriver
driver = webdriver.Firefox(options=options, service=service)

try:
    driver.maximize_window()
    core_prog()
finally:
    # Always close the browser, even when the run fails or is interrupted.
    driver.quit()

KeyboardInterrupt: 