In [1]:
import random
from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from selenium_stealth import stealth
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from utils.generate_name_variations import generate_name_variations
from utils.methods import *
import pandas as pd
import numpy as np
import sys
import json

In [2]:
def check_studied_at_universities(page_source, universities_to_check):
    """Tell whether the page's JSON-LD lists one of the given universities.

    The first ``<script type="application/ld+json">`` tag of ``page_source``
    is parsed and the ``alumniOf`` entry of the first ``@graph`` node is
    inspected. Entries whose ``@type`` is ``EducationalOrganization`` are
    compared against ``universities_to_check`` with a case-sensitive
    substring test.

    Args:
        page_source: HTML of the page, as a string.
        universities_to_check: iterable of name fragments to look for.

    Returns:
        True if any fragment occurs inside any listed university name.
        False otherwise, including when the JSON-LD block is missing,
        cannot be parsed, or does not have the expected structure.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find the script tag containing the JSON-LD data
    script = soup.find('script', type='application/ld+json')
    if not script:
        return False

    # Extract the JSON content
    json_data = script.string
    print(f"json data: {json_data}")

    # Parse the JSON content. `script.string` is None when the tag is empty
    # or has several children (TypeError), and the payload itself may be
    # malformed (JSONDecodeError); both simply mean "no usable data".
    try:
        data = json.loads(json_data)
    except (TypeError, json.JSONDecodeError):
        return False
    print(f"data: {data}")

    # The expected path may be absent (KeyError), the graph may be empty
    # (IndexError), or the document may have a different shape (TypeError).
    try:
        alumni_of = data['@graph'][0]['alumniOf']
    except (KeyError, IndexError, TypeError):
        return False
    print(f"alumni of: {alumni_of}")

    # JSON-LD allows a single object where a list is expected.
    if isinstance(alumni_of, dict):
        alumni_of = [alumni_of]
    elif not isinstance(alumni_of, list):
        alumni_of = []

    # Skip malformed entries individually instead of discarding all of them.
    universities_studied = [
        org['name']
        for org in alumni_of
        if isinstance(org, dict)
        and org.get('@type') == 'EducationalOrganization'
        and isinstance(org.get('name'), str)
    ]
    print(f"universities: {universities_studied}")

    return any(
        university_to_check in university_studied
        for university_to_check in universities_to_check
        for university_studied in universities_studied
    )

In [3]:
def check_page_problems(page_source):
    """Scan a fetched page for known blocking conditions.

    Three conditions are checked, in this order: the text "authwall"
    anywhere in the page, the text "captcha" anywhere in the page, and the
    page starting with a specific inline-JavaScript preamble. A message is
    printed for every condition that matches.

    Args:
        page_source: page HTML as a string.

    Returns:
        A ``(problems, success)`` tuple. ``problems`` is the concatenation
        of the tags of all matched conditions ("authwall_", "captcha_",
        "obfuscatedJS_"), or "" if none matched. ``success`` is 1 when
        nothing matched and 0 otherwise.
    """
    obfuscated_js_prefix = "<html><head>\n    <script type=\"text/javascript\">\n"

    # (predicate, message to print, tag appended to the result)
    checks = (
        (lambda src: "authwall" in src,
         "→ You hit the authentication wall!",
         "authwall_"),
        (lambda src: "captcha" in src,
         "→ You hit a captcha page!",
         "captcha_"),
        (lambda src: src.startswith(obfuscated_js_prefix),
         "→ You hit javascript obfuscated code!",
         "obfuscatedJS_"),
    )

    matched_tags = []
    for is_hit, message, tag in checks:
        if is_hit(page_source):
            print(message)
            matched_tags.append(tag)

    return "".join(matched_tags), (0 if matched_tags else 1)

In [4]:
def initialize_webdriver():
    """Start a Chrome WebDriver session configured for scraping.

    The browser is started maximized with the image-loading blink setting
    turned off, the "enable-automation" switch excluded and the automation
    extension disabled. An implicit wait of 60 is set on the driver, and
    ``selenium_stealth.stealth`` is applied with a fixed fingerprint.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()

    # Command-line switches, added in this order.
    for switch in ("start-maximized", '--blink-settings=imagesEnabled=false'):
        chrome_options.add_argument(switch)
    # Headless mode is intentionally left off: add_argument("--headless")

    experimental_options = {
        "excludeSwitches": ["enable-automation"],
        'useAutomationExtension': False,
    }
    for option_name, option_value in experimental_options.items():
        chrome_options.add_experimental_option(option_name, option_value)

    browser = webdriver.Chrome(options=chrome_options)
    browser.implicitly_wait(60)

    # Fingerprint values handed to selenium-stealth.
    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
    }
    stealth(browser, **stealth_settings)

    return browser

In [76]:
# Person being looked up, and the spelling of the name used in the Google query.
full_name = 'João Caprera' # André José de Queiroz Padilha, Gabriela Bertoni dos Santos, João Caprera, Caio Fiaschi da Silva
name_variation = 'João Caprera'

# Defaults for the values that later cells read, so they are defined even
# when no LinkedIn page ends up being fetched (no search hit, or the name
# check rejects the hit).
page_source = None
problems = ""
success = 0

driver = initialize_webdriver()
try:
    # NOTE(review): `sleep` is called with a message argument, so it is
    # presumably a helper from `utils.methods` shadowing `time.sleep` — confirm.
    sleep(1, '→ sleeping 1 second...')

    print("→ Requesting 'www.google.com.br'.")
    driver.get('https://www.google.com.br')
    sleep(random.uniform(1, 2), '→ sleeping between 1 and 2 seconds...')

    search_box = driver.find_element(By.NAME, 'q')
    search_query = f"{name_variation} ufabc linkedin"
    print("→ Searching on Google.")
    search_box.send_keys(search_query)
    search_box.send_keys(Keys.RETURN)
    sleep(random.uniform(2, 3), '→ sleeping between 2 and 3 seconds...')

    # Keep only anchors that point at a LinkedIn profile. The href is read
    # once per anchor because every read is a round trip to the browser.
    links = driver.find_elements(By.TAG_NAME, 'a')
    linkedin_links = []
    for link in links:
        href = link.get_attribute('href')
        if href and 'linkedin.com/in/' in href:
            linkedin_links.append(link)

    if linkedin_links:
        linkedin_url = linkedin_links[0].get_attribute('href').split("?")[0] # we only consider the first linkedin profile

        # The tokens before the first '-' of the result title are taken as
        # the person's name. A title without a '-' token is used whole
        # instead of raising ValueError.
        linkedin_link_title = linkedin_links[0].text.split()
        if '-' in linkedin_link_title:
            hyphen_index = linkedin_link_title.index('-')
        else:
            hyphen_index = len(linkedin_link_title)

        names_in_linkedin_link = linkedin_link_title[:hyphen_index]
        names_in_linkedin_link = [normalize_string(name) for name in names_in_linkedin_link]
        print(names_in_linkedin_link)

        names_in_full_name = full_name.split()
        names_in_full_name = [normalize_string(name) for name in names_in_full_name]
        print(names_in_full_name)

        # An empty title must not pass: the empty set is a subset of anything.
        is_subset = bool(names_in_linkedin_link) and set(names_in_linkedin_link) <= set(names_in_full_name)
        print(is_subset)

        # Not used in the visible cells; kept in case a later cell reads it.
        first_name_name_variation = normalize_string(name_variation.split()[0].lower())

        if not is_subset:
            print("→ Names of Linkedin profile are not a subset of real full name, skipping...")
        else:
            print(f"→ Requesting '{linkedin_url}'.")
            linkedin_links[0].click()
            sleep(random.uniform(5, 7), '→ sleeping between 5 and 7 seconds...')

            page_source = driver.page_source
            problems, success = check_page_problems(page_source)
            # TODO
            # If it's the xth unsuccessful reply in a row, do something
    else:
        print("→ No LinkedIn search results to access.")
finally:
    # Always end the browser session, whichever path was taken above and
    # even if a step raised, so repeated runs do not pile up Chrome processes.
    driver.quit()


→ sleeping 1 second...
→ Requesting 'www.google.com.br'.
→ sleeping between 1 and 2 seconds...
→ Searching on Google.
→ sleeping between 2 and 3 seconds...
['joao', 'caprera']
['joao', 'caprera']
True
→ Requesting 'https://br.linkedin.com/in/joao-caprera-7718432b'.
→ sleeping between 5 and 7 seconds...


In [77]:
def extract_education_info(page_source):
    """Collect the education entries found in a profile page.

    Every ``<li class="education__list-item">`` contributes one element to
    each of the returned lists, so the lists are parallel and equally long.

    Args:
        page_source: page HTML as a string.

    Returns:
        A 5-tuple ``(universities, degrees, durations, details_list,
        activities_groups_list)``. ``degrees`` holds, per entry, a list of
        the stripped degree-info strings (possibly empty). The other four
        lists hold the stripped text of the matching element, or None when
        the entry has no such element.
    """
    parsed_page = BeautifulSoup(page_source, 'html.parser')

    def stripped_text(node):
        # Stripped text of an element, or None if the lookup found nothing.
        return node.text.strip() if node else None

    universities, degrees, durations = [], [], []
    details_list, activities_groups_list = [], []

    for entry in parsed_page.find_all('li', class_='education__list-item'):
        universities.append(
            stripped_text(entry.find('h3', class_='profile-section-card__title'))
        )

        degree_spans = entry.find_all('span', class_='education__item--degree-info')
        degrees.append([span.text.strip() for span in degree_spans])

        durations.append(
            stripped_text(entry.find('p', class_='education__item--duration'))
        )
        details_list.append(
            stripped_text(entry.find('div', class_='show-more-less-text'))
        )
        activities_groups_list.append(
            stripped_text(entry.find('p', class_='education__item--activities-and-societies'))
        )

    return universities, degrees, durations, details_list, activities_groups_list

def extract_professional_experience(page_source):
    """Print the professional experience entries found in a profile page.

    For every ``<li class="profile-section-card">`` inside the
    ``<ul class="experience__list">`` element, the role, description,
    location, company, start time, end time and duration are printed,
    followed by a blank line. A field that is not present in the entry is
    printed as None. Nothing is printed when the page has no experience
    list.

    Args:
        page_source: page HTML as a string.

    Returns:
        None. The extracted values are only printed.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find all experience items
    experience_list = soup.find('ul', class_='experience__list')
    if experience_list is None:
        # No experience section in this page: nothing to report.
        return
    experience_items = experience_list.find_all('li', class_='profile-section-card')

    # Iterate over each experience item
    for item in experience_items:
        # Extract the role name (if available)
        role_element = item.find('h3', class_='profile-section-card__title')
        role_name = role_element.text.strip() if role_element else None

        # Extract the description (if available); grouped positions use a
        # different class name than stand-alone ones.
        description = None
        description_element = item.find('div', class_='experience-item__description')
        if not description_element:
            description_element = item.find('div', class_='experience-group-position__description')

        if description_element:
            description_big = description_element.find('p', class_='show-more-less-text__text--more')
            if description_big:
                # The expanded text carries the "show less" button label; drop it.
                description = description_big.get_text(strip=True).replace('Exibir menos', '')
            else:
                description_small = description_element.find('p', class_='show-more-less-text__text--less')
                if description_small:
                    description = description_small.get_text(strip=True)

        # Extract the location (if available)
        location = None
        location_element = item.find('p', class_='experience-item__location')
        if not location_element:
            location_element = item.find('p', class_='experience-group-position__location')
        if location_element:
            location = location_element.text.strip()

        # Extract the company name (if available)
        company_element = item.find('h4', class_='profile-section-card__subtitle')
        company_name = company_element.text.strip() if company_element else None

        # Reset the date fields for every item so that an entry without a
        # date range neither fails on unbound names nor repeats the values
        # of the previous entry.
        start_time = None
        end_time = None
        duration = None

        # Find the date range element
        date_range_element = item.find('span', class_='date-range')

        # Extract the start time, end time, and duration
        if date_range_element:
            time_elements = date_range_element.find_all('time')
            duration_element = date_range_element.find('span', class_='before:middot')

            if time_elements:
                start_time = time_elements[0].text.strip()
                # Exactly two <time> tags give start and end; otherwise the
                # position is reported as ongoing.
                if len(time_elements) == 2:
                    end_time = time_elements[1].text.strip()
                else:
                    end_time = "Ongoing"

            if duration_element:
                duration = duration_element.text.strip()

        # Print the extracted information
        print("Role:", role_name)
        print("Description:", description)
        print("Location:", location)
        print("Company:", company_name)
        print("Start Time:", start_time)
        print("End Time:", end_time)
        print("Duration:", duration)
        print()


# Run both extractors only when the profile page was fetched without problems.
if success:
    extract_professional_experience(page_source)

    (
        universities,
        degrees,
        durations,
        details_list,
        activities_groups_list,
    ) = extract_education_info(page_source)

    # Print the extracted information
    print(f'University: {universities}')
    print(f'Degree Info: {degrees}')
    print(f'Duration: {durations}')
    print(f'Details: {details_list}')
    print(f'Activities and groups {activities_groups_list}')
    print('---')

Role: Analista de Risco Operacional Pleno
Description: WMS - Investment Service Operations
Location: None
Company: Itaú Unibanco
Start Time: dez. de 2022
End Time: Ongoing
Duration: 7 meses

Role: Analista de Risco Operacional Pleno
Description: Gestão de Continuidade de Negócios
Location: None
Company: Itaú Unibanco
Start Time: set. de 2021
End Time: dez. de 2022
Duration: 1 ano 4 meses

Role: Analista de Risco Operacional Jr.
Description: Riso Operacional:- Desenvolvimento e manutenção de ferramentas- Implementação de metodologia de análise de risco operacional
Location: São Paulo e Região, Brasil
Company: Itaú Unibanco
Start Time: dez. de 2017
End Time: set. de 2021
Duration: 3 anos 10 meses

Role: Estagiário
Description: Controle Interno de Risco Operacional- Análise do negócio "Veículos"- Monitoramento de ocorrências e riscos das operações
Location: None
Company: Itaú Unibanco
Start Time: mar. de 2017
End Time: dez. de 2017
Duration: 10 meses

Role: Lean Enterprise Summer Program
