In [5]:
import random
from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from selenium_stealth import stealth
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from utils.generate_name_variations import generate_name_variations
from utils.methods import *
import pandas as pd
import numpy as np
import sys
import json

In [6]:
def check_studied_at_universities(page_source, universities_to_check):
    """Tell whether the profile's JSON-LD lists any of the given universities.

    The first ``<script type="application/ld+json">`` tag of the page is
    parsed and ``data['@graph'][0]['alumniOf']`` is read. Entries whose
    ``@type`` is ``'EducationalOrganization'`` are compared against
    ``universities_to_check`` with a case-sensitive substring test.

    Args:
        page_source: HTML of the profile page, as a string.
        universities_to_check: iterable of name fragments to look for.

    Returns:
        True if any fragment occurs in the name of any listed educational
        organization; False otherwise, including when the JSON-LD block is
        missing, empty, malformed or shaped differently than expected.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find the script tag containing the JSON-LD data
    script = soup.find('script', type='application/ld+json')

    # `script.string` is None when the tag is empty or has several children.
    if script is None or not script.string:
        return False

    try:
        data = json.loads(script.string)
        alumni_of = data['@graph'][0]['alumniOf']
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        # Malformed JSON or an unexpected layout: nothing to match against.
        return False

    # Accept a single organization object as well as a list of them.
    if isinstance(alumni_of, dict):
        alumni_of = [alumni_of]
    if not isinstance(alumni_of, list):
        return False

    # Skip incomplete entries instead of letting one of them abort the check.
    universities_studied = [
        org['name']
        for org in alumni_of
        if isinstance(org, dict)
        and org.get('@type') == 'EducationalOrganization'
        and isinstance(org.get('name'), str)
    ]

    return any(
        university_to_check in university_studied
        for university_to_check in universities_to_check
        for university_studied in universities_studied
    )

In [7]:
def check_page_problems(page_source):
    """Detect known blocking pages in the fetched HTML.

    Three independent checks are run, in this order: the substring
    "authwall", the substring "captcha", and a fixed leading HTML/script
    prefix. Each hit prints a message and contributes a tag.

    Args:
        page_source: HTML of the fetched page, as a string.

    Returns:
        A ``(problems, success)`` tuple: ``problems`` is the concatenation of
        the tags of every check that fired ("" when none did), and ``success``
        is 0 if at least one check fired, 1 otherwise.
    """
    obfuscated_prefix = "<html><head>\n    <script type=\"text/javascript\">\n"

    # (did the check fire, message to print, tag appended to the result)
    checks = (
        ("authwall" in page_source,
         "→ You hit the authentication wall!",
         "authwall_"),
        ("captcha" in page_source,
         "→ You hit a captcha page!",
         "captcha_"),
        (page_source.startswith(obfuscated_prefix),
         "→ You hit javascript obfuscated code!",
         "obfuscatedJS_"),
    )

    tags = []
    for fired, message, tag in checks:
        if fired:
            print(message)
            tags.append(tag)

    return "".join(tags), (0 if tags else 1)

In [15]:
def initialize_webdriver():
    """Build and return a Chrome WebDriver with selenium-stealth applied.

    The browser starts maximized with image loading disabled, the
    "enable-automation" switch excluded and the automation extension turned
    off. An implicit wait of 60 is set on the driver before the stealth
    settings are applied.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()

    for argument in ("start-maximized", '--blink-settings=imagesEnabled=false'):
        chrome_options.add_argument(argument)
    # options.add_argument("--headless")

    experimental_options = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
    )
    for option_name, option_value in experimental_options:
        chrome_options.add_experimental_option(option_name, option_value)

    browser = webdriver.Chrome(options=chrome_options)
    browser.implicitly_wait(60)

    # Fingerprint values handed to selenium-stealth.
    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
    }
    stealth(browser, **stealth_settings)

    return browser

In [50]:
# Search Google for the person's LinkedIn profile and, if the name shown in
# the first LinkedIn result is compatible with the real full name, open it
# and check the returned page for known blocking pages.
full_name = 'Gabriela Bertoni dos Santos' # André José de Queiroz Padilha, Gabriela Bertoni dos Santos, João Caprera, Caio Fiaschi da Silva
name_variation = 'Gabriela Bertoni dos Santos'

driver = initialize_webdriver()
try:
    # NOTE(review): `sleep` is called with a message argument, so it is
    # presumably the helper pulled in by `from utils.methods import *` rather
    # than `time.sleep` — confirm.
    sleep(1, '→ sleeping 1 second...')

    print("→ Requesting 'www.google.com.br'.")
    driver.get('https://www.google.com.br')
    sleep(random.uniform(1, 2), '→ sleeping between 1 and 2 seconds...')

    search_box = driver.find_element(By.NAME, 'q')
    search_query = f"{name_variation} ufabc linkedin"
    print("→ Searching on Google.")
    search_box.send_keys(search_query)
    search_box.send_keys(Keys.RETURN)
    sleep(random.uniform(2, 3), '→ sleeping between 2 and 3 seconds...')
    links = driver.find_elements(By.TAG_NAME, 'a')

    # Keep only anchors that point at a LinkedIn profile; the href is read
    # once per element because every read is a round trip to the browser.
    linkedin_links = []
    for link in links:
        href = link.get_attribute('href')
        if href and 'linkedin.com/in/' in href:
            linkedin_links.append(link)

    if linkedin_links:
        linkedin_url = linkedin_links[0].get_attribute('href').split("?")[0] # we only consider the first linkedin profile

        # The words before the first standalone '-' of the result title are
        # taken as the profile name. When the title has no such hyphen, the
        # whole title is used instead of raising ValueError.
        linkedin_link_title = linkedin_links[0].text.split()
        try:
            hyphen_index = linkedin_link_title.index('-')
        except ValueError:
            hyphen_index = len(linkedin_link_title)

        names_in_linkedin_link = linkedin_link_title[:hyphen_index]
        names_in_linkedin_link = [normalize_string(name) for name in names_in_linkedin_link]
        print(names_in_linkedin_link)

        names_in_full_name = full_name.split()
        names_in_full_name = [normalize_string(name) for name in names_in_full_name]
        print(names_in_full_name)

        # An empty token list is trivially a subset of anything, so it must
        # not count as a match.
        is_subset = bool(names_in_linkedin_link) and set(names_in_linkedin_link) <= set(names_in_full_name)
        print(is_subset)

        first_name_name_variation = normalize_string(name_variation.split()[0].lower())

        if not is_subset:
            print("→ Names of Linkedin profile are not a subset of real full name, skipping...")
        else:
            print(f"→ Requesting '{linkedin_url}'.")
            linkedin_links[0].click()
            sleep(random.uniform(5, 7), '→ sleeping between 5 and 7 seconds...')

            page_source = driver.page_source
            problems, success = check_page_problems(page_source)
            # TODO
            # If it's the xth unsucessful reply in a row, do something
    else:
        print("→ No LinkedIn search results to access.")
finally:
    # Always release the browser: previously it was closed only after a
    # successful profile request and leaked on every other path.
    driver.quit()


→ sleeping 1 second...
→ Requesting 'www.google.com.br'.
→ sleeping between 1 and 2 seconds...
→ Searching on Google.
→ sleeping between 2 and 3 seconds...
['gabriela', 'bertoni', 'dos', 'santos']
['gabriela', 'bertoni', 'dos', 'santos']
True
→ Requesting 'https://de.linkedin.com/in/gabrielabertoni/pt'.
→ sleeping between 5 and 7 seconds...
→ You hit the authentication wall!


In [52]:
# Read the saved profile page into memory. The context manager closes the
# file handle, and the explicit encoding keeps the non-ASCII text identical
# across platforms.
# NOTE(review): assumes the page was saved as UTF-8 — confirm.
with open("../Gabriela Bertoni dos Santos - Packaging Sustainability Manager - Henkel _ LinkedIn.html", "r", encoding="utf-8") as profile_file:
    profile_html = profile_file.read()

In [53]:
def extract_professional_experience(page_source):
    """Extract the experience entries of a profile page into a DataFrame.

    Every ``li.profile-section-card`` inside ``ul.experience__list`` becomes
    one row. Missing pieces of an entry are stored as ``None`` instead of
    raising or inheriting the value of the previous entry.

    Args:
        page_source: HTML of the profile page, as a string.

    Returns:
        A DataFrame with the columns ``role``, ``location``, ``start_date``,
        ``end_date`` and ``description`` (dates are kept as the raw text of
        the page), or ``None`` when the page has no experience list or the
        list has no entries.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find all experience items
    experience_list = soup.find('ul', class_='experience__list')
    if experience_list is None:
        return None

    experience_items = experience_list.find_all('li', class_='profile-section-card')
    if not experience_items:
        return None

    columns = ['role', 'location', 'start_date', 'end_date', 'description']
    records = []

    for item in experience_items:
        # Extract the role name (if available)
        role_element = item.find('h3', class_='profile-section-card__title')
        role = role_element.text.strip() if role_element is not None else None

        # Extract the location (if available); two class names are tried.
        location_element = item.find('p', class_='experience-item__location')
        if location_element is None:
            location_element = item.find('p', class_='experience-group-position__location')
        location = location_element.text.strip() if location_element is not None else None

        # TODO
        # the company (h4.profile-section-card__subtitle) should be added in
        # another DB called "Company"

        # Reset the dates for every entry so that an entry without a date
        # range does not reuse the dates of the previous one.
        start_date = None
        end_date = None

        # TODO:
        # Convert date string to DATE
        date_range_element = item.find('span', class_='date-range')
        if date_range_element is not None:
            time_elements = date_range_element.find_all('time')
            if time_elements:
                start_date = time_elements[0].text.strip()
                # Anything other than exactly two <time> tags is reported
                # as a position without an end date.
                if len(time_elements) == 2:
                    end_date = time_elements[1].text.strip()
                else:
                    end_date = "Ongoing"

        # Extract the description (if available); two class names are tried.
        description = None
        description_element = item.find('div', class_='experience-item__description')
        if description_element is None:
            description_element = item.find('div', class_='experience-group-position__description')

        if description_element is not None:
            description_big = description_element.find('p', class_='show-more-less-text__text--more')
            if description_big is not None:
                # Drop the "show less" label that is part of the expanded text.
                description = description_big.get_text(strip=True).replace('Exibir menos', '')
            else:
                description_small = description_element.find('p', class_='show-more-less-text__text--less')
                if description_small is not None:
                    description = description_small.get_text(strip=True)

        records.append({
            'role': role,
            'location': location,
            'start_date': start_date,
            'end_date': end_date,
            'description': description,
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row copies it on every iteration.
    return pd.DataFrame(records, columns=columns)


# Parse the profile HTML loaded in the previous cell; the bare `df` as the
# last expression of the cell renders the resulting table.
df = extract_professional_experience(profile_html)
df

Unnamed: 0,role,location,start_date,end_date,description
0,Packaging Sustainability Manager,"Düsseldorf, North Rhine-Westphalia, Germany",set. de 2021,Ongoing,- Responsible for promoting packaging innovati...
1,Jr. Application Engineer - Adhesives for Packa...,"Jundiaí, São Paulo, Brazil",out. de 2019,set. de 2021,- Work on Adhesives & Coating for Flexible Pac...
2,Estágio em Suporte Técnico - Adesivos Industri...,"São Paulo e Região, Brasil",fev. de 2019,out. de 2019,•\tConducting analyzes to evaluate the perform...
3,Estágio em P&D Tecnologia de Embalagens,"Düsseldorf, Germany",ago. de 2018,jan. de 2019,•\tResponsible for a research project of a new...
4,Iniciação Científcia,"São Paulo e Região, Brasil",jan. de 2017,mar. de 2018,Responsável por desenvolver um projeto de Inic...
5,Presidente do Capítulo Estudantil IEEE CPMT UFABC,Univerisidade Federal do ABC,jan. de 2017,dez. de 2017,Presidente do capítulo de Endenharia de Materi...
6,Iniciação Científcia,"São Paulo e Região, Brasil",ago. de 2014,jul. de 2015,Responsável por desenvolver um projeto de Inic...


In [57]:
def extract_education_info(page_source):
    """Extract the education entries of a profile page into a DataFrame.

    Every ``li.education__list-item`` becomes one row. Missing pieces of an
    entry are stored as ``None`` instead of raising or inheriting the value
    of the previous entry.

    Args:
        page_source: HTML of the profile page, as a string.

    Returns:
        A DataFrame with the columns ``degree``, ``field_of_study``,
        ``start_date``, ``end_date``, ``description``, ``grade`` and
        ``activities_societies`` (dates and grade are kept as the raw text of
        the page), or ``None`` when the page has no education entries.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    education_items = soup.find_all('li', class_='education__list-item')
    if not education_items:
        return None

    columns = [
        'degree',
        'field_of_study',
        'start_date',
        'end_date',
        'description',
        'grade',
        'activities_societies',
    ]
    records = []

    for item in education_items:
        # TODO
        # the school (h3.profile-section-card__title) should be added in
        # another DB called "school"

        # The degree-info spans are read positionally:
        # degree, field of study, grade.
        degree_info = [
            degree_info_element.text.strip()
            for degree_info_element in item.find_all('span', class_='education__item--degree-info')
        ]
        degree = degree_info[0] if len(degree_info) >= 1 else None
        field_of_study = degree_info[1] if len(degree_info) >= 2 else None
        # TODO:
        # I need to extract the grade and the maximum grade (maybe not here, but definitely in the analysis)
        grade = degree_info[2] if len(degree_info) >= 3 else None

        # Reset the dates for every entry so that an entry without a
        # duration does not reuse the dates of the previous one.
        start_date = None
        end_date = None

        # TODO:
        # Convert date string to DATE
        duration_element = item.find('p', class_='education__item--duration')
        if duration_element is not None:
            time_elements = duration_element.find_all('time')
            if time_elements:
                start_date = time_elements[0].text.strip()
                # Anything other than exactly two <time> tags is reported
                # as an entry without an end date.
                if len(time_elements) == 2:
                    end_date = time_elements[1].text.strip()
                else:
                    end_date = "Ongoing"

        description_element = item.find('div', class_='show-more-less-text')
        description = description_element.text.strip() if description_element is not None else None

        activities_societies_element = item.find('p', class_='education__item--activities-and-societies')
        activities_societies = (
            activities_societies_element.text.strip()
            if activities_societies_element is not None
            else None
        )

        records.append({
            'degree': degree,
            'field_of_study': field_of_study,
            'start_date': start_date,
            'end_date': end_date,
            'description': description,
            'grade': grade,
            'activities_societies': activities_societies,
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row copies it on every iteration.
    return pd.DataFrame(records, columns=columns)

# Parse the education section of the same profile HTML and render it.
# Note: this rebinds `df`, which held the experience table until now.
df = extract_education_info(profile_html)
df

Unnamed: 0,degree,field_of_study,start_date,end_date,description,grade,activities_societies
0,Master of Science in Engineering (MSc),Packaging Technology and Sustainability,2022,Ongoing,,,
1,Bacharelado em Engenharia,Engenharia de Materiais,2013,2019,Caracterização e avaliação do desempenho de ma...,"3,92 de 4",Atividades e grupos:Entidade estudantil IEEE U...
2,Bacharelado em Engenharia,Engenharia Mecânica e Engenharia de Materiais,2018,2018,Mobilidade acadêmica para a Hochschule Offenbu...,,Atividades e grupos:International Student and ...
3,International Summer Program,,2017,2017,"Cursos: German Culture and Society, German Lan...",,
4,Bacharelado em Ciência e Tecnologia,Ciência e Tecnologia,2013,2017,"Ciência da Computação, Ciências Naturais, Ciên...","3,90 out of 4",Atividades e grupos:Entidade Estudantil IEEE U...
