In [1]:
import random
from time import sleep
from timeit import default_timer as timer
from datetime import datetime

from selenium_stealth import stealth
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from utils.generate_name_variations import generate_name_variations
from utils.methods import *
import pandas as pd
import numpy as np
import sys
import json

In [2]:
def check_studied_at_universities(page_source, universities_to_check):
    """Report whether a profile page lists any of the given universities.

    The first ``<script type="application/ld+json">`` tag of the page is
    parsed as JSON and the ``alumniOf`` entry of the first ``@graph`` node is
    inspected. Only entries whose ``@type`` is ``'EducationalOrganization'``
    are considered.

    The intermediate values are printed, as a debugging aid.

    Args:
        page_source: HTML source of the profile page.
        universities_to_check: Iterable of university names (or name
            fragments). A match is a case-sensitive substring test against
            each school name found on the page.

    Returns:
        True if any name in ``universities_to_check`` is contained in the
        name of a listed school. False otherwise, including when the JSON-LD
        script is missing, empty, not valid JSON, or does not have the
        expected ``@graph[0].alumniOf`` structure.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find the script tag containing the JSON-LD data
    script = soup.find('script', type='application/ld+json')
    if not script:
        return False

    # `.string` is None when the tag is empty or has more than one child.
    json_data = script.string
    print(f"json data: {json_data}")
    if not json_data:
        return False

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError:
        return False
    print(f"data: {data}")

    try:
        alumni_of = data['@graph'][0]['alumniOf']
    except (KeyError, IndexError, TypeError):
        # Missing key, empty '@graph' list, or a node/root that is not the
        # expected container type.
        return False
    print(f"alumni of: {alumni_of}")

    # JSON-LD allows a single object where a list is expected; normalise so
    # the loop below never iterates over a dict's keys or a scalar.
    if isinstance(alumni_of, dict):
        alumni_of = [alumni_of]
    elif not isinstance(alumni_of, (list, tuple)):
        alumni_of = []

    # Skip malformed entries individually instead of discarding all of them.
    universities_studied = [
        org['name']
        for org in alumni_of
        if isinstance(org, dict)
        and org.get('@type') == 'EducationalOrganization'
        and isinstance(org.get('name'), str)
    ]
    print(f"universities: {universities_studied}")

    return any(
        university_to_check in university_studied
        for university_to_check in universities_to_check
        for university_studied in universities_studied
    )

In [3]:
def check_page_problems(page_source):
    """Scan a fetched page for known blocking conditions.

    Three checks are run in order, each printing a notice when it fires:
    the substring ``"authwall"``, the substring ``"captcha"``, and a page
    that starts with a specific inline-JavaScript preamble.

    Args:
        page_source: The page's HTML source as a string.

    Returns:
        A ``(problems, success)`` tuple. ``problems`` is the concatenation of
        the tags of every check that fired (``""`` when none did) and
        ``success`` is ``1`` when no check fired, ``0`` otherwise.
    """
    obfuscated_prefix = "<html><head>\n    <script type=\"text/javascript\">\n"

    # (predicate, notice, tag) — predicates are lazy so each test runs only
    # when the loop reaches it, in the listed order.
    detectors = (
        (lambda src: "authwall" in src,
         "→ You hit the authentication wall!", "authwall_"),
        (lambda src: "captcha" in src,
         "→ You hit a captcha page!", "captcha_"),
        (lambda src: src.startswith(obfuscated_prefix),
         "→ You hit javascript obfuscated code!", "obfuscatedJS_"),
    )

    found_tags = []
    for is_hit, notice, tag in detectors:
        if is_hit(page_source):
            print(notice)
            found_tags.append(tag)

    return "".join(found_tags), (0 if found_tags else 1)

In [4]:
def initialize_webdriver():
    """Start a Chrome WebDriver session and apply selenium-stealth to it.

    Chrome is launched with the ``start-maximized`` and
    ``--blink-settings=imagesEnabled=false`` arguments, with the
    ``enable-automation`` switch excluded and ``useAutomationExtension``
    turned off. The driver gets an implicit wait of 60 before ``stealth`` is
    applied with fixed language/vendor/platform/WebGL values.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    for cli_argument in ("start-maximized", '--blink-settings=imagesEnabled=false'):
        chrome_options.add_argument(cli_argument)
    # Headless mode is intentionally not enabled ("--headless" is left out).
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    browser = webdriver.Chrome(options=chrome_options)
    browser.implicitly_wait(60)

    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
    }
    stealth(browser, **stealth_settings)
    return browser

In [13]:
# Look up one person on Google, validate the first LinkedIn result against
# the person's real name, and fetch that profile's page source.
#
# `full_name` is the reference name the result title is validated against;
# `name_variation` is the spelling that is typed into the search box.
full_name = 'Gabriela Bertoni dos Santos'
name_variation = 'Gabriela Bertoni dos Santos'

# Tokens that separate the person's name from the rest of a result title.
TITLE_SEPARATORS = {'-', '–', '—', '|'}

# Defaults, so that cells reading these names still work when no profile is
# opened (no result, or the name check fails).
page_source = None
problems = ""
success = 0

driver = initialize_webdriver()
try:
    # NOTE(review): a two-argument `sleep` cannot be `time.sleep`; presumably it
    # is the one brought in by `from utils.methods import *` — confirm.
    sleep(1, '→ sleeping 1 second...')

    print("→ Requesting 'www.google.com.br'.")
    driver.get('https://www.google.com.br')
    sleep(random.uniform(1, 2), '→ sleeping between 1 and 2 seconds...')

    search_box = driver.find_element(By.NAME, 'q')
    search_query = f"{name_variation} ufabc linkedin"
    print("→ Searching on Google.")
    search_box.send_keys(search_query)
    search_box.send_keys(Keys.RETURN)
    sleep(random.uniform(2, 3), '→ sleeping between 2 and 3 seconds...')

    # Read each href once: every get_attribute call is a WebDriver round trip.
    links = driver.find_elements(By.TAG_NAME, 'a')
    linkedin_links = []
    linkedin_hrefs = []
    for link in links:
        href = link.get_attribute('href')
        if href and 'linkedin.com/in/' in href:
            linkedin_links.append(link)
            linkedin_hrefs.append(href)

    if linkedin_links:
        # We only consider the first LinkedIn profile; drop the query string.
        first_link = linkedin_links[0]
        linkedin_url = linkedin_hrefs[0].split("?")[0]

        # Everything before the first separator token is taken as the name.
        # A title without a separator is used whole instead of raising.
        linkedin_link_title = first_link.text.split()
        hyphen_index = next(
            (position for position, token in enumerate(linkedin_link_title)
             if token in TITLE_SEPARATORS),
            len(linkedin_link_title),
        )

        names_in_linkedin_link = linkedin_link_title[:hyphen_index]
        names_in_linkedin_link = [normalize_string(name) for name in names_in_linkedin_link]
        print(names_in_linkedin_link)

        names_in_full_name = full_name.split()
        names_in_full_name = [normalize_string(name) for name in names_in_full_name]
        print(names_in_full_name)

        # An empty name list is a subset of anything, so require at least one
        # name before accepting the profile.
        is_subset = bool(names_in_linkedin_link) and (
            set(names_in_linkedin_link) <= set(names_in_full_name)
        )
        print(is_subset)

        # Not used in this cell; kept in case other cells read it.
        first_name_name_variation = normalize_string(name_variation.split()[0].lower())

        if not is_subset:
            print("→ Names of Linkedin profile are not a subset of real full name, skipping...")
        else:
            print(f"→ Requesting '{linkedin_url}'.")
            first_link.click()
            sleep(random.uniform(5, 7), '→ sleeping between 5 and 7 seconds...')

            page_source = driver.page_source
            problems, success = check_page_problems(page_source)
            # TODO: if it's the xth unsuccessful reply in a row, do something
    else:
        print("→ No LinkedIn search results to access.")
finally:
    # Always end the session — on every branch and on errors — so no browser
    # or chromedriver process is left running.
    driver.quit()


→ sleeping 1 second...
→ Requesting 'www.google.com.br'.
→ sleeping between 1 and 2 seconds...
→ Searching on Google.
→ sleeping between 2 and 3 seconds...
['gabriela', 'bertoni', 'dos', 'santos']
['gabriela', 'bertoni', 'dos', 'santos']
True
→ Requesting 'https://de.linkedin.com/in/gabrielabertoni/pt'.
→ sleeping between 5 and 7 seconds...


In [14]:
# Step 3: parse the education section of the fetched profile page and decide
# whether the person studied at UFABC.

# Case-insensitive fragments that identify UFABC in a school name.
UFABC_NAME_MARKERS = ('ufabc', 'universidade federal do abc')

# Defined up front so the flag exists even when the profile was not fetched.
studied_at_ufabc = False

if success:
    soup = BeautifulSoup(page_source, 'html.parser')

    # Each education entry is an <li class="education__list-item"> element.
    education_items = soup.find_all('li', class_='education__list-item')

    for item in education_items:
        # TODO: extract activities and groups that the person participated in

        university_element = item.find('h3', class_='profile-section-card__title')
        university = university_element.text.strip() if university_element else None

        degree_info_elements = item.find_all('span', class_='education__item--degree-info')
        degree_info = [degree_info_element.text.strip() for degree_info_element in degree_info_elements]

        duration_element = item.find('p', class_='education__item--duration')
        duration = duration_element.text.strip() if duration_element else None

        details_element = item.find('div', class_='show-more-less-text')
        details = details_element.text.strip() if details_element else None

        # Print the extracted information
        print('University:', university)
        print('Degree Info:', degree_info)
        print('Duration:', duration)
        print('Details:', details)
        print('---')

        # Record a UFABC match; once set, the flag stays True for the
        # remaining entries.
        if university and any(marker in university.casefold() for marker in UFABC_NAME_MARKERS):
            studied_at_ufabc = True

University: FH Campus Wien | University of Applied Sciences
Degree Info: ['Master of Science in Engineering (MSc)', 'Packaging Technology and Sustainability']
Duration: 2022 - o momento
Details: None
---
University: Universidade Federal do ABC
Degree Info: ['Bacharelado em Engenharia', 'Engenharia de Materiais', '3,92 de 4']
Duration: 2013 - 2019
Details: Caracterização e avaliação do desempenho de materiais, propriedades dos materiais, processamento de materiais, nanociência e nanotecnologia dos materiais.
---
University: Offenburg University of Applied Sciences
Degree Info: ['Bacharelado em Engenharia', 'Engenharia Mecânica e Engenharia de Materiais']
Duration: 2018 - 2018
Details: Mobilidade acadêmica para a Hochschule Offenburg, como bolsista da Fundação Baden-Württemberg.
---
University: TU Dortmund University
Degree Info: ['International Summer Program']
Duration: 2017 - 2017
Details: Cursos: German Culture and Society, German Language, International Business and Industrial Marke