### Connection to LinkedIn

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from time import sleep

from dotenv import load_dotenv
load_dotenv()
import time

In [3]:
driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/login')
time.sleep(2)
driver.title

'Вход в LinkedIn, войти | LinkedIn'

In [4]:
#********** LOG IN *************

email = driver.find_element(By.ID, 'username')
password = driver.find_element(By.ID, 'password')

email.send_keys(os.environ['LINKEDIN_EMAIL'])
password.send_keys(os.environ['LINKEDIN_PASSWORD'])

password.submit()

time.sleep(2)

In [7]:
#********** GO TO PROFILE *************
url = "https://www.linkedin.com/in/zubenkoey/"
driver.get(url)
time.sleep(2)
driver.title

'(1) Egor Zubenko | LinkedIn'

### Scrapping

#### Name and Headline

In [8]:
profile_data = {}

#********** GET NAME *************
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
name = soup.find('h1', {'class': 'REcVsMzJIjWGrvfwbuhgiUDStqsxZjeKqIQFB inline t-24 v-align-middle break-words'}).get_text()

#********** GET HEADLINE *************
headline = soup.find('div', {'class': 'text-body-medium break-words'})
headline = headline.get_text().strip()

profile_data['name'] = name
profile_data['url'] = url
profile_data['headline'] = headline

profile_data

{'name': 'Egor Zubenko',
 'url': 'https://www.linkedin.com/in/zubenkoey/',
 'headline': 'Business System Analyst @ Webvork | Python, SQL, OpenAI'}

#### About

In [9]:
#********** GET ABOUT INFO *************

xPath = '//*[@id="profile-content"]/div/div[2]/div/div/main/section[4]/div[3]/div/div/div/span[1]'

try:
    # Locate the element
    element = driver.find_element(By.XPATH, xPath)
    text = element.text
    profile_data['About'] = text
except Exception as e:
    print("Error finding the About section:", e)

profile_data

{'name': 'Egor Zubenko',
 'url': 'https://www.linkedin.com/in/zubenkoey/',
 'headline': 'Business System Analyst @ Webvork | Python, SQL, OpenAI',
 'About': "I'm an Analyst with a strong background in the upstream sector of Oil&Gas industry as a Petroleum Engineer, and Business Analysis for IT projects. Currently, I am involved in various IT projects, such as conducting a comprehensive CRM system analysis, proposing a new algorithm for assigning orders to call center operators, and describing a system feature that enables data-driven decision making. I would like to move on and develop in the field of business and data analysis, implement AI solutions and develop IT products."}

#### Experience

In [10]:
#********** GO TO EXPERIENCE PAGE *************

driver.get('https://www.linkedin.com/in/zubenkoey/details/experience/')
time.sleep(2)

page_source = driver.page_source

In [12]:
#********** EXTRACT EXPERIENCE *************

# Предположим, что page_source содержит HTML-код страницы
soup = BeautifulSoup(page_source, 'lxml')

# Находим все секции опыта
sections = soup.find_all('li', class_='pvs-list__paged-list-item')

# Создаем список для хранения данных
experience_data = []

# Проходим по всем секциям и извлекаем данные
for sec in sections:
    # Извлекаем название компании
    company_tag = sec.find('span', class_='t-14 t-normal')
    company_name = company_tag.get_text(strip=True).split('·')[0] if company_tag else 'N/A'
    
    # Извлекаем должность
    position_tag = sec.find('div', class_='display-flex align-items-center mr1 t-bold')
    position_name = position_tag.get_text(strip=True).split('·')[0] if position_tag else 'N/A'

    # Извлекаем должность
    working_period_tag = sec.find('span', class_='pvs-entity__caption-wrapper')
    working_period = working_period_tag.get_text(strip=True).split('·')[0] if working_period_tag else 'N/A'

    # Длительность
    duration_tag = sec.find('span', class_='pvs-entity__caption-wrapper')
    duration = duration_tag.get_text(strip=True).split('·')[1] if duration_tag else 'N/A'

    # Находим все элементы внутри ul/li
    list_items = sec.find_all('li', class_='pvs-list__item--with-top-padding')
    
    # Description
    description = list_items[0].get_text(strip=True) if len(list_items) > 0 else 'N/A'

    # Skills
    skills = list_items[1].get_text(strip=True) if len(list_items) > 1 else 'N/A'

    
    # Сохраняем в список
    experience_data.append({
        'company': company_name,
        'position': position_name,
        'workin_period': working_period,
        'duration': duration,
        'description': description,
        'skills': skills,
    })

In [13]:
profile_data['Experience'] = experience_data

#### Education

In [14]:
#********** GO TO EDUCATION PAGE *************

driver.get('https://www.linkedin.com/in/zubenkoey/details/education/')
time.sleep(2)

page_source = driver.page_source

In [16]:
#********** EXTRACT EDUCATION *************

# Предположим, что html_data содержит HTML-код секции Education
soup = BeautifulSoup(page_source, 'lxml')

# Находим секцию Education
education_section = soup.find('div', class_='pvs-list__container')

# Список для хранения данных
education_data = []

# Проверяем, найдена ли секция Education
if education_section:
    # Находим все элементы образования
    education_items = education_section.find_all('li', class_='pvs-list__paged-list-item')
    
    for item in education_items:
        # Извлечение названия университета
        university_tag = item.find('div', class_='display-flex align-items-center mr1 hoverable-link-text t-bold')
        university = 'N/A'
        if university_tag:
            span = university_tag.find('span', attrs={'aria-hidden': 'true'})
            if span:
                university = span.get_text(strip=True)
        
        # Извлечение степени и направления
        degree_tag = item.find('span', class_='t-14 t-normal')
        degree = 'N/A'
        if degree_tag:
            span = degree_tag.find('span', attrs={'aria-hidden': 'true'})
            if span:
                degree = span.get_text(strip=True)
        
        # Извлечение дат обучения
        dates_tag = item.find('span', class_='t-14 t-normal t-black--light')
        dates = 'N/A'
        if dates_tag:
            span = dates_tag.find('span', attrs={'aria-hidden': 'true'})
            if span:
                dates = span.get_text(strip=True)
        
        # Извлечение описания и проектов
        details = []
        # Находим все элементы li внутри текущего образования
        detail_items = item.find_all('li', class_='OxttavmlcaTTQyNiTiLxfszAmvmHtRgbWzWcY')
        for detail in detail_items:
            # Извлекаем только первый span с aria-hidden=true
            span = detail.find('span', attrs={'aria-hidden': 'true'})
            if span:
                text = span.get_text(separator=' ', strip=True)
                # Добавляем только если текст не пустой и не дублируется
                if text and text not in details:
                    details.append(text)
        
        # Извлечение сертификатов или медиа (если есть)
        certificates = []
        # Внутри item, ищем все 'a' с class='optional-action-target-wrapper', но исключаем 'edit' и 'view reorder'
        media_links = item.find_all('a', class_='optional-action-target-wrapper')
        for media in media_links:
            aria_label = media.get('aria-label', '').lower()
            if 'edit education' in aria_label or 'view education reorder screen' in aria_label:
                continue  # пропускаем ссылки на редактирование и сортировку
            # Извлекаем только span с aria-hidden=true внутри 'a'
            span = media.find('span', attrs={'aria-hidden': 'true'})
            if span:
                media_text = span.get_text(strip=True)
                if media_text and media_text not in certificates:
                    certificates.append(media_text)
        
        # Сохраняем данные
        education_data.append({
            'university': university,
            'degree': degree,
            'dates': dates,
            'details': details,
            'certificates': certificates
        })

# Выводим результаты
for edu in education_data:
    print(f"University: {edu['university']}")
    print(f"Degree: {edu['degree']}")
    print(f"Dates: {edu['dates']}")
    print("Details:")
    for detail in edu['details']:
        print(f"  - {detail}")
    if edu['certificates']:
        print("Certificates/Media:")
        for cert in edu['certificates']:
            print(f"  - {cert}")
    print("\n")


University: Heriot-Watt University
Degree: Master of Science (MSc), Petroleum Engineering
Dates: 2015 - 2016
Details:
  - Activities and societies: Participant of the best Field Development Project (1st place)
  - Graduation thesis: “Uncertainty reduction of drilling wells at the Field A”
  - Skills: Oil and Gas Engineering · Reservoir Engineering · Oil & Gas
  - Diploma
Certificates/Media:
  - Heriot-Watt University
  - Diploma


University: Peter the Great St.Petersburg Polytechnic University
Degree: Postgraduate Degree, Intelligent Systems
Dates: Feb 2023 - Jun 2023
Details:
  - Grade: A
  - Activities and societies: - High-level Design of Information Control Systems - Project Management - Corporate Information Systems - Intelligent Systems - Software Development Technologies (ML) - Group Project: Safety Helmet Detection expert system - Individual Project: Modelling the system of Permeate neutralisation plant N1
  - A Group Project goal Develop an expert system for recognizing the w

In [17]:
profile_data['Education'] = education_data

#### Licenses & certifications

In [18]:
#********** GO TO Licenses & certifications PAGE *************

driver.get('https://www.linkedin.com/in/zubenkoey/details/certifications/')
time.sleep(2)

page_source = driver.page_source

In [19]:
#********** CHECK THE Licenses & certifications PAGE *************

# soup = BeautifulSoup(page_source, 'lxml')
# sections = soup.find_all('div', {'class': 'scaffold-finite-scroll__content'})

In [20]:
#********** EXTRACT Licenses & certifications *************

# Предположим, что page_source содержит HTML-код страницы с сертификатами
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')

# Находим все секции сертификатов
sections = soup.find_all('div', {'class': 'scaffold-finite-scroll__content'})

# Список для хранения данных о сертификатах
certification_data = []

# Итерация по каждой секции
for section in sections:
    # Находим все элементы сертификатов внутри секции
    certification_items = section.find_all('li', class_='pvs-list__paged-list-item')
    
    for item in certification_items:
        # Извлечение названия сертификата
        name_tag = item.find('div', class_='display-flex align-items-center mr1 hoverable-link-text t-bold')
        certification_name = 'N/A'
        if name_tag:
            span = name_tag.find('span', attrs={'aria-hidden': 'true'})
            if span:
                certification_name = span.get_text(strip=True)
        
        # Извлечение издателя (issuer)
        issuer_tag = item.find('span', class_='t-14 t-normal')
        issuer = 'N/A'
        if issuer_tag:
            span = issuer_tag.find('span', attrs={'aria-hidden': 'true'})
            if span:
                issuer = span.get_text(strip=True)
        
        # Извлечение даты выдачи
        date_tag = item.find('span', class_='t-14 t-normal t-black--light')
        issue_date = 'N/A'
        if date_tag:
            span = date_tag.find('span', attrs={'aria-hidden': 'true'})
            if span:
                issue_date = span.get_text(strip=True)
        
        # Извлечение Credential ID
        credential_id_tag = item.find_all('span', class_='t-14 t-normal t-black--light')
        credential_id = 'N/A'
        if len(credential_id_tag) >= 2:
            # Предполагается, что Credential ID находится во втором span
            span = credential_id_tag[1].find('span', attrs={'aria-hidden': 'true'})
            if span:
                credential_id = span.get_text(strip=True)
        
        # Извлечение навыков (Skills)
        skills = 'N/A'
        skills_container = item.find('div', class_='display-flex align-items-center t-14 t-normal t-black')
        if skills_container:
            skills_span = skills_container.find('span', attrs={'aria-hidden': 'true'})
            if skills_span:
                # Извлекаем текст после "Skills:"
                skills_text = skills_span.get_text(strip=True)
                if 'Skills:' in skills_text:
                    skills = skills_text.replace('Skills:', '').strip()
        
        # Извлечение сертификатов или медиа (если есть)
        certificates = []
        # Находим все 'a' теги с классом 'optional-action-target-wrapper' внутри сертификата
        media_links = item.find_all('a', class_='optional-action-target-wrapper')
        for media in media_links:
            # Избегаем ссылок на редактирование или просмотр реорганизации
            aria_label = media.get('aria-label', '').lower()
            if 'edit certification' in aria_label or 'show credential' in aria_label:
                continue  # пропускаем ссылки на редактирование и просмотр
            # Извлекаем только span с aria-hidden="true"
            span = media.find('span', attrs={'aria-hidden': 'true'})
            if span:
                media_text = span.get_text(strip=True)
                if media_text and media_text not in certificates:
                    certificates.append(media_text)
        
        # Извлечение дополнительных деталей (например, Skills)
        # В данном случае уже извлечены навыки, но если есть другие детали, можно их добавить
        
        # Сохранение данных о сертификате
        certification_data.append({
            'certification_name': certification_name,
            'issuer': issuer,
            'issue_date': issue_date,
            'credential_id': credential_id,
            'skills': skills,
            'certificates_or_media': certificates
        })

# Вывод извлеченных данных
for cert in certification_data:
    print(cert)

{'certification_name': 'Build a Machine Learning Model Skill Path', 'issuer': 'Codecademy', 'issue_date': 'Issued Nov 2024', 'credential_id': 'N/A', 'skills': 'Artificial Intelligence (AI) · Python', 'certificates_or_media': ['Build a Machine Learning Model Skill Path', 'Certificate | Codecademy.pdf', 'Ml_Capstone_Project at GitHub']}
{'certification_name': 'AI Engineering Specialization', 'issuer': 'Scrimba', 'issue_date': 'Issued Mar 2024', 'credential_id': 'Credential ID GANNTUR4L26C', 'skills': 'Large Language Models (LLM) · Artificial Intelligence (AI) · Prompt Engineering', 'certificates_or_media': ['AI Engineering Specialization']}
{'certification_name': 'Supply Chain Analytics Specialization', 'issuer': 'Rutgers University', 'issue_date': 'Issued Jan 2024', 'credential_id': 'Credential ID 4LXCK6T92GYY', 'skills': 'Inventory Analysis · Demand Forecasting · Supply Chain Risk Management · Business Analytics · Forecasting · Supply Chain', 'certificates_or_media': ['Supply Chain Ana

In [21]:
profile_data['Certification'] = certification_data

#### Social MEdia

In [22]:
#********** GO TO MEDIA PAGE *************

driver.get('https://www.linkedin.com/in/zubenkoey/recent-activity/all/')
time.sleep(2)
page_source = driver.page_source

In [24]:
#********** EXTRACT MEDIAs *************

# Инициализация BeautifulSoup
soup = BeautifulSoup(page_source, 'lxml')

# Находим контейнер всех постов
sections = soup.find('div', {'class': 'scaffold-finite-scroll__content'})
posts = sections.find_all('li', {'class': 'uZTLYReyXHzCftLccdpNxgzddhEgFKMTEnQ'})

# Список для сохранения результатов
data = []

# Извлечение данных из каждого поста
for post in posts:
    try:
        # Дата
        date_element = post.find('a', {'class': 'update-components-actor__sub-description-link'})
        date = date_element.get_text(strip=True) if date_element else None
        
        # Автор
        author_element = post.find('span', {'class': 'NyBDmOsEjERFePrZbwKjouqNUZEwPnWBI'})
        author = author_element.get_text(strip=True) if author_element else None
        
        # Контент
        content_element = post.find('div', {'class': 'feed-shared-inline-show-more-text'})
        content = content_element.get_text(strip=True) if content_element else None
        
        # Реакции
        reactions_element = post.find('li', {'class': 'social-details-social-counts__reactions'})
        reactions = reactions_element.get_text(strip=True) if reactions_element else None
        
        # Комментарии
        comments_element = post.find('li', {'class': 'social-details-social-counts__comments'})
        comments = comments_element.get_text(strip=True) if comments_element else None
        
        # Репосты (может быть отсутствовать)
        reposts_element = post.find('button', {'aria-label': 'Repost'})
        reposts = reposts_element.get_text(strip=True) if reposts_element else "0"
        
        # Впечатления (если есть аналитика)
        impressions_element = post.find('strong', {'class': 'ca-entry-point__num-views'})
        impressions = impressions_element.get_text(strip=True) if impressions_element else None

        # Добавляем только те посты, где дата, автор и контент существуют
        if date and author and content:
            data.append({
                'Дата': date,
                'Автор': author,
                'Контент': content,
                'Реакции': reactions or "0",
                'Комментарии': comments or "0",
                'Репосты': reposts,
                'Впечатления': impressions or "N/A"
            })
    except Exception as e:
        print(f"Ошибка обработки поста: {e}")

# Вывод данных
data

[{'Дата': '6d •6 days ago',
  'Автор': 'Egor ZubenkoEgor Zubenko',
  'Контент': 'Build a Machine Learning Model Skill Path fromCodecademy!So, it was useful to refresh my knowledge of regression, classification, and working with PyTorch and scikit-learn.',
  'Реакции': '9Maksim Stepura and 8 others',
  'Комментарии': '1 comment',
  'Репосты': '0',
  'Впечатления': 'N/A'},
 {'Дата': '1mo •1 month ago',
  'Автор': 'Egor ZubenkoEgor Zubenko',
  'Контент': 'It really sounds interesting ))"""Based on our interactions, it seems you have a strong interest in data analysis, AI systems, cryptography, and project management. A career path you might enjoy, which perhaps hasn’t crossed your mind, could be **AI Product Manager or AI Solutions Architect**.Here’s why:1. **AI Product Manager**: You already work with advanced analytics, machine learning, and agent-based systems like LangGraph and CrewAI. This role would allow you to combine your deep understanding of data, product development, and marke

In [25]:
profile_data['Posts'] = data

### Result

In [26]:
#********** CHECK THE RESULT *************

profile_data

{'name': 'Egor Zubenko',
 'url': 'https://www.linkedin.com/in/zubenkoey/',
 'headline': 'Business System Analyst @ Webvork | Python, SQL, OpenAI',
 'About': "I'm an Analyst with a strong background in the upstream sector of Oil&Gas industry as a Petroleum Engineer, and Business Analysis for IT projects. Currently, I am involved in various IT projects, such as conducting a comprehensive CRM system analysis, proposing a new algorithm for assigning orders to call center operators, and describing a system feature that enables data-driven decision making. I would like to move on and develop in the field of business and data analysis, implement AI solutions and develop IT products.",
 'Experience': [{'company': 'Webvork ',
   'position': 'Business System AnalystBusiness System Analyst',
   'workin_period': 'Apr 2023 - Present ',
   'duration': ' 1 yr 9 mos',
   'description': "As a Business Analyst and Data Specialist, I've leveraged my expertise in Python, SQL, and OpenAI's API to drive com

In [27]:
# Закрываем драйвер
driver.quit()

In [33]:
import json
file_path = "profile_data.json"


with open(file_path, "w", encoding="utf-8") as json_file:
    json.dump(profile_data, json_file, ensure_ascii=False, indent=4)