# Importações

In [1]:
import os
import requests
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta


import numpy as np
import pandas as pd
import seaborn as sns

# testing
from traceback import print_stack
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import *
from utilities.custom_logger import CustomLogger
import logging
import time
import os

# Classes e Funções

In [2]:
class Job:
    """Plain record holding the fields collected for one job posting."""

    def __init__(self, position, company, location, posted_date, no_applicants, date_collected,
                 type_workplace, required_skills, competitive_advantages, level, worktype,
                 description):
        # Every constructor argument is stored under an attribute of the same name.
        self.position, self.company, self.location = position, company, location
        self.posted_date, self.no_applicants = posted_date, no_applicants
        self.date_collected, self.type_workplace = date_collected, type_workplace
        self.required_skills = required_skills
        self.competitive_advantages = competitive_advantages
        self.level, self.worktype, self.description = level, worktype, description

    @staticmethod
    def calculate_date(quantity, type_of_date):
        """
        Convert a relative age into an absolute datetime.

        In Portuguese, LinkedIn reports posting dates as 'Há 3 dias',
        'Há 3 semanas' and similar. Given the number (quantity) and the unit
        word (type_of_date), return the current local time minus that amount.
        Returns np.nan when the unit word is not recognised.
        """
        # Unit word fragment -> timedelta keyword, checked in this order.
        fixed_length_units = (
            ('minuto', 'minutes'),
            ('hora', 'hours'),
            ('dia', 'days'),
            ('semana', 'weeks'),
        )
        for fragment, keyword in fixed_length_units:
            if fragment in type_of_date:
                return datetime.today() - timedelta(**{keyword: quantity})

        # Months need calendar arithmetic; 'ê' is folded so 'mês' matches too.
        if 'mes' in re.sub('ê', 'e', type_of_date):
            return datetime.today() - relativedelta(months=quantity)
        return np.nan

class Company:
    """Plain record holding a company's name, size and sector."""

    def __init__(self, name, size, sector):
        self.name, self.size, self.sector = name, size, sector


In [3]:
class SeleniumDriver():
# https://github.com/Shivam23Thaman/POM-Project/blob/master/base/selenium_driver.py
# https://medium.com/analytics-vidhya/creating-our-own-selenium-driver-class-in-python3-x-and-important-python-concepts-8bf92d702230

# Falta melhorar o sistema de log, que está gerando arquivos demais


    log = CustomLogger(logging.DEBUG)

    def __init__(self, driver):
        """Keep a reference to the WebDriver that the helper methods operate on."""
        self.driver = driver

    @staticmethod
    def initialize_driver():
        """
        Create a Chrome WebDriver that reuses a local Chrome user profile.

        Declared as a static method so it can be called on the class or on an
        instance (it takes no ``self``).

        Returns:
            A new ``webdriver.Chrome`` instance.
        """
        # Imported locally so this method does not depend on a file-level import.
        from selenium.webdriver.chrome.service import Service

        options = webdriver.ChromeOptions()
        # Reusing the profile directory keeps the browser's saved session data.
        options.add_argument(r"--user-data-dir=C:\Users\alexa\AppData\Local\Google\Chrome\User Data\\")
        # Selenium 4 deprecates the executable_path keyword on webdriver.Chrome;
        # the driver binary is passed through a Service object instead.
        service = Service(executable_path='./resources/chromedriver.exe')
        return webdriver.Chrome(service=service, options=options)

    def screen_shot(self, resultMessage):
        """
        Take a screenshot of the web page currently open in the driver.

        The file is named "<resultMessage>.<epoch milliseconds>.png" and is
        written to "../screenshots/" relative to this source file, or relative
        to the current working directory when ``__file__`` is not defined
        (e.g. when the code runs in a notebook cell).

        Failures are logged and never raised.
        """
        fileName = resultMessage + "." + str(round(time.time() * 1000)) + ".png"
        screenshotDirectory = "../screenshots/"
        relativeFileName = screenshotDirectory + fileName
        try:
            currentDirectory = os.path.dirname(__file__)
        except NameError:
            # __file__ does not exist in interactive sessions.
            currentDirectory = os.getcwd()
        destinationFile = os.path.join(currentDirectory, relativeFileName)
        destinationDirectory = os.path.join(currentDirectory, screenshotDirectory)

        try:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(destinationDirectory, exist_ok=True)
            self.driver.save_screenshot(destinationFile)
            self.log.info("Screenshot save to directory: " + destinationFile)
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            self.log.error("### Exception Occurred when taking screenshot")
            print_stack()

    def getTitle(self):
        """Return the title of the page currently loaded in the driver."""
        return self.driver.title
    
    def get_by_type(self, locatorType):
        """
        Map a short locator-type name to the matching selenium ``By`` constant.

        The name is matched case-insensitively. Supported names are id, name,
        xpath, css, class, link and tag. For any other name a message is
        logged and False is returned.
        """
        locatorType = locatorType.lower()
        strategies = {
            "id": By.ID,
            "name": By.NAME,
            "xpath": By.XPATH,
            "css": By.CSS_SELECTOR,
            "class": By.CLASS_NAME,
            "link": By.LINK_TEXT,
            "tag": By.TAG_NAME,
        }
        if locatorType in strategies:
            return strategies[locatorType]
        self.log.info("Locator type " + locatorType +
                      " not correct/supported")
        return False

    def get_element(self, locator, locatorType="id", parent=True, parent_element=None):
        """
        Find a single element.

        When parent is truthy the search starts at the driver (whole page);
        otherwise it starts at parent_element. Any exception raised during the
        lookup is logged and printed, and None is returned.
        """
        found = None
        try:
            locatorType = locatorType.lower()
            strategy = self.get_by_type(locatorType)
            search_root = self.driver if parent else parent_element
            found = search_root.find_element(strategy, locator)
            self.log.info("Element found with locator: " + locator +
                          " and  locatorType: " + locatorType)
        except Exception as e:
            self.log.info("Element not found with locator: " + locator +
                          " and locatorType: " + locatorType)
            print(e)
        return found

    def get_element_list(self, locator, locatorType="id", parent=True, parent_element=None):
        """
        Find every element matching the locator.

        When parent is truthy the search starts at the driver (whole page);
        otherwise it starts at parent_element. Returns the list produced by
        ``find_elements`` and logs whether it was empty. Unlike get_element,
        exceptions are not caught here.
        """
        locatorType = locatorType.lower()
        strategy = self.get_by_type(locatorType)
        search_root = self.driver if parent else parent_element
        matches = search_root.find_elements(strategy, locator)
        outcome = ("Element list FOUND with locator: " if len(matches) > 0
                   else "Element list NOT FOUND with locator: ")
        self.log.info(outcome + locator + " and locatorType: " + locatorType)
        return matches

    def click_element(self, locator="", locatorType="id", element=None):
        """
        Click on an element.

        Either provide element or a combination of locator and locatorType;
        a non-empty locator takes precedence over element. Failures
        (including a missing element) are logged and swallowed.
        """
        try:
            if locator:  # a non-empty locator overrides any element passed in
                element = self.get_element(locator, locatorType)
            element.click()
            self.log.info("Clicked on element with locator: " + locator +
                          " locatorType: " + locatorType)
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            self.log.info("Cannot click on the element with locator: " + locator +
                          " locatorType: " + locatorType)
            print_stack()

    def send_keys(self, data, locator="", locatorType="id", element=None):
        """
        Send keys to an element.

        Either provide element or a combination of locator and locatorType;
        a non-empty locator takes precedence over element. Failures
        (including a missing element) are logged and swallowed.
        """
        try:
            if locator:  # a non-empty locator overrides any element passed in
                element = self.get_element(locator, locatorType)
            element.send_keys(data)
            self.log.info("Sent data on element with locator: " + locator +
                          " locatorType: " + locatorType)
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            self.log.info("Cannot send data on the element with locator: " + locator +
                          " locatorType: " + locatorType)
            print_stack()

    def clear_field(self, locator="", locatorType="id"):
        """
        Clear an element field.

        When the element cannot be found, a message is logged and the method
        returns without raising, matching how click_element and send_keys
        treat a missing element.
        """
        element = self.get_element(locator, locatorType)
        if element is None:
            # get_element returns None on failure; calling .clear() on it
            # would raise an unhelpful AttributeError.
            self.log.info("Cannot clear field with locator: " + locator +
                          " locatorType: " + locatorType)
            return
        element.clear()
        self.log.info("Clear field with locator: " + locator +
                      " locatorType: " + locatorType)

    def get_text(self, locator="", locatorType="id", element=None, info=""):
        """
        Get the text of an element.

        Either provide element or a combination of locator and locatorType;
        a non-empty locator takes precedence over element. The element's
        ``text`` is used, falling back to its "innerText" attribute when
        ``text`` is empty.

        Returns:
            The text with surrounding whitespace stripped, or None when the
            element is missing or the text cannot be read.
        """
        try:
            if locator:  # a non-empty locator overrides any element passed in
                element = self.get_element(locator, locatorType)
            if not element:
                return None

            text = element.text
            if not text:
                text = element.get_attribute("innerText")
            if text is None:
                return None
            text = text.strip()
        except Exception as e:
            self.log.info("Failed to get text on element " + info)
            print(e)
            return None

        if text:
            # Logging is kept outside the extraction try-block: a failure while
            # writing the log (e.g. an encoding error on characters the log
            # handler cannot encode) must not discard text that was read fine.
            try:
                self.log.info("Getting text on element :: " + info)
                self.log.info("The text is :: '" + text + "'")
            except Exception as e:
                print(e)
        return text

    def is_element_present(self, locator="", locatorType="id", element=None):
        """
        Check if element is present.

        Either provide element or a combination of locator and locatorType;
        a non-empty locator takes precedence over element.

        Returns:
            True when an element is available, False otherwise (including
            when an unexpected error occurs).
        """
        try:
            if locator:  # a non-empty locator overrides any element passed in
                element = self.get_element(locator, locatorType)
            if element is None:
                self.log.info("Element not present with locator: " + locator +
                              " locatorType: " + locatorType)
                return False
            self.log.info("Element present with locator: " + locator +
                          " locatorType: " + locatorType)
            return True
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            print("Element not found")
            return False

    def is_element_displayed(self, locator="", locatorType="id", element=None):
        """
        Check if element is displayed.

        Either provide element or a combination of locator and locatorType;
        a non-empty locator takes precedence over element.

        Returns:
            The result of the element's ``is_displayed()``; False when the
            element is missing or an error occurs.
        """
        try:
            if locator:  # a non-empty locator overrides any element passed in
                element = self.get_element(locator, locatorType)
            if element is None:
                self.log.info("Element not displayed")
                return False
            isDisplayed = element.is_displayed()
            # Log the actual state instead of always claiming it is displayed.
            if isDisplayed:
                self.log.info("Element is displayed")
            else:
                self.log.info("Element not displayed")
            return isDisplayed
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            print("Element not found")
            return False

    def wait_for_element(self, locator, locatorType="id",
                               timeout=10, pollFrequency=0.5):
        """
        Wait until an element is clickable.

        Parameters:
            locator       - locator of the element to wait for
            locatorType   - locator type name understood by get_by_type
            timeout       - maximum number of seconds to wait
            pollFrequency - seconds between checks
        Returns:
            The element once it is clickable, or None when the wait fails
            (the failure is logged, not raised).
        """
        element = None
        try:
            byType = self.get_by_type(locatorType)
            self.log.info("Waiting for maximum :: " + str(timeout) +
                  " :: seconds for element to be clickable")
            wait = WebDriverWait(self.driver, timeout=timeout,
                                 poll_frequency=pollFrequency,
                                 ignored_exceptions=[NoSuchElementException,
                                                     ElementNotVisibleException,
                                                     ElementNotSelectableException])
            element = wait.until(EC.element_to_be_clickable((byType, locator)))
            self.log.info("Element appeared on the web page")
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            self.log.info("Element not appeared on the web page")
            print_stack()
        return element

    def web_scroll(self, direction="up", px=800):
        """
        Scroll the window vertically by px pixels.

        direction "up" scrolls towards the top, "down" towards the bottom;
        any other value does nothing.
        """
        if direction == "up":
            offset = f"-{px}"
        elif direction == "down":
            offset = f"{px}"
        else:
            return
        self.driver.execute_script(f"window.scrollBy(0, {offset});")

    def switch_to_frame(self, id="", name="", title='',index=None):
        """
        Switch the driver into a frame.

        The first non-empty value among id, name and title is passed to
        ``driver.switch_to.frame``; when all three are empty, index is passed.

        Parameters:
            1. Required:
                None
            2. Optional:
                1. id    - id of the iframe
                2. name  - name of the iframe
                3. title - title of the iframe
                4. index - index of the iframe
        Returns:
            None
        Exception:
            None are caught here.

        NOTE(review): Selenium documents string frame references as a name or
        id; confirm that passing a title string actually selects a frame.
        """
        frame_reference = id or name or title or index
        self.driver.switch_to.frame(frame_reference)

    def switch_to_default_content(self):
        """
        Switch back to the default content.

        Delegates to ``driver.switch_to.default_content()``.

        Parameters:
            None
        Returns:
            None
        Exception:
            None are caught here.
        """
        self.driver.switch_to.default_content()

    def get_element_attribute_value(self, attribute, element=None, locator="", locatorType="id"):
        """
        Read one attribute of an element.

        Parameters:
            1. Required:
                1. attribute - name of the attribute to read

            2. Optional:
                1. element     - element to read from (used when locator is empty)
                2. locator     - locator of the element; when non-empty it
                                 takes precedence over element
                3. locatorType - locator type used together with locator

        Returns:
            Whatever the element's ``get_attribute(attribute)`` returns
        Exception:
            None are caught here; a missing element raises AttributeError.
        """
        target = (self.get_element(locator=locator, locatorType=locatorType)
                  if locator else element)
        return target.get_attribute(attribute)

    def is_enabled(self, locator, locatorType="id", info=""):
        """
        Check if element is enabled.

        When the element has a "disabled" attribute, the result is the
        element's ``is_enabled()``. Otherwise the element counts as enabled
        unless its "class" attribute contains the text "disabled".

        Parameters:
            1. Required:
                1. locator - Locator of the element to check
            2. Optional:
                1. locatorType - locator type name understood by get_by_type
                2. info - Information about the element, label/name of the element
        Returns:
            boolean (False when the state cannot be determined)
        Exception:
            None are raised; failures are logged.
        """
        # The helper is named get_element; getElement does not exist on this class.
        element = self.get_element(locator, locatorType=locatorType)
        enabled = False
        try:
            attributeValue = self.get_element_attribute_value(element=element, attribute="disabled")
            if attributeValue is not None:
                enabled = element.is_enabled()
            else:
                # An element without a class attribute yields None; treat it as
                # an empty class list instead of failing on the concatenation.
                value = self.get_element_attribute_value(element=element, attribute="class") or ""
                self.log.info("Attribute value From Application Web UI --> :: " + value)
                enabled = "disabled" not in value
            if enabled:
                self.log.info("Element :: '" + info + "' is enabled")
            else:
                self.log.info("Element :: '" + info + "' is not enabled")
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
            self.log.info("Element :: '" + info + "' state could not be found")
        return enabled

In [8]:
class LinkedinScraper(SeleniumDriver):
    MAIN_LINKEDIN_LINK = 'https://www.linkedin.com/jobs/search/?'
    searched_job = 'analista de dados'
    keywords = 'keywords=' + searched_job
    location = 'location=Brasil'
    final_link = MAIN_LINKEDIN_LINK + keywords + '&' + location + '&' + 'geoId=106057199'

    def __init__(self, driver):
        """Pass the WebDriver on to the SeleniumDriver initializer."""
        super().__init__(driver)

    def get_linkedin(self, page=0):
        """
        Open one page of the job search and print the URL that was requested.

        Parameters:
            page - zero-based page number; the ``start`` query parameter is
                   set to page * 25 (assumes 25 results per page).
        """
        start = page * 25
        # Build the URL once so the printed value is exactly what was opened.
        url = f'{self.final_link}&start={start}'
        self.driver.get(url)
        print(url)

    def get_data_from_linkedin_page(self, limit=False, limit_qtd=5):
        """
        Scrape the job postings listed on the currently open search page.

        Every link in the job list is clicked and the details shown for it are
        read into a dict. The keys 'title', 'company', 'location',
        'type_workplace', 'applicant_count', 'posted_date' and 'about_job' are
        always set (to None / np.nan when the value is unavailable); 'skills',
        'worktype', 'level' and 'company_size' are set only when the
        corresponding insight line exists.

        Parameters:
            limit     - when truthy, stop after limit_qtd postings
            limit_qtd - number of postings to collect when limit is truthy
        Returns:
            list of dicts, one per posting; empty when no job link is found
        """
        self.wait_for_element('scaffold-layout__list-container', 'class')
        job_list = self._get_job_links()
        if not job_list:
            return []

        # Page down inside the list so every job is loaded, then read the links again.
        for _ in range(10):
            self.send_keys(Keys.PAGE_DOWN, element=job_list[0])
        job_list = self._get_job_links()

        job_collection = []
        for job in job_list:
            job_data = dict()
            job.click()
            self.wait_for_element('jobs-unified-top-card__job-insight', 'class', timeout=10)
            self.wait_for_element('jobs-unified-top-card__posted-date', 'class', timeout=10)
            self.wait_for_element('//*[@id="job-details"]/span', 'xpath', timeout=10)
            job_content = self.get_element('jobs-unified-top-card__content--two-pane', 'class')

            # First area of information (top)
            job_data['title'] = self._child_text(job_content, 'h2', 'tag')
            job_data['company'] = self._child_text(job_content, 'jobs-unified-top-card__company-name', 'class')
            job_data['location'] = self._child_text(job_content, 'jobs-unified-top-card__bullet', 'class')
            job_data['type_workplace'] = self._child_text(job_content, 'jobs-unified-top-card__workplace-type', 'class')
            applicant_count = self._child_text(job_content, 'jobs-unified-top-card__applicant-count', 'class')
            if isinstance(applicant_count, str) and applicant_count.split():
                # Keep only the leading token (the number).
                applicant_count = applicant_count.split()[0]
            job_data['applicant_count'] = applicant_count

            # Posted date: the element is optional, so parsing must tolerate None.
            job_data['posted_date'] = self._parse_posted_date(
                self._child_text(job_content, 'jobs-unified-top-card__posted-date', 'class'))

            # Second area of information (job insight)
            job_insights = []
            job_insight = self.get_element('mt5', 'class', False, job_content)
            if job_insight is not None:
                # Missing texts become '' so list positions stay aligned.
                job_insights = [self.get_text(element=insight) or ''
                                for insight in self.get_element_list('li', 'tag', False, job_insight)]
            for insight in job_insights:
                if 'competências' in insight.lower():
                    _, separator, skills = insight.partition(': ')
                    if separator:
                        job_data['skills'] = skills
            if job_insights:
                # The first insight reads "<worktype> · <level>"; the level is optional.
                headline = job_insights[0].split('·')
                job_data['worktype'] = headline[0].strip()
                if len(headline) > 1:
                    job_data['level'] = headline[1].strip()
            if len(job_insights) > 1:
                job_data['company_size'] = job_insights[1]

            # Main job content:
            job_data['about_job'] = self.get_text(element=self.get_element('job-details', 'id'))

            job_collection.append(job_data)
            if limit and len(job_collection) == limit_qtd:
                break

        return job_collection

    def _get_job_links(self):
        """Return the <a> elements inside the job list container ([] when it is missing)."""
        container = self.get_element('scaffold-layout__list-container', 'class')
        if container is None:
            return []
        return self.get_element_list('a', 'tag', False, parent_element=container)

    def _child_text(self, parent_element, locator, locatorType):
        """Return the text of the first match for locator inside parent_element, or None."""
        return self.get_text(element=self.get_element(locator, locatorType, False, parent_element))

    @staticmethod
    def _parse_posted_date(posted_text):
        """Convert text such as 'Há 3 dias' into a datetime; np.nan when it cannot be parsed."""
        tokens = posted_text.split() if isinstance(posted_text, str) else []
        # Use the first number that is followed by a unit word.
        for position, token in enumerate(tokens[:-1]):
            if token.isdecimal():
                return Job.calculate_date(int(token), tokens[position + 1])
        return np.nan
        
        

In [9]:
# Start Chrome through the scraper's driver factory, then open the first
# page of search results (get_linkedin defaults to page=0).
scraper = LinkedinScraper(LinkedinScraper.initialize_driver())
scraper.get_linkedin()

  driver = webdriver.Chrome(executable_path='./resources/chromedriver.exe', options=options)


https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=0


In [10]:
# Scrape the page that is currently open, with no limit on the number of jobs.
jobs = scraper.get_data_from_linkedin_page()

Message: no such element: Unable to locate element: {"method":"css selector","selector":".jobs-unified-top-card__applicant-count"}
  (Session info: chrome=114.0.5735.90)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00E28893+48451]
	(No symbol) [0x00DBB8A1]
	(No symbol) [0x00CC5058]
	(No symbol) [0x00CF0467]
	(No symbol) [0x00CF069B]
	(No symbol) [0x00CE9631]
	(No symbol) [0x00D0A304]
	(No symbol) [0x00CE9586]
	(No symbol) [0x00D0A614]
	(No symbol) [0x00D1C482]
	(No symbol) [0x00D0A0B6]
	(No symbol) [0x00CE7E08]
	(No symbol) [0x00CE8F2D]
	GetHandleVerifier [0x01088E3A+2540266]
	GetHandleVerifier [0x010C8959+2801161]
	GetHandleVerifier [0x010C295C+2776588]
	GetHandleVerifier [0x00EB2280+612144]
	(No symbol) [0x00DC4F6C]
	(No symbol) [0x00DC11D8]
	(No symbol) [0x00DC12BB]
	(No symbol) [0x00DB4857]
	BaseThreadInitThunk [0x75D57D59+25]
	RtlInitializeExceptionChain [0x7721B74B+107]
	RtlClearBits [0x7721B6CF+191]

Message: no such element: Unable to locate element: {"method":"css selector","s

In [None]:
# Ad-hoc check: read the text of the element whose id is 'job-details'
# (get_element uses locatorType="id" by default).
a = scraper.get_text(element=scraper.get_element('job-details'))
a

oi
'charmap' codec can't encode character '\U0001f49a' in position 2131: character maps to <undefined>


In [11]:
pd.DataFrame(jobs)

Unnamed: 0,title,company,location,type_workplace,applicant_count,posted_date,skills,worktype,company_size,about_job,level
0,Analista de datos,DESTINIA,Curitiba e Região,Presencial,128.0,2023-05-23 22:21:19.537360,"Comunicação, Banco de dados, e mais 8",Tempo integral,11-50 funcionários,Sobre a vaga\n¡Te queremos a ti!\n\nAgente de ...,
1,Analista de Social Listening Jr.,Empresa Confidencial,"São Paulo, São Paulo, Brasil",Híbrido,,2023-05-16 22:21:20.847548,"Redação, Análise de dados, e mais 8",Tempo integral,"51-200 funcionários · Tecnologia, Informação e...",Sobre a vaga\nEmpresa multinacional busca um A...,Júnior
2,Analytics Visualisation Analyst,Cargill,"São Paulo, São Paulo, Brasil",,51.0,2023-05-28 22:21:22.021718,"Analítica de dados, Visualização de dados, e m...",Tempo integral,+ de 10.001 funcionários · Fabricação de alime...,"Sobre a vaga\nWant to build a stronger, more s...",Assistente
3,Data Analyst,Pixodust Games,"São Paulo, Brasil",Remoto,,2023-05-23 22:21:23.225698,"Análise de dados, SQL, e mais 8",Tempo integral,11-50 funcionários,Sobre a vaga\nA Pixodust Games é uma publisher...,
4,Analista de BI,Bacio di Latte,"São Paulo, São Paulo, Brasil",Presencial,,2023-05-29 22:21:24.405598,"Comunicação, Relatórios e análises, e mais 8",Contrato,1.001-5.000 funcionários · Serviços de aliment...,Sobre a vaga\nBuscamos sempre o que há de melh...,Pleno-sênior
5,Analista de Dados Pleno,Conta Azul,"Joinville, Santa Catarina, Brasil",Híbrido,75.0,2023-05-29 22:21:25.521123,"Resolução de problemas, Python, e mais 8",Tempo integral,201-500 funcionários · Serviços financeiros,Sobre a vaga\nDESCRIÇÃO DA VAGA\nComo Analista...,Pleno-sênior
6,Analista de BI,GrupoSITI,Goiânia e Região,Presencial,55.0,2023-05-29 22:21:26.704465,"SQL, NoSQL, e mais 8",Tempo integral,11-50 funcionários · Serviços de recursos humanos,Sobre a vaga\nRESPONSABILIDADES E ATRIBUIÇÕES\...,Pleno-sênior
7,Sports Statistician,Genius Sports,"Macaé, Rio de Janeiro, Brasil",Presencial,34.0,2023-05-29 22:21:27.795622,"Setor de esportes, Esportes, e mais 6",Contrato,1.001-5.000 funcionários · Desenvolvimento de ...,Sobre a vaga\nLove sports?\n\nWe're looking fo...,Assistente
8,Analista de BI (Pleno),EAIBrasil,"São Paulo, São Paulo, Brasil",Híbrido,,2023-05-16 22:21:31.604571,"Tableau, Inteligência de negócios (BI), e mais 1",Tempo integral,51-200 funcionários,Sobre a vaga\nEstamos com uma oportunidade inc...,
9,Analista júnior,Electric Consultoria,"Porto Alegre, Rio Grande do Sul, Brasil",Presencial,101.0,2023-05-29 22:21:33.000913,"Microsoft Excel, Sistemas operacionais, e mais 5",Tempo integral,11-50 funcionários,Sobre a vaga\nVaga (s): Analista Jr.\nLocal: P...,


In [None]:
job_list = job_list.find_elements(By.TAG_NAME, 'a')
        for i in range(10):
            job_list[0].send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        job_list = self.find_element(By.CLASS_NAME, 'scaffold-layout__list-container')
        job_list = job_list.find_elements(By.TAG_NAME, 'a')

        # Getting the data for the dictionary
        job_collection = []
        i=0

        for job in job_list:
        
            job_data = dict()
            job.click()
            time.sleep(1.5)
            job_content = driver.find_element(By.CLASS_NAME, 'jobs-unified-top-card__content--two-pane')

            # First area of information (top)
            job_data['title'] = get_element(job_content, 'h2', 'tag')
            job_data['company'] = get_element(job_content, 'jobs-unified-top-card__company-name', 'class')
            job_data['location'] = get_element(job_content, 'jobs-unified-top-card__bullet', 'class')
            job_data['type_workplace'] = get_element(job_content, 'jobs-unified-top-card__workplace-type', 'class')
            job_data['applicant_count'] = get_element(job_content, 'jobs-unified-top-card__applicant-count', 'class')
            if isinstance(job_data['applicant_count'], str):
                job_data['applicant_count'] = job_data['applicant_count'].split()[0]

            # Calculating posted date
            if len(job_content.find_elements(By.CLASS_NAME, 'jobs-unified-top-card__posted-date')) > 0:
                quantity, temporal_type = get_element(job_content, 'jobs-unified-top-card__posted-date', 'class').split()[1:]
                job_data['posted_date'] = calculate_date(int(quantity), temporal_type)
            
            # Second area of information (job insight)
            try:
                job_insight = job_content.find_element(By.CLASS_NAME, 'mt5')
                job_insights = job_insight.find_elements(By.TAG_NAME, 'li')
                job_insights = [insight.get_attribute('innerText') for insight in job_insights]
                for insight in job_insights:
                    if 'competências' in insight.lower():
                        job_data['skills'] = insight.split(': ')[1]
                job_data['worktype'] = job_insights[0].split('·')[0].strip()
                if len(job_insights[0].split('·')) > 1:
                    job_data['level'] = job_insights[0].split('·')[1].strip()
                job_data['company_size'] = job_insights[1]
                
            except Exception as e:
                print('erro:', job_data['title'], '-', e)

            # Main job content:
            job_data['about_job'] = get_element(driver, 'job-details', 'id')


            job_collection.append(job_data)
            if limit:
                i+=1
                if i == limit_qtd:
                    break
        
        return job_collection
        

# Data Extraction

Primeiramente, preciso extrair os dados de meu interesse para constituir um database bacana. Farei isso com um webscraper no Linkedin, de início, embora possa pensar em usar outras plataformas de emprego se necessário.

Entendendo o link:
    https://www.linkedin.com/jobs/search/?currentJobId=3571662289&keywords=analista%20de%20dados&refresh=true
    
Parâmetros GET:
- currentJobId
- keywords: o que está sendo pesquisado
- refresh

Infelizmente, o BeautifulSoup não consegue nos retornar a página correta, exigindo o uso do Selenium.

## Selenium

A página de vagas do LinkedIn é dividida em dois painéis: um com a lista de vagas e outro com a descrição da vaga selecionada, começando pela primeira. A lista de vagas é carregada à medida que descemos por ela, então o código envia a tecla PAGE_DOWN repetidamente à lista para carregar todas as vagas da página.

Em seguida, guardamos todas as vagas numa lista de WebElements.

In [None]:
# NOTE(review): open_navigator and the module-level get_linkedin /
# get_data_from_linkedin_page called here are not defined in the visible part
# of this notebook; they look like an earlier, function-based version of
# LinkedinScraper. Confirm before re-running this cell.
driver = open_navigator(final_link)
all_jobs = []
# Visit 40 result pages and accumulate the jobs scraped from each one.
for page in range(40):
    get_linkedin(driver, final_link, page)
    all_jobs.extend(get_data_from_linkedin_page(driver))

  driver = webdriver.Chrome(executable_path='./resources/chromedriver.exe', options=options)


https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=0
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=25
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=50
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=75
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=100
erro: Programador - list index out of range
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=125
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=150
https://www.linkedin.com/jobs/search/?keywords=analista de dados&location=Brasil&geoId=106057199+&start=175
erro: Desenvolvedor jr. - list index out of range
https://www.linkedin.com/jobs/search/?keywords=

In [None]:
# One row per scraped job; keys missing from a job's dict become NaN.
df_jobs = pd.DataFrame(all_jobs)

In [None]:
df_jobs.head()

Unnamed: 0,title,company,location,type_workplace,applicant_count,posted_date,skills,worktype,level,company_size,about_job
0,"Data Analyst (Bangkok Based, Relocation Provided)",Agoda,"Porto Alegre, Rio Grande do Sul, Brasil",,4.0,2023-05-26 16:01:22.414820,"Comunicação, Capacidade de organização, e mais 8",Tempo integral,Júnior,"5.001-10.000 funcionários · Tecnologia, Inform...",Sobre a vaga\nAbout Agoda\n\n\n\n\nAgoda is an...
1,Football Statistician,Genius Sports,"Iporá, Goiás, Brasil",Presencial,43.0,2023-05-23 16:01:24.447378,"Esportes, Inglês, e mais 8",Contrato,Assistente,1.001-5.000 funcionários · Desenvolvimento de ...,Sobre a vaga\n\nLove sports?\n\n\n\n\nWe're lo...
2,Football Statistician,Genius Sports,"Tocantinópolis, Tocantins, Brasil",Presencial,25.0,2023-05-29 16:01:26.395033,"Coleta de dados, Futebol americano, e mais 8",Contrato,Assistente,1.001-5.000 funcionários · Desenvolvimento de ...,Sobre a vaga\n\nLove sports?\n\n\n\n\nWe're lo...
3,"Statistical Analyst (Bangkok Based, Relocation...",Agoda,"Brasília, Distrito Federal, Brasil",,10.0,2023-05-26 16:01:31.199296,"Comunicação, Capacidade de organização, e mais 8",Tempo integral,Júnior,"5.001-10.000 funcionários · Tecnologia, Inform...",Sobre a vaga\nAbout Agoda\n\n\n\n\nAgoda is an...
4,Analista de BI,Bacio di Latte,"São Paulo, São Paulo, Brasil",Presencial,,2023-05-29 16:01:36.172219,"Comunicação, Relatórios e análises, e mais 8",Contrato,Pleno-sênior,1.001-5.000 funcionários · Serviços de aliment...,Sobre a vaga\n\nBuscamos sempre o que há de me...


In [None]:
df_jobs.shape

(975, 11)

In [None]:
# Strip surrounding whitespace first, then display the distinct locations.
# As the last expression of the cell, the result of unique() is shown
# instead of being silently discarded.
df_jobs['location'] = df_jobs['location'].str.strip()
df_jobs['location'].unique()

In [None]:
# Work on a copy so the scraped DataFrame itself stays unmodified.
df_jobs2 = df_jobs.copy()

In [None]:
# Replace line breaks so each description fits on one CSV line. regex=False
# makes this a literal replacement regardless of the pandas version (the
# default of the regex parameter differs between pandas releases).
df_jobs2['about_job'] = df_jobs['about_job'].str.replace('\n', ' - ', regex=False)

In [None]:
# Export without the index column; ',' is already the default separator.
df_jobs2.to_csv('df_jobs_v1.csv', index=False)

In [None]:
# Read the exported file back so its shape and contents can be inspected.
a = pd.read_csv('df_jobs_v1.csv', sep=',')

In [None]:
a.shape

(975, 11)

In [None]:
a.head()

Unnamed: 0,title,company,location,type_workplace,applicant_count,posted_date,skills,worktype,level,company_size,about_job
0,"Data Analyst (Bangkok Based, Relocation Provided)",Agoda,"Porto Alegre, Rio Grande do Sul, Brasil",,4.0,2023-05-26 16:01:22.414820,"Comunicação, Capacidade de organização, e mais 8",Tempo integral,Júnior,"5.001-10.000 funcionários · Tecnologia, Inform...",Sobre a vaga - About Agoda - - - - - Agoda...
1,Football Statistician,Genius Sports,"Iporá, Goiás, Brasil",Presencial,43.0,2023-05-23 16:01:24.447378,"Esportes, Inglês, e mais 8",Contrato,Assistente,1.001-5.000 funcionários · Desenvolvimento de ...,Sobre a vaga - - Love sports? - - - - - W...
2,Football Statistician,Genius Sports,"Tocantinópolis, Tocantins, Brasil",Presencial,25.0,2023-05-29 16:01:26.395033,"Coleta de dados, Futebol americano, e mais 8",Contrato,Assistente,1.001-5.000 funcionários · Desenvolvimento de ...,Sobre a vaga - - Love sports? - - - - - W...
3,"Statistical Analyst (Bangkok Based, Relocation...",Agoda,"Brasília, Distrito Federal, Brasil",,10.0,2023-05-26 16:01:31.199296,"Comunicação, Capacidade de organização, e mais 8",Tempo integral,Júnior,"5.001-10.000 funcionários · Tecnologia, Inform...",Sobre a vaga - About Agoda - - - - - Agoda...
4,Analista de BI,Bacio di Latte,"São Paulo, São Paulo, Brasil",Presencial,,2023-05-29 16:01:36.172219,"Comunicação, Relatórios e análises, e mais 8",Contrato,Pleno-sênior,1.001-5.000 funcionários · Serviços de aliment...,Sobre a vaga - - Buscamos sempre o que há de ...
