In [10]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By  # Import for locating elements
from selenium.webdriver.common.keys import Keys  # Import for sending keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Scrap data

In [16]:
# Specify the path to the Chrome driver executable
driver_path = r'\workspaces\upWork_scrap\chromedriver-win64\chromedriver.exe'

# Create an instance of ChromeOptions
chrome_options = Options()

# Add any desired options
chrome_options.add_argument("--start-maximized")  # Maximize the browser window

# Initialize the webdriver with the options and driver path
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Obtener el URL del nuevo tab
url = "https://www.upwork.com/search/freelance-jobs/data-science?sort=recency&t=1"
driver.get(url)

# Utilizar WebDriverWait para esperar a que los elementos se vuelvan visibles
wait = WebDriverWait(driver, 10)  # Puedes ajustar el tiempo máximo de espera según sea necesario

# Lists to save data
title = []
description = []
level = []
price = []
category = []
date = []

for pages in range(1, 5):
    # Esperar a que los elementos estén presentes
    elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "section > article")))

    for element in elements:
        
        #element.click()
        
        # Obtener el título
        titleData = element.find_element(By.CSS_SELECTOR, "h2.h4").text
        title.append(titleData)
        
        # Obtener el nivel
        levelData = element.find_element(By.CSS_SELECTOR, "li:nth-child(2) > strong").text
        level.append(levelData)
        
        # Obtener el precio
        priceData = element.find_element(By.CSS_SELECTOR, "strong:nth-child(2)").text
        price.append(priceData)
        
        # Obtener la descripción
        descriptionData = element.find_element(By.CSS_SELECTOR, ".clamp .mb-0").text
        description.append(descriptionData)
        
        # Obtener las categorías
        try:
            categories = element.find_element(By.CSS_SELECTOR, "section.card-list-container > article div.air3-token-container").text
        except NoSuchElementException:
            categories = ""  # Si no se encuentra la categoría, establece NA como valor predeterminado
        category.append(categories)
        
        # Obtener la fecha de publicacion
        dateData = element.find_element(By.CSS_SELECTOR, ".text-light > span:nth-child(2)").text
        date.append(dateData)
        
        #click = driver.find_element(By.CSS_SELECTOR, '.air3-slider-close-desktop svg')
        #click.click()
        #time.sleep(2)

    click = driver.find_element(By.CSS_SELECTOR, '*:nth-child(2) > *:nth-child(10) > * > *:nth-child(1)')
    click.click()
# Cerrar el navegador
driver.quit()


NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


# Create df and clean data

In [105]:
import pandas as pd
from datetime import datetime

# Create date of scrap
date_scrapped = datetime.now().strftime('%Y-%m-%d')

# Create dataframe with variables
df = pd.DataFrame({'id': range(len(title)),
                   'date_scrapped': date_scrapped, 
              'date_posted': date,
              'title': title,
              'description': description,
              'level': level,
              'category_qty': category_len,
              'category': category,
              'price': price,
              'job_search': 'Data scientist'
})


# ------------------------------------------------
# Adjust date_posted to a datetime format
# ------------------------------------------------

# Create an aux variable to calculate date_posted in datetime format
df['date_aux'] = df['date_posted'].apply(lambda x: int(x.split('days')[0]) if 'days' in x else 0)

# Leave date_scrapped in datetime format
df['date_scrapped'] = pd.to_datetime(df['date_scrapped'], format='ISO8601')

# Create date column with datetime date post.
df['date_post'] = df['date_scrapped'] - pd.to_timedelta(df['date_aux'], unit='D')

# Select only variables to use
vars = ['id', 'date_scrapped', 'date_post', 'job_search', 'title', 'description', 'level', 'category_qty', 'category', 'price']
df = df[vars]

# ------------------------------------------------
# Create cathegory dataframe by id and date post
# ------------------------------------------------

# ------------------------------------------------
# Divide each cathegories in different columns
# ------------------------------------------------

# Dividir la columna 'category' en una lista de categorías
df['category'] = df['category'].str.split('\n')

# Crear un DataFrame temporal con las categorías como columnas
df_temp = df['category'].apply(pd.Series)

# Renombrar las columnas agregando un prefijo 'category'
df_temp = df_temp.rename(lambda x: f'cathegory{x + 1}', axis='columns')

# Combinar el DataFrame original con el DataFrame temporal
df = pd.concat([df, df_temp], axis=1)

# Drop variable that will not use
df = df.drop('category', axis=1)

# ------------------------------------------------
# Change values in level to 0, 1, 2
# ------------------------------------------------

# Mapear los valores a 0, 1 y 2
mapping = {'Entry level': 0, 'Intermediate': 1, 'Expert': 2}
df['level'] = df['level'].map(mapping)

# ------------------------------------------------
# Clean price column
# ------------------------------------------------

# Replace $ value and leave as float 
df['price'] = df['price'].str.replace('$', '')
df['price'] = df['price'].astype(float)


In [106]:
df

Unnamed: 0,id,date_scrapped,date_post,job_search,title,description,level,category_qty,price,cathegory1,cathegory2,cathegory3,cathegory4,cathegory5,cathegory6,cathegory7,cathegory8,cathegory9,cathegory10
0,0,2023-10-19,2023-10-19,Data scientist,Share Your Scientific Body Composition & Shape...,We are a non-profit scientific research instit...,0,40,$6,Health Science,Science,Communications,Human Body,Social Media Marketing,Research Interviews,+3,,,
1,1,2023-10-19,2023-10-19,Data scientist,Senior AI/ML Consultant,Following our successful integration of Genera...,2,40,"$2,000",GPT-4,Generative AI,Azure OpenAI Service,Large Language Model,Machine Learning,Artificial Intelligence,,,,
2,2,2023-10-19,2023-10-19,Data scientist,Embedded Systems Expert for Data Analysis and ...,"I am really happy to write you back. So, I am ...",2,40,$90,Data Science,Data Analysis,Python,Machine Learning,Embedded System,,,,,
3,3,2023-10-19,2023-10-19,Data scientist,Embedded Systems Expert for Data Analysis and ...,"I am really happy to write you back. So, I am ...",2,40,$90,Data Science,Data Analysis,Machine Learning Model,Computer Vision,Python,PyQt,Thermodynamics,+3,,
4,4,2023-10-19,2023-10-19,Data scientist,Use R to analyze pictures and identify areas o...,I will provide pictures for training the model...,1,40,$100,Data Science,R,Machine Learning,Python,,,,,,
5,5,2023-10-19,2023-10-19,Data scientist,Content Writer Required - Python & Tableau & G...,I'm looking for a content writter who can prep...,1,40,$5,Data Science,Tableau,Dashboard,Python,Content Creation,,,,,
6,6,2023-10-19,2023-10-19,Data scientist,Influencer Partner for Scientific Research,"As a nonprofit scientific research institute, ...",0,40,"$2,000",Health Science,Science,Social Media Marketing,Human Body,Health & Wellness,Health,Health & Fitness,+1,,
7,7,2023-10-19,2023-10-19,Data scientist,Web development for startup sports data scienc...,Looking for: Fullstack developer I am looking ...,1,40,"$1,000",API Integration,Python,Web Design,Web Development,JavaScript,CSS,HTML,,,
8,8,2023-10-19,2023-10-19,Data scientist,NET Developer,We are seeking a highly skilled and motivated ...,1,40,$6,.NET Framework,ASP.NET,C#,ASP.NET MVC,API,Microsoft SQL Server,SQL,JavaScript,jQuery,1.0
9,9,2023-10-19,2023-10-19,Data scientist,Recently Lose Weight? Share Your Story,"Did you do the hard work and lose the weight, ...",0,40,$25,Health Science,Digital Marketing,Human Body,Health & Fitness,Health,Video Camera,Video Editing,,,


# Update MySQL table

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Create connection with table
engine = create_engine('mysql+mysqlconnector://root:Bastyan1@192.168.1.95/upwork')
df_sql = pd.read_sql('upwork_advices', con = engine)


###################
# Check for duplicated 
# values in data_scrapped
###################

# Find commons values
commonValues = df_sql.values.flatten()  # Get all data from sql

# Delete commons values in df with df_sql
df_scrapped = df[~df.isin(commonValues).any(axis=1)]  

###################
# Update table 
###################

# Insert the DataFrame into the database
df_scrapped.to_sql(name='upwork_advices', con=engine, if_exists='append', index=False)