In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import requests
import pandas as pd
import math

In [2]:
def get_job_id_list():
    """Return the ``id`` attribute of every job card on the current results page.

    Uses the module-level ``driver``. Cards whose class is exactly
    ``cardContainer`` come first, in the order the driver returns them; the
    id of the card currently marked ``cardContainer activeCard`` (if any) is
    appended last.

    Returns:
        list: the ``id`` attribute value of each card found.
    """
    card_xpath = '//div[@class="srpResultCardContainer"]//div[@class="cardContainer"]'
    active_xpath = '//div[@class="srpResultCardContainer"]//div[@class="cardContainer activeCard"]'

    jobs_id = [card.get_attribute('id') for card in driver.find_elements(By.XPATH, card_xpath)]

    # find_elements returns an empty list when nothing matches, so a page
    # without an active card no longer aborts the scrape with an exception.
    # Only the first active card is used, as before.
    for active_card in driver.find_elements(By.XPATH, active_xpath)[:1]:
        jobs_id.append(active_card.get_attribute('id'))

    return jobs_id

In [3]:
def get_job_title(job_id):
    """Return the text of the ``jobTitle`` div inside the card with id ``job_id``.

    Uses the module-level ``driver``. ``find_element`` raises if the card or
    its title div is not present.
    """
    # The id is quoted so XPath compares it as a string. Unquoted, a numeric id
    # is compared as a number (so "0123" would also match 123) and a
    # non-numeric id is parsed as a child-element name and never matches.
    xpath = '//div[@id="{}"]//div[@class="jobTitle"]'.format(job_id)
    return driver.find_element(By.XPATH, xpath).text

In [4]:
def get_company_name(job_id):
    """Return the text of the company-name div inside the card with id ``job_id``.

    Uses the module-level ``driver``. The div is matched by a class attribute
    that contains ``companyName``. ``find_element`` raises if nothing matches.
    """
    # The id is quoted so XPath compares it as a string. Unquoted, a numeric id
    # is compared as a number and a non-numeric id is parsed as a child-element
    # name and never matches.
    xpath = '//div[@id="{}"]//div[contains(@class,"companyName")]'.format(job_id)
    return driver.find_element(By.XPATH, xpath).text

In [5]:
def get_position_type(job_id):
    """Return the text of the first ``details`` div in the card with id ``job_id``.

    Uses the module-level ``driver``. The page is assumed to list the position
    type first among the ``details`` divs (not verifiable from this notebook).

    Returns:
        str: the element text, or ``''`` when the card has no ``details`` div.
    """
    # The id is quoted so XPath compares it as a string rather than as a
    # number or a child-element name.
    xpath = '//div[@id="{}"]//div[@class="details"]'.format(job_id)
    details = driver.find_elements(By.XPATH, xpath)
    # An empty string instead of IndexError keeps one sparse card from
    # aborting the whole scrape.
    return details[0].text if details else ''

In [6]:
def get_job_location(job_id):
    """Return the text of the second ``details`` div in the card with id ``job_id``.

    Uses the module-level ``driver``. The page is assumed to list the location
    second among the ``details`` divs (not verifiable from this notebook).

    Returns:
        str: the element text, or ``''`` when the card has fewer than two
        ``details`` divs.
    """
    # The id is quoted so XPath compares it as a string rather than as a
    # number or a child-element name.
    xpath = '//div[@id="{}"]//div[@class="details"]'.format(job_id)
    details = driver.find_elements(By.XPATH, xpath)
    # An empty string instead of IndexError keeps one sparse card from
    # aborting the whole scrape.
    return details[1].text if len(details) > 1 else ''

In [7]:
def get_experience(job_id):
    """Return the text of the third ``details`` div in the card with id ``job_id``.

    Uses the module-level ``driver``. The page is assumed to list the required
    experience third among the ``details`` divs (not verifiable from this
    notebook).

    Returns:
        str: the element text, or ``''`` when the card has fewer than three
        ``details`` divs.
    """
    # The id is quoted so XPath compares it as a string rather than as a
    # number or a child-element name.
    xpath = '//div[@id="{}"]//div[@class="details"]'.format(job_id)
    # Query the DOM once: a second lookup costs another driver round trip and
    # could return a different list than the one whose length was checked.
    details = driver.find_elements(By.XPATH, xpath)
    if len(details) > 2:
        return details[2].text
    return ''

In [8]:
def get_skills(job_id):
    """Return the skills listed on the card with id ``job_id`` as one string.

    Uses the module-level ``driver``. Collects the text of every
    ``skillTitle`` div inside the card, skips empty texts, and joins the rest
    with ``','`` (no trailing comma).

    Returns:
        str: comma-separated skill names, or ``''`` when none are found.
    """
    # The id is quoted so XPath compares it as a string rather than as a
    # number or a child-element name.
    xpath = '//div[@id="{}"]//div[@class="skillTitle"]'.format(job_id)
    # Read .text once per element; each access is a call to the driver.
    skill_names = [element.text for element in driver.find_elements(By.XPATH, xpath)]
    return ','.join(name for name in skill_names if name != '')

In [9]:
def scrape_current_page():
    """Scrape every job card on the current results page into the global lists.

    Waits up to 10 seconds for at least one ``cardContainer`` card to be
    present, collects the card ids with ``get_job_id_list()``, prints them,
    and appends one entry per job to ``job_title_list``, ``company_name_list``,
    ``position_type_list``, ``job_location_list``, ``experience_list`` and
    ``skills_list``.

    Any exception raised by the wait or by a field getter propagates to the
    caller. Jobs fully read before the failure stay in the lists.
    """
    card_locator = (By.XPATH, '//div[@class="srpResultCardContainer"]//div[@class="cardContainer"]')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(card_locator))

    jobs_id_list = get_job_id_list()
    print(jobs_id_list)

    for job_id in jobs_id_list:
        # Read every field before touching the result lists. If a getter
        # fails part-way through a job, no list receives a partial row, so
        # the six lists always stay the same length.
        job_title = get_job_title(job_id)
        company_name = get_company_name(job_id)
        position_type = get_position_type(job_id)
        job_location = get_job_location(job_id)
        experience = get_experience(job_id)
        skills = get_skills(job_id)

        job_title_list.append(job_title)
        company_name_list.append(company_name)
        position_type_list.append(position_type)
        job_location_list.append(job_location)
        experience_list.append(experience)
        skills_list.append(skills)

In [10]:
# Drive a Chrome session through a foundit.in search for "Data Engineer" jobs
# in India, scrape every results page into the module-level lists, and
# collect the lists into `dict_job`.
URL_FI = 'https://www.foundit.in'
JOBS_PER_PAGE = 15  # divisor used to turn the reported job count into a page count
WAIT_SECONDS = 10   # upper bound for the explicit wait below

job_title_list, company_name_list, position_type_list, job_location_list = [], [], [], []
experience_list = []
skills_list = []

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(URL_FI)
    driver.maximize_window()

    search_box = driver.find_element(By.XPATH, '//input[@class="input search-bar home_ac"]')
    search_box.send_keys('Data Engineer')

    location_box = driver.find_element(By.XPATH, '//input[@class="input location_ac"]')
    location_box.send_keys('India')

    # Experience filter is disabled for now:
    # experience_dropdown = Select(driver.find_element(By.XPATH, '//span[@class="multiselect__single modal-ref-class placeholderColor"]'))
    # experience_dropdown.select_by_visible_text('Fresher')

    submit_button = driver.find_element(By.XPATH, '//input[@value="Search"]')
    submit_button.click()

    # Reload the results URL before reading from the page (kept from the
    # original flow).
    driver.get(driver.current_url)

    job_count = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, '//p[@class="job-count"]'))
    ).text
    # Take the first numeric token. Thousands separators are stripped first,
    # since str.isdigit() alone would reject a token such as "1,234".
    job_counts = [
        int(token.replace(',', ''))
        for token in job_count.split()
        if token.replace(',', '').isdigit()
    ]
    if not job_counts:
        raise ValueError('Could not parse a job count from {!r}'.format(job_count))
    total_pages = math.ceil(job_counts[0] / JOBS_PER_PAGE)

    for page in range(total_pages):
        scrape_current_page()
        if page < total_pages - 1:
            arrow_right_button = driver.find_element(By.XPATH, '//div[@class="arrow arrow-right"]')
            arrow_right_button.click()
            # Reload only after moving on; there is nothing left to read
            # after the last page.
            driver.get(driver.current_url)
finally:
    # Always close the browser so a failed or finished run does not leave a
    # Chrome/chromedriver process behind.
    driver.quit()

dict_job = {
    'job_title': job_title_list,
    'company_name': company_name_list,
    'position_type': position_type_list,
    'job_location': job_location_list,
    'skills': skills_list,
    'experience': experience_list,
}

['20587228', '20587230', '20435330', '20500131', '20260661', '20089909', '20507968', '20220506', '20586498', '20260564', '20457423', '20541813', '20447108', '20499151', '20587229']
['20550422', '20500694', '20469479', '16303794', '20451126', '20506124', '20507968', '20373571', '17556879', '20468956', '20507161', '20376420', '20587184', '20500658', '20500656']
['20376420', '18502548', '18502996', '20470887', '20449741', '20470345', '20449757', '20547643', '20507161', '20451844', '20506098', '20469976', '20469941', '20469479', '20468956']
['20220947', '19329543', '19776446', '15295687', '20501032', '20048339', '20049775', '12963655', '20172966', '19786296', '20214426', '20210281', '20169203', '20048260', '20259561']
['20372030', '20229484', '20211697', '20211446', '20375546', '20177108', '20456977', '20228468', '20372040', '20181124', '20228932', '20304071', '20376095', '20225936', '20226244']
['20267696', '20228932', '20181119', '20056233', '20267353', '20181124', '20224854', '20374695'

['20543536', '20541955', '20542323', '18234705', '20510511', '20543761', '20455880', '18234729', '18234576', '20543307', '20505096', '20459046', '18234676', '20542531', '20543475']
['20376262', '18346004', '20470245', '18088639', '18088644', '20180455', '20210655', '20197102', '20173574', '20191160', '20180549', '20191232', '20180568', '20193886', '20469684']
['20171874', '20172115', '20177863', '20170585', '20179423', '20179384', '20175447', '20170510', '20170149', '20173574', '20179336', '20171060', '20170808', '20172129', '20172128']
['20219048', '20211656', '20217664', '20193886', '20179128', '20216553', '20190663', '20190116', '20190092', '20219060', '20215255', '20210655', '20197102', '20210537', '20219059']
['20189896', '20196620', '20190657', '20187733', '20191232', '20193886', '20183093', '20183094', '20185502', '20190116', '20191798', '20196732', '20196217', '20187750', '20185546']
['20196732', '20181138', '20187750', '20179423', '20219059', '20185546', '20179384', '20176441'

In [11]:
# Build the results table: one column per key of dict_job, one row per scraped job.
df = pd.DataFrame(data=dict_job)

In [12]:
# Write the scraped jobs to disk without the DataFrame index column.
df.to_csv('FI_job_data.csv', index=False)