In [121]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Basics - reading a local html file 
This is to practice bs4 methods.

In [122]:
# Read the sample page from disk; the file handle is only needed for the read itself,
# so all parsing happens after the `with` block has closed it.
with open('home.html', 'r') as html_file:
    content = html_file.read()

soup = BeautifulSoup(content, 'lxml')

# Every course is rendered as a <div class="card">.
course_cards = soup.find_all('div', class_='card')

for course in course_cards:
    # Title comes from the card's <h5>; the price is the last
    # whitespace-separated token of the card's <a> text.
    course_name = course.h5.text
    link_tokens = course.a.text.split()
    course_price = link_tokens[-1]
    print(f'{course_name} costs {course_price}')


Python for beginners costs 20$
Python Web Development costs 50$
Python Machine Learning costs 100$


### Get the necessary part of HTML code from the website

In [123]:
# Download the Python job search results from the TimesJobs mobile site.
# `requests` applies no timeout by default, so one is set explicitly to keep the
# cell from hanging forever on an unresponsive server; raise_for_status() turns
# an HTTP error response into an exception instead of silently parsing an error page.
response = requests.get(
    'https://m.timesjobs.com/mobile/jobs-search-result.html?txtKeywords=python&cboWorkExp1=-1&txtLocation=',
    timeout=30,
)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')

# Collect every <li> on the page; the cells below skip the ones that do not
# carry the job fields they look for.
jobs = soup.find_all("li")

### Formatting for the needed data

Version 1 of the code implementation


In [127]:
# Version 1: collect the job fields into three parallel lists, then build a DataFrame.
posted_period = []
company_names = []
skills_list = []
for job in jobs:
    posting_times = job.find('span', class_='posting-time')
    company_name = job.find('span', class_='srp-comp-name')
    skills = [s.get_text(strip=True) for s in job.find_all('a', class_='srphglt')]

    # The three lists are zipped into columns by position, so they must grow in
    # lockstep. Appending each field under its own condition would let an <li>
    # that lacks one field shift the others out of alignment (or make the lists
    # different lengths, which pd.DataFrame rejects). Skip incomplete entries.
    if not (posting_times and company_name and skills):
        continue

    # NOTE(review): only the first two characters are parsed, so this assumes the
    # text starts with a number of at most two digits - confirm against the site.
    posted_period.append(int(posting_times.text[:2]))
    company_names.append(company_name.text)
    skills_list.append(skills)

df = pd.DataFrame({
    'Company': company_names,
    'Skills': skills_list,
    'Posted': posted_period
})

# Flatten each job's list of skills into one comma-separated string for display.
df['Skills'] = df['Skills'].apply(", ".join)
df.head()


Unnamed: 0,Company,Skills,Posted
0,LTIMindtree Ltd.,"Solidworks, Python Scripting",7
1,Trigyn Technologies Ltd,"python proficiency, web frameworks, api develo...",8
2,Analytics Vidhya,"python, css, html5, html, ajax, javascript",9
3,magnarus technologies private limited,"image processing, python, c, c programming, op...",9
4,WBC Software Lab,"python, git, rest api, linux, mobile, mongodb,...",12


### Version 2
(more optimized)

In [128]:
# Version 2: build one record (dict) per job and create the DataFrame in a single step.
data = []
for job in jobs:
    posting_time = job.find('span', class_='posting-time')
    company_name = job.find('span', class_='srp-comp-name')
    skills = job.find_all('a', class_='srphglt')

    # Guard clause: an <li> missing any of the three fields is not recorded.
    if not (posting_time and company_name and skills):
        continue

    skill_names = [s.get_text(strip=True) for s in skills]
    record = {
        'Company': company_name.text,
        'Skills': ', '.join(skill_names),
        # Only the first two characters of the posting text are converted.
        'Posted': int(posting_time.text[:2]),
    }
    data.append(record)

df = pd.DataFrame(data)
df

Unnamed: 0,Company,Skills,Posted
0,LTIMindtree Ltd.,"Solidworks, Python Scripting",7
1,Trigyn Technologies Ltd,"python proficiency, web frameworks, api develo...",8
2,Analytics Vidhya,"python, css, html5, html, ajax, javascript",9
3,magnarus technologies private limited,"image processing, python, c, c programming, op...",9
4,WBC Software Lab,"python, git, rest api, linux, mobile, mongodb,...",12
5,SYNECHRON,"python core programming, data manipulation lib...",19
6,INFINITY GROUP,"python, css, django, html, bootstrap",17
7,LTIMindtree Ltd.,"Python Developer, Microsoft Office",7
8,Parahit Technologies Limited,"python, django, json, html, ajax, javascript",2
9,SEVEN CONSULTANCY,"rest, python, oracle, sybase, bi, storage, jav...",18
