In [37]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Basics - reading a local html file 
This is to pratice bs4 methods

In [38]:
with open('home.html','r') as html_file:
    content = html_file.read()
    
    soup = BeautifulSoup(content, 'lxml')
    course_cards = soup.find_all('div', class_='card')
    for course in course_cards:
        course_name = course.h5.text
        course_price = course.a.text.split()[-1]
        
        print(f'{course_name} costs {course_price}')


Python for beginners costs 20$
Python Web Development costs 50$
Python Machine Learning costs 100$


### Get the necessary part of HTML code from the website

In [39]:
html_text = requests.get('https://m.timesjobs.com/mobile/jobs-search-result.html?txtKeywords=python&cboWorkExp1=-1&txtLocation=').text
soup = BeautifulSoup(html_text,'lxml')

jobs = soup.find_all("li")

### Formatting for the needed data

Version 1 of the code implementation


In [40]:
posted_period = []
company_names = []
skills_list = []
for job in jobs:
    posting_times = job.find('span',class_='posting-time')
    if posting_times:
        # print(int(posting_times.text[:2]))
        posted_period.append(int(posting_times.text[:2]))
    company_name = job.find('span',class_='srp-comp-name')
    if company_name:
        company_names.append(company_name.text)
    skills = [s.get_text(strip=True) for s in job.find_all('a',class_='srphglt')]
    if skills:    
        skills_list.append(skills)
        
        
# print(len(posted_period))
# print(len(company_names))
# print(len(skills_list))
    

# for company, skills, period in zip(company_names, skills_list, posted_period):
#     if period < 10:
#         print(f"Company: {company}")
#         print(f"Skills: {', '.join(skills)}")
#         print(f"Posted: {period}")
#         print("-" * 30)


df = pd.DataFrame({
    'Company': company_names,
    'Skills': skills_list,
    'Posted': posted_period
})

df['Skills'] = df['Skills'].apply(lambda x: ", ".join(x))
df.head()


ValueError: All arrays must be of the same length

### Version 2
(more optimized)

In [41]:
data = []
for job in jobs:
    posting_time = job.find('span', class_='posting-time')
    company_name = job.find('span', class_='srp-comp-name')
    skills = job.find_all('a', class_='srphglt')
    
    if posting_time and company_name and skills:
        data.append({
            'Company': company_name.text,
            'Skills': ', '.join(s.get_text(strip=True) for s in skills),
            'Posted': int(posting_time.text[:2])
        })

df = pd.DataFrame(data)
df

Unnamed: 0,Company,Skills,Posted
0,Bright Vision Technologies,"python programming, software development, data...",24
1,Bright Vision Technologies,"python programming, software development, data...",24
2,Bright Vision Technologies,"python programming, software development, data...",24
3,Bright Vision Technologies,"python programming, software development, data...",24
4,Drawsera,"css, ajax, jquery, sql, database, postgresql, ...",1
5,CONNECTING 2 WORK,"python, css, github, hosting, bug fixing, boot...",1
6,cloudxlab,"css, html5, jquery, git, linux, html, mysql, r...",1
7,CONNECTING 2 WORK,"python, storage, django, testing tools, debugging",1
8,FINITE HR CONSUTING PVT LTD,"python, github, mobile, problem solving, team ...",1
9,CONNECTING 2 WORK,"python, css, bug fixing, bootstrap, javascript...",1


In [42]:
url = "https://www.amazon.in/s?k=mobile+phone+under+20000&crid=3MZE3X15NFGK6&sprefix=mobile%2Caps%2C345&ref=nb_sb_ss_mvt-t11-ranker_ci_tech-br-left_1_6"

In [43]:
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.amazon.in/",
    "DNT": "1",
    "Connection": "keep-alive",
}

In [44]:
webpage = requests.get(url,headers=header)

In [45]:
print(webpage)

<Response [200]>


In [55]:
# Fetch links as List of Tag Objects
links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
links

[<a aria-hidden="true" class="a-link-normal s-no-outline" href="/sspa/click?ie=UTF8&amp;spc=MTo2MjU0MzY2MzQwMDYzODMzOjE3NTg1MzY2NzY6c3BfYXRmOjMwMDYyMjQzOTM5MDQzMjo6MDo6&amp;url=%2FSamsung-Storage-Enhanced-Unmatched-Nightography%2Fdp%2FB0FDB9ZCTD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3MZE3X15NFGK6%26dib%3DeyJ2IjoiMSJ9.fu4hhSmr_5olZzv3SuUGqFoMjaADfUrImbvjBBZrW_q507YHL5NaMHGmphfmsz6PAkwTLGbWHnilhogbuni3cgYd6jIT2mVn2ZmbZoOPFlVPJoBOTUfcnSUqIsohFcgZ5cPsUJ0snqRHEheykfuz0z2fSqx0eDEdUbjrCLFZSa98zMpTUVmV2jHFlZUMmjNxzRJD0vu1XRHHRxBLVBaJyE5FzAdYHhh5nKNyqu5lYSo.FfLDGU3opv0NzQyqNVElL2DAF8m_b6xCGsabqPw-ue0%26dib_tag%3Dse%26keywords%3Dmobile%2Bphone%2Bunder%2B20000%26qid%3D1758536676%26sprefix%3Dmobile%252Caps%252C345%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1&amp;aref=BBnDjwayYG&amp;sp_cr=ZAZ" tabindex="-1" target="_blank"><div class="a-section aok-relative s-image-fixed-height"><img alt="Sponsored Ad - Samsung Galaxy M36 5G (Velvet Black, 8 GB RAM, 128 GB Storage)| Circle to Search| Google

In [51]:
soup = BeautifulSoup(webpage.content,'html')

In [56]:
the_list  = soup.find_all('div',class_="puisg-col-inner")
the_list

[<div class="puisg-col-inner"><div class="a-section a-spacing-none aok-relative puis-status-badge-container s-list-status-badge-container"></div><div class="s-product-image-container aok-relative s-text-center s-image-overlay-grey puis-image-overlay-grey s-padding-left-small s-padding-right-small puis-flex-expand-height puis puis-vkh8i5perkddu2r7ogc2c0qfsb" data-cy="image-container"><div class="aok-relative"><span class="rush-component" data-component-type="s-product-image" data-render-id="r8m07mapfrofw2p2ozwmgd4u7n" data-version-id="vkh8i5perkddu2r7ogc2c0qfsb"><a aria-hidden="true" class="a-link-normal s-no-outline" href="/sspa/click?ie=UTF8&amp;spc=MTo2MjU0MzY2MzQwMDYzODMzOjE3NTg1MzY2NzY6c3BfYXRmOjMwMDYyMjQzOTM5MDQzMjo6MDo6&amp;url=%2FSamsung-Storage-Enhanced-Unmatched-Nightography%2Fdp%2FB0FDB9ZCTD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3MZE3X15NFGK6%26dib%3DeyJ2IjoiMSJ9.fu4hhSmr_5olZzv3SuUGqFoMjaADfUrImbvjBBZrW_q507YHL5NaMHGmphfmsz6PAkwTLGbWHnilhogbuni3cgYd6jIT2mVn2ZmbZoOPFlVPJoBOTUfcnSUqIso

In [None]:
for i in the_list:
    title = i.find('h2',class_="a-size-medium a-spacing-none a-color-base a-text-normal")
    prize = i.find('span',class_="a-price-whole")
    rating = i.find('span',class_="a-size-small a-color-base")
    reviews = i.find("a", class_="a-link-normal s-underline-text s-underline-link-text s-link-style")

None
<h2 aria-label="Sponsored Ad - Samsung Galaxy M36 5G (Velvet Black, 8 GB RAM, 128 GB Storage)| Circle to Search| Google Gemini| Corning Gorilla Glass Victus+| 7.7mm Slim| AI Enhanced 50 MP OIS Triple Camera| Unmatched Nightography" class="a-size-medium a-spacing-none a-color-base a-text-normal"><span>Samsung Galaxy M36 5G (Velvet Black, 8 GB RAM, 128 GB Storage)| Circle to Search| Google Gemini| Corning Gorilla Glass Victus+| 7.7mm Slim| AI Enhanced 50 MP OIS Triple Camera| Unmatched Nightography</span></h2>
None
None
None
<h2 aria-label="Sponsored Ad - iQOO Z10R 5G (Aquamarine, 8GB RAM, 128GB Storage) | 32MP 4K Selfie Camera | Quad-Curved AMOLED Display | Dimensity 7400 Processor with 750K+ AnTuTu" class="a-size-medium a-spacing-none a-color-base a-text-normal"><span>iQOO Z10R 5G (Aquamarine, 8GB RAM, 128GB Storage) | 32MP 4K Selfie Camera | Quad-Curved AMOLED Display | Dimensity 7400 Processor with 750K+ AnTuTu</span></h2>
None
None
None
<h2 aria-label="OnePlus Nord CE4 Lite 5G 

In [65]:
type(title)

NoneType

In [66]:
print(f'''
Title = {len(title)}      
Prize = {len(prize)}      
Rating = {len(rating)}      
Reviews = {len(reviews)}      
      ''')

TypeError: object of type 'NoneType' has no len()

In [59]:
# Example: flatten each scraped list
titles = [t.strip() for t in title]           # remove leading/trailing spaces
prices = [p.replace(',', '').strip() for p in prize]  # remove commas if needed
ratings = [r.strip() for r in rating]
reviews = [rev.strip() for rev in reviews]

TypeError: 'NoneType' object is not iterable

In [49]:
# Create dataframe using a dictionary
df = pd.DataFrame({
    'Title': pd.Series(title),
    'Price': pd.Series(prize),
    'Rating': pd.Series(rating),
    'Reviews': pd.Series(reviews)
})

print(df)

                                                Title     Price  Rating  \
0   [[Samsung Galaxy M36 5G (Velvet Black, 8 GB RA...  [15,499]   [4.0]   
1   [[iQOO Z10R 5G (Aquamarine, 8GB RAM, 128GB Sto...  [19,498]   [4.0]   
2   [[OnePlus Nord CE4 Lite 5G (Super Silver, 8GB ...  [16,749]   [4.0]   
3   [[Samsung Galaxy M35 5G (DayBreak Blue,8GB RAM...  [18,499]   [4.0]   
4   [[realme 14T 5G Smartphone 8GB RAM 128GB ROM 6...  [17,490]   [4.0]   
5   [[Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 ...  [17,499]   [4.0]   
6   [[Motorola G45 5G (Viva Magenta, 8GB RAM, 128G...  [19,498]   [4.0]   
7   [[Redmi 13 5G Prime Edition, Hawaiian Blue, 8G...  [21,998]   [4.0]   
8   [[realme C75 5G (Lily White, 6GB RAM, 128GB St...  [18,998]   [4.0]   
9   [[Redmi 13 5G Prime Edition, Orchid Pink, 8GB+...  [24,998]   [4.0]   
10  [[iQOO Z10 5G (Glacier Silver, 8GB RAM, 256GB ...   [6,249]   [4.0]   
11  [[Lava Agni 3 5G (Pristine Glass, 8GB+128GB) |...  [11,890]   [4.0]   
12  [[Samsung Galaxy M06 

In [50]:
df

Unnamed: 0,Title,Price,Rating,Reviews
0,"[[Samsung Galaxy M36 5G (Velvet Black, 8 GB RA...","[15,499]",[4.0],"[[Let us know], ]"
1,"[[iQOO Z10R 5G (Aquamarine, 8GB RAM, 128GB Sto...","[19,498]",[4.0],"[[1,427], ]"
2,"[[OnePlus Nord CE4 Lite 5G (Super Silver, 8GB ...","[16,749]",[4.0],"[[Let us know], ]"
3,"[[Samsung Galaxy M35 5G (DayBreak Blue,8GB RAM...","[18,499]",[4.0],"[[1,007], ]"
4,[[realme 14T 5G Smartphone 8GB RAM 128GB ROM 6...,"[17,490]",[4.0],"[[9,494], ]"
5,"[[Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 ...","[17,499]",[4.0],"[[14,143], ]"
6,"[[Motorola G45 5G (Viva Magenta, 8GB RAM, 128G...","[19,498]",[4.0],"[[1,427], ]"
7,"[[Redmi 13 5G Prime Edition, Hawaiian Blue, 8G...","[21,998]",[4.0],"[[1,007], ]"
8,"[[realme C75 5G (Lily White, 6GB RAM, 128GB St...","[18,998]",[4.0],"[[357], ]"
9,"[[Redmi 13 5G Prime Edition, Orchid Pink, 8GB+...","[24,998]",[4.0],"[[1,264], ]"


In [68]:
df.describe

<bound method NDFrame.describe of                                                 Title     Price  Rating  \
0   [[Samsung Galaxy M36 5G (Velvet Black, 8 GB RA...  [15,499]   [4.0]   
1   [[iQOO Z10R 5G (Aquamarine, 8GB RAM, 128GB Sto...  [19,498]   [4.0]   
2   [[OnePlus Nord CE4 Lite 5G (Super Silver, 8GB ...  [16,749]   [4.0]   
3   [[Samsung Galaxy M35 5G (DayBreak Blue,8GB RAM...  [18,499]   [4.0]   
4   [[realme 14T 5G Smartphone 8GB RAM 128GB ROM 6...  [17,490]   [4.0]   
5   [[Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 ...  [17,499]   [4.0]   
6   [[Motorola G45 5G (Viva Magenta, 8GB RAM, 128G...  [19,498]   [4.0]   
7   [[Redmi 13 5G Prime Edition, Hawaiian Blue, 8G...  [21,998]   [4.0]   
8   [[realme C75 5G (Lily White, 6GB RAM, 128GB St...  [18,998]   [4.0]   
9   [[Redmi 13 5G Prime Edition, Orchid Pink, 8GB+...  [24,998]   [4.0]   
10  [[iQOO Z10 5G (Glacier Silver, 8GB RAM, 256GB ...   [6,249]   [4.0]   
11  [[Lava Agni 3 5G (Pristine Glass, 8GB+128GB) |...  [11,890]   

In [None]:
data = {
    'Title': [],
    'Prize': [],
    'Rating': [],
    'Reviews': []
}


In [None]:
heading_list = [h.get_text(strip=True) for h in title]
price_list = [p.get_text(strip=True) for p in prize]
rating_list = [r.get_text(strip=True) for r in rating]
review_list = [r.get_text(strip=True) for r in reviews]


In [None]:
data['Title'] = heading_list
data['Prize'] = price_list
data['Rating'] = rating_list
data['Reviews'] = review_list