In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urlparse

In [None]:
#function to extract title
def mobile_title(soup):
    """Return the product title found in a parsed product page.

    Looks up the first ``<span id="productTitle">`` via ``soup.find`` and
    returns its ``text`` with surrounding whitespace stripped.

    Returns an empty string when the lookup raises ``AttributeError``
    (for example, when ``find`` returns ``None`` because no such span exists).
    """
    try:
        title_node = soup.find("span", attrs={"id": "productTitle"})
        return title_node.text.strip()
    except AttributeError:
        return ""
# function to extract price
def moblie_price(soup):
    """Return the whole-number price text found in a parsed product page.

    Looks up the first ``<span class="a-price-whole">`` via ``soup.find`` and
    returns its ``text`` with surrounding whitespace stripped.

    Returns an empty string when the lookup raises ``AttributeError``
    (for example, when ``find`` returns ``None`` because no such span exists).
    """
    try:
        price_node = soup.find("span", attrs={"class": "a-price-whole"})
        return price_node.text.strip()
    except AttributeError:
        return ""

# function to extract number of reviews
def mobile_review(soup):
    """Return the review-count text found in a parsed product page.

    Looks up the first ``<span id="acrCustomerReviewText">`` via ``soup.find``
    and returns its ``text`` with surrounding whitespace stripped.

    Returns an empty string when the lookup raises ``AttributeError``
    (for example, when ``find`` returns ``None`` because no such span exists).
    """
    try:
        review_node = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        return review_node.text.strip()
    except AttributeError:
        return ""
# function to extract ratings
def mobile_rating(soup):
    """Return the rating text found in a parsed product page.

    Looks up the first ``<span class="a-icon-alt">`` via ``soup.find`` and
    returns its ``text`` with surrounding whitespace stripped.

    Returns an empty string when the lookup raises ``AttributeError``
    (for example, when ``find`` returns ``None`` because no such span exists).
    """
    try:
        rating_node = soup.find("span", attrs={"class": "a-icon-alt"})
        return rating_node.text.strip()
    except AttributeError:
        return ""

In [None]:

if __name__ == '__main__':
    # Scrape an Amazon search-results page, visit every product link found on
    # it, collect title / price / rating / review count for each product, and
    # write the rows that have a title to "mobile_data.csv".

    # Browser-like User-Agent and language preference sent with every request
    HEADERS = ({'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", 'accept-language': 'en-US,en;q=0.5'})

    # Seconds to wait for a response; without a timeout, requests waits
    # indefinitely and a stalled connection would hang the notebook.
    REQUEST_TIMEOUT = 30

    # Search-results page the product links are taken from
    url = "https://www.amazon.in/s?k=mobile+phone+under+30000&crid=1YRBMC82JUZMO&sprefix=mobile+phone%2Caps%2C228&ref=nb_sb_ss_pltr-mrr_6_12"

    # Fetch the search page; fail loudly on an HTTP error instead of silently
    # parsing an error page and ending up with an empty CSV.
    webpage = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    webpage.raise_for_status()

    # Parse the search page
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Anchor tags carrying the product-link CSS classes
    links = soup.find_all('a', attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

    # href of every anchor (None when an anchor has no href attribute)
    links_list = [anchor.get('href') for anchor in links]

    link_dict = {"title": [], "price": [], "rating": [], "reviews": []}

    for link in links_list:
        try:
            if link.startswith('/'):
                # Site-relative link: use as-is
                product_url = link
            else:
                # Absolute URL: keep only its path and query so the request
                # below is always issued against www.amazon.in
                parsed_url = urlparse(link)
                product_url = parsed_url.path + (('?' + parsed_url.query) if parsed_url.query else '')

            new_webpage = requests.get('https://www.amazon.in' + product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            # An HTTP error is reported by the handler below rather than
            # being parsed as if it were a product page.
            new_webpage.raise_for_status()

            # Parse the product page
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            # Build the complete row first; nothing is appended to link_dict
            # until every field has been extracted, so a failure part-way
            # through can never leave the columns with different lengths.
            row = {
                "title": mobile_title(new_soup),
                "price": moblie_price(new_soup),
                "rating": mobile_rating(new_soup),
                "reviews": mobile_review(new_soup),
            }

        except Exception as e:
            # Any failure (including a None href) yields an all-empty row,
            # which is filtered out below because its title is empty.
            row = dict.fromkeys(link_dict, "")
            print(f"Error processing link: {link}, error: {e}")

        # Exactly one value per column per link keeps all lists in sync
        for key, value in row.items():
            link_dict[key].append(value)

    # Create DataFrame
    mobile_df = pd.DataFrame.from_dict(link_dict)

    # Keep only rows that have a non-empty, non-missing title. A boolean mask
    # avoids both the chained `df[col].replace(..., inplace=True)` call (which
    # pandas 3.0 turns into a no-op) and the need for numpy, which this
    # notebook never imports.
    has_title = mobile_df['title'].notna() & (mobile_df['title'] != '')
    mobile_df = mobile_df[has_title].copy()

    # Save to CSV
    mobile_df.to_csv("mobile_data.csv", header=True, index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mobile_df['title'].replace('', np.nan, inplace=True)


In [None]:
# Show the mobile_df DataFrame as this cell's output
mobile_df

Unnamed: 0,title,price,rating,reviews
0,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 256GB ...",24999.0,4.2 out of 5 stars,"5,648 ratings"
1,"Vivo V30e 5G Smartphone (Velvet Red, 8GB RAM, ...",25999.0,4.3 out of 5 stars,34 ratings
2,"Vivo V40e 5G AI Smartphone (Royal Bronze, 8GB ...",28390.0,5.0 out of 5 stars,1 rating
3,"Vivo V40e 5G AI Smartphone (Royal Bronze, 8GB ...",26132.0,5.0 out of 5 stars,1 rating
4,"OPPO F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",29999.0,4.0 out of 5 stars,308 ratings
5,"OnePlus 11R 5G (Galactic Silver, 8GB RAM, 128G...",28499.0,4.4 out of 5 stars,"18,097 ratings"
6,"Motorola Edge 50 Fusion 5G (Hot Pink, 12GB RAM...",27130.0,4.1 out of 5 stars,94 ratings
7,"Nothing Phone (2a) 5G (Black, 8GB RAM, 256GB S...",22920.0,4.1 out of 5 stars,712 ratings
8,"realme GT 6T 5G (Fluid Silver,8GB RAM+128GB St...",29999.0,4.3 out of 5 stars,"2,324 ratings"
9,"Vivo V30e 5G Smartphone (Silk Blue, 8GB RAM, 2...",25280.0,4.5 out of 5 stars,6 ratings
