## Web Scraping Demo of Amazon & Nike data

In [1]:
## IF running for the 1st time then uncomment below code
# !pip install selenium
# !pip install msedge-selenium-tools
# !pip install bs4

In [2]:
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup

import csv
import pandas as pd

In [3]:

#We will be using functions to achieve this

def my_url(keyword):
    temp = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    keyword = keyword.replace(' ', '+')
    
    # Add Term Query To URL
    url = temp.format(keyword)
    
    # Add Page Query Placeholder
    url += '&page{}'
    
    return url

def extract_record(obj):
    atag = obj.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.com' + atag.get('href')
    
    #it is possible that some items on amazom.com might not be having one of the items we are looking for(e.g. some items might not be having ratings or price), we will be getting error if we dont take care of that. We will therefore add some error handlers
    #if there are no price,probably the item is out of stock or not available, then we will ignore the item, but if there are no reviews yet, it's fine, we will still want to extract the item.
    try:
        parent=obj.find('span','a-price')
        price=parent.find('span','a-offscreen').text
    except AttributeError: #we are excepting the error if it occurs so that we can move to extract the next item, else the program will stop running and gives error
        return
    
    try:
        rate=obj.i.text
        counts_review = obj.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        #assigning empty string to ratings and 
        rate = ''
        counts_review = ''
    
    image = obj.find('img', {'class': 's-image'}).get('src') 
    
    #let's create a tuple that will contain all these items and assign it to a result variable
    result = (description, price, rate, counts_review, url,image)
    return result

'''Run Main Program Routine'''
def main(keyword):
    # Startup The Webdriver
    driver = webdriver.Chrome()
    
    records = []  #an empty records list to contain all of our extracted records
    url =my_url(keyword)
    
    for page in range(1, 50):
        driver.get(url.format(page))
        soup =BeautifulSoup(driver.page_source, 'html.parser')
        results=soup.find_all('div',{'data-component-type':'s-search-result'})
#         results=soup.find_all('div',{'data-component-type': 's-search-result'}) #same as we did above

        
#we will like to check if what we have return from the extract_record function is empty or not
        for item in results:
            record = extract_record(item) 
            if record: #if the record has something in it append to records list
                records.append(record) 
                
        #driver.quit()
    
#     # Save Results To CSV File
        with open('Results.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Description', 'Price', 'Rating', 'Reviews Count', 'URL','Image link'])
            writer.writerows(records)


In [4]:
#we can search for any item to extract data
main('mobile')

In [5]:
df=pd.read_csv(r"E:\Data Science Project\Untitled Folder\Results.csv")
df

Unnamed: 0,Description,Price,Rating,Reviews Count,URL,Image link
0,Samsung Galaxy Note 20 Ultra 5G Factory Unlock...,$899.99,4.6 out of 5 stars,4983,https://www.amazon.com/Samsung-Electronics-Unl...,https://m.media-amazon.com/images/I/81AT+Flc+E...
1,Tiny Love Meadow Days Take Along Mobile,$24.99,4.5 out of 5 stars,14420,https://www.amazon.com/Tiny-Love-Meadow-Along-...,https://m.media-amazon.com/images/I/71M-ll7ojb...
2,"OnePlus 8 Glacial Green,​ 5G Unlocked Android ...",$393.15,4.5 out of 5 stars,1978,https://www.amazon.com/OnePlus-Glacial-Unlocke...,https://m.media-amazon.com/images/I/51uEwkqjZT...
3,The Peanutshell Safari Animals Musical Crib Mo...,$39.99,4.6 out of 5 stars,342,https://www.amazon.com/Peanutshell-Animals-Mus...,https://m.media-amazon.com/images/I/61p5u8T4UW...
4,"UNIH Baby Crib Mobile with Lights and Music, M...",$35.99,4.1 out of 5 stars,924,https://www.amazon.com/UNIH-Mobile-Lights-Proj...,https://m.media-amazon.com/images/I/71566r1i6D...
...,...,...,...,...,...,...
2231,Life and Thread Hanging Mobile - 50 Inches Pla...,$82.00,4.8 out of 5 stars,30,https://www.amazon.com/gp/slredirect/picassoRe...,https://m.media-amazon.com/images/I/713TKWRXmD...
2232,3 Swallow Hanging Mobile - 20 Inches - Handmad...,$35.50,4.4 out of 5 stars,425,https://www.amazon.com/gp/slredirect/picassoRe...,https://m.media-amazon.com/images/I/51GToOzSyK...
2233,"Baby Crib Mobile Wooden Wind Chime Bed Bell,Nu...",$27.98,4.7 out of 5 stars,758,https://www.amazon.com/gp/slredirect/picassoRe...,https://m.media-amazon.com/images/I/61p-7imMV8...
2234,Flensted Mobiles 5 Balloon Hanging Nursery Mob...,$55.25,4.6 out of 5 stars,170,https://www.amazon.com/gp/slredirect/picassoRe...,https://m.media-amazon.com/images/I/31Slyx+2Zg...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2236 entries, 0 to 2235
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Description    2236 non-null   object
 1   Price          2236 non-null   object
 2   Rating         2136 non-null   object
 3   Reviews Count  2136 non-null   object
 4   URL            2236 non-null   object
 5   Image link     2236 non-null   object
dtypes: object(6)
memory usage: 104.9+ KB


In [7]:
df.isnull().sum()

Description        0
Price              0
Rating           100
Reviews Count    100
URL                0
Image link         0
dtype: int64

In [8]:
df.isnull().mean()

Description      0.000000
Price            0.000000
Rating           0.044723
Reviews Count    0.044723
URL              0.000000
Image link       0.000000
dtype: float64

In [9]:
## ERROR
'''
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome(r"D:\Python Project\Selenium\chromedriver")
products=[] #List to store name of the product
prices=[] #List to store price of the product
Descriptions=[] #List to store rating of the product
driver.get("https://www.nike.com/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok")
content = driver.page_source
soup = BeautifulSoup(content)
for a in soup.find_all('a',href=True, attrs={'class':'product-card__link-overlay'}): #anchor type a
    name = a.find('div', attrs={'class':'product-card__title'})
    price=a.find('div', attrs={'class':'product-card__price'})
    products.append(name.text)
    prices.append(price.text)
    Descriptions=a.find('li', attrs={'class':'product-card__subtitle'}) 
    #ratings.append(rating.text) 
df = pd.DataFrame({'Product Name':products,'Price':prices, 'Description' :Descriptions}) 
#,'Rating':ratings
df.to_csv('scraping .csv', index=False, encoding='utf-8')
'''

'''
AttributeError                            Traceback (most recent call last)
<ipython-input-1-4cf00f6807fb> in <module>
     12     name = a.find('div', attrs={'class':'product-card__title'})
     13     price=a.find('div', attrs={'class':'product-card__price'})
---> 14     products.append(name.text)
     15     prices.append(price.text)
     16     Descriptions=a.find('li', attrs={'class':'product-card__subtitle'})

AttributeError: 'NoneType' object has no attribute 'text'
''';

In [10]:
## Correction

In [11]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome(r"D:\Python Project\Selenium\chromedriver")
product_list=[] #List to store name of the product
price_list=[] #List to store price of the product
#Descriptions=[] #List to store rating of the product
driver.get("https://www.nike.com/w/womens-lifestyle-shoes-13jrmz5e1x6zy7ok")
content = driver.page_source
soup = BeautifulSoup(content)
for products in soup.find_all('div', attrs={'class':'product-card__info'}):
    product_name = products.find('div', attrs={'class': 'product-card__title'}).text
    product_price = products.find('div', attrs={'class': 'product-price'}).text
    product_colour = products.find('div', attrs={'class': 'product-card__product-count'}).text
    product_list.append(product_name)
    price_list.append(product_price)
    # Descriptions=a.find('li', attrs={'class':'product-card__subtitle'})
    #ratings.append(rating.text) 
df_nike = pd.DataFrame({'Product Name':product_list,'Price':price_list}) 
#,'Rating':ratings
df_nike.to_csv('scraping .csv', index=False, encoding='utf-8')

In [12]:
df_nike

Unnamed: 0,Product Name,Price
0,Nike Air Force 1 '07,"₹8,195"
1,Nike Air Force 1 Shadow,"₹8,995"
2,Nike Air Force 1 '07 Next Nature,"₹8,195"
3,Nike Waffle One,"₹8,295"
4,Air Jordan 1 Mid,"₹8,995"
5,Air Jordan 1 Mid SE,"₹10,295"
6,Nike Air Max Koko Serena Design Crew,"₹9,295"
7,Jordan NOLA,"₹2,995"
8,Nike RYZ 365 2 Serena Design Crew,"₹8,295"
9,NikeCourt Legacy Serena Design Crew,"₹6,795"
