In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

from IPython.display import clear_output

In [2]:
# Cleaning Functions
def clean_tag(ptag):
    return ptag.get('onclick').replace("window.location.href=", "")
def clean_price(price):
    return price.get('value').replace("Add 1 for" ,"")

# Extracting price and product info from ONE page
def page_extractor(url):
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'html.parser')
    
    prod_info = soup.find_all('div', class_="product-info")
    price_info = soup.find_all('input', class_="btn", type='button')
    
    products = [prod.find('a').text for prod in prod_info]
    prices = [clean_price(price) for price in price_info]
    price_item_tags = [clean_tag(ptag) for ptag in price_info]
        
    category = url.replace("https://www.ajijo.com.au/collections/", "")
    category_feature = [category] * len(products)

    data = pd.DataFrame({'name' : products,
                         'price_correspondance' : price_item_tags,
                         'collection' : category_feature,
                         'price' : prices})
    
    return data

# Extracting price and product info from ALL pages
def website_extractor():

    url = "https://www.ajijo.com.au/collections/basmati-rice"
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'html.parser')

    # Get all product catgeories from Basmati Rice webpage
    all_collections = [ele.get('href') for ele in soup.find_all('a', href=True) if ("/collections/" in ele.get('href')) & ("/products/" not in ele.get('href'))]
    all_collections = np.unique(all_collections)
    
    # Compile urls
    all_urls = ["https://www.ajijo.com.au" + ele for ele in all_collections]
    
    # Extract data from each url
    all_data = pd.DataFrame() # Initialize Empty DataFrame
    for i, url in enumerate(all_urls): 
        
        clear_output(wait=True)
        
        page_data = page_extractor(url)    
        all_data = pd.concat((all_data, page_data))
        
        print("Product and Price Data Compilation Progress : {} %".format(np.round((i+1)/len(all_urls)*100 ,2)))
        
        
    all_data.reset_index(drop=True, inplace=True)

    return all_data

# Extract inventory Data
def extract_inventory_data(data):
    
    url_base = "https://www.ajijo.com.au/collections/"

    data['item_url'] = url_base + data['collection'] + data['price_correspondance']
    data['item_url'] = data['item_url'].str.replace("'","")

    bad_url_condition = data.item_url.str.contains("page")
    data.loc[bad_url_condition, 'item_url'] = data.loc[bad_url_condition, 'item_url'].str.replace("\?page=2", "")
    data.loc[bad_url_condition, 'item_url'] = data.loc[bad_url_condition, 'item_url'].str.replace("\?page=3", "")

    inventory = []
    url_list = []
    urls = data['item_url']

    for i, url in enumerate(urls):
        
        clear_output(wait=True)
        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            inventory_info = soup.find_all('div', class_="inventory")[0].text

            inventory.append(inventory_info)
            url_list.append(url)
            
        except:
            inventory.append("Error")
            url_list.append(url)            

        print("Inventory Data Compilation Progress : {} %".format(np.round(((i+1)/len(data)*100) ,2)))

    inventory_data = pd.DataFrame({'inventory' : inventory,
                                    'item_url' : url_list})

    inventory_data = inventory_data[~inventory_data.duplicated()]

    return inventory_data

# Clean Inventory Data
def clean_inventory_data(inventory_data):

        case_1 = inventory_data['inventory'].str.count("\n") == 9 
        case_2 = (inventory_data['inventory'].str.count("\n") == 5) & (inventory_data['inventory'].str.contains("available!"))
        case_3 = inventory_data['inventory'].str.contains("out of stock")

        inventory_data.loc[case_1, 'inventory'] = inventory_data.loc[case_1, 'inventory'].apply(lambda x : x.replace("\n","").split()[3])
        inventory_data.loc[case_2, 'inventory'] = -99 # Available but unknown
        inventory_data.loc[case_3, 'inventory'] = -199 # Out of Stock

        inventory_data.inventory = inventory_data.inventory.astype(int)

        return inventory_data
    
    
def compile_dataset():
    
    price_product_data = website_extractor()
    inventory_data = extract_inventory_data(price_product_data)
    
    inventory_data = clean_inventory_data(inventory_data)
    
    dataset = pd.merge(price_product_data, inventory_data, on='item_url')
    dataset = dataset[['name', 'inventory', 'price', 'price_correspondance', 'collection', 'item_url']]
    
    return dataset

ajijo_dataset = compile_dataset()

Inventory Data Compilation Progress : 100.0 %


In [14]:
ajijo_dataset

Unnamed: 0,name,inventory,price,price_correspondance,collection,item_url
0,"HALEEM MIX, AHMED",-99,$4.75,'/products/haleem-mix-ahmed',ahmed,https://www.ajijo.com.au/collections/ahmed/pro...


In [None]:
inventory_data

In [None]:
ajijo_dataset.to_csv("Final_dataset.csv", index=False)

In [9]:
url = "https://www.ajijo.com.au/collections/basmati-rice"

page = requests.get(url)
page

<Response [200]>

In [5]:
page.content



In [8]:
soup = BeautifulSoup(page.content, 'html.parser')


In [10]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if IE 9 ]><html class="ie9 no-js" lang="en"> <![endif]-->
<!--[if (gt IE 9)|!(IE)]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <title>
   Basmati Rice – AJIJO
  </title>
  <meta content="website" property="og:type"/>
  <meta content="Basmati Rice" property="og:title"/>
  <meta content="https://www.ajijo.com.au/collections/basmati-rice" property="og:url"/>
  <meta content="http://cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/logo.png?v=15177635785745443535" property="og:image"/>
  <meta content="https://cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/logo.png?v=15177635785745443535" property="og:image:secure_url"/>
  <meta content="AJIJO" property="og:site_name"/>
  <meta content="summary" name="twitter:card"/>
  <link href="https://www.ajijo.com.au/collections/basmati-rice" rel="canonical"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="#f45b4f" name="theme-color"/>
 

In [11]:
list(soup.children)

['html',
 '\n',
 '[if IE 9 ]><html class="ie9 no-js" lang="en"> <![endif]',
 '\n',
 '[if (gt IE 9)|!(IE)]><!',
 ' ',
 <html class="no-js" lang="en"> <!--<![endif]-->
 <head>
 <meta charset="utf-8"/>
 <title>
     Basmati Rice – AJIJO
   </title>
 <meta content="website" property="og:type"/>
 <meta content="Basmati Rice" property="og:title"/>
 <meta content="https://www.ajijo.com.au/collections/basmati-rice" property="og:url"/>
 <meta content="http://cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/logo.png?v=15177635785745443535" property="og:image"/>
 <meta content="https://cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/logo.png?v=15177635785745443535" property="og:image:secure_url"/>
 <meta content="AJIJO" property="og:site_name"/>
 <meta content="summary" name="twitter:card"/>
 <link href="https://www.ajijo.com.au/collections/basmati-rice" rel="canonical"/>
 <meta content="width=device-width,initial-scale=1" name="viewport"/>
 <meta content="#f45b4f" name="theme-color"/>
 <l

In [12]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype,
 bs4.element.NavigableString,
 bs4.element.Comment,
 bs4.element.NavigableString,
 bs4.element.Comment,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [22]:
html = list(soup.children)[6]
list(html.children)
# [type(item) for item in list(html.children)]

[' ',
 '<![endif]',
 '\n',
 <head>
 <meta charset="utf-8"/>
 <title>
     Basmati Rice – AJIJO
   </title>
 <meta content="website" property="og:type"/>
 <meta content="Basmati Rice" property="og:title"/>
 <meta content="https://www.ajijo.com.au/collections/basmati-rice" property="og:url"/>
 <meta content="http://cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/logo.png?v=15177635785745443535" property="og:image"/>
 <meta content="https://cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/logo.png?v=15177635785745443535" property="og:image:secure_url"/>
 <meta content="AJIJO" property="og:site_name"/>
 <meta content="summary" name="twitter:card"/>
 <link href="https://www.ajijo.com.au/collections/basmati-rice" rel="canonical"/>
 <meta content="width=device-width,initial-scale=1" name="viewport"/>
 <meta content="#f45b4f" name="theme-color"/>
 <link href="//cdn.shopify.com/s/files/1/0143/0306/8224/t/2/assets/bootstrap.min.css?v=10240604753461107007" media="all" rel="stylesheet" type

In [23]:
body = list(html.children)[3]

In [40]:
# Items
soup.find_all('div', class_="product-info")

<div class="product-info clearfix">
<a href="/collections/basmati-rice/products/devaaya-basmati-rice-10-kg">BASMATI RICE, DEVAAYA 10 KG</a>
<div class="product-per-unit">
</div>
<div class="product-expiry">
</div>
<div class="products-box">
<div class="action">
<form action="/cart/add" class="variants product-actions-4621548224605" enctype="multipart/form-data" method="post" style="padding:0px;">
<input class="btn add-to-cart-btn" disabled="disabled" type="submit" value="Unavailable"/>
</form>
</div>
<div class="button-list">
<a class="ajax_add_to_cart_button cart-button product-btn btn btn-default active" data-productid="4621548224605" href="#">
<span class="tooltip">Add to Cart</span>
<span>Add to Cart</span>
</a>
<a class="wishlist wishlist-btn product-btn" href="/account/login" title="Add To Wishlist">
<span class="tooltip">Add To Wishlist</span>
<span class="lnr lnr-heart"></span>
</a>
</div>
<div class="list_info hidden">
         .
      </div>
</div>
</div>

In [43]:
product = soup.find_all('div', class_="product-info")[0]
product.find('a').text

'BASMATI RICE, DEVAAYA 10 KG'

In [45]:
for product in soup.find_all('div', class_="product-info"):
    print(product.find('a').text)

BASMATI RICE, DEVAAYA 10 KG
Basmati Rice, Feast Rozzana, India Gate, 5kg
SMALL SHELLS PASTA, SAN REMO, 500 G
INSTANT SPAGHETTI, SAN REMO, 500 G
PENNE PASTA ( FAMILY PACK), SAN REMO, 750 G
MACARONI , SAN REMO, 500 G
ELBOW PASTA, SAN REMO, 500 G
ANGEL HAIR SPAGHETTI, SAN REMO, 500 G
IDLI/DOSA RICE, KOTTACKAL MAYIL, 5 KG


In [51]:
# Prices
soup.find_all('input', class_="btn", type='button')

[<input class="btn" onclick="window.location.href='/products/devaaya-basmati-rice-10-kg'" type="button" value="Add 1 for $27.99"/>,
 <input class="btn" onclick="window.location.href='/products/basmati-rice-india-gate-everyday-5kg'" type="button" value="Add 1 for $14.50"/>,
 <input class="btn" onclick="window.location.href='/products/small-shells-pasta-san-remo-500-g'" type="button" value="Add 1 for $3.50"/>,
 <input class="btn" onclick="window.location.href='/products/instant-spaghetti-san-remo-500-g'" type="button" value="Add 1 for $3.50"/>,
 <input class="btn" onclick="window.location.href='/products/penne-pasta-family-pack-san-remo-750-g'" type="button" value="Add 1 for $4.50"/>,
 <input class="btn" onclick="window.location.href='/products/macaroni-san-remo-500-g'" type="button" value="Add 1 for $3.50"/>,
 <input class="btn" onclick="window.location.href='/products/elbow-pasta-san-remo-500-g'" type="button" value="Add 1 for $3.50"/>,
 <input class="btn" onclick="window.location.href

In [69]:
product_price = soup.find_all('input', class_="btn", type='button')[0]
product_price.get('onclick')
product_price.get('value')

'Add 1 for $27.99'

In [73]:
for price_info in soup.find_all('input', class_="btn", type='button'):
    print(price_info.get('onclick').replace("window.location.href=", ""), price_info.get('value').replace("Add 1 for" ,""))

'/products/devaaya-basmati-rice-10-kg'  $27.99
'/products/basmati-rice-india-gate-everyday-5kg'  $14.50
'/products/small-shells-pasta-san-remo-500-g'  $3.50
'/products/instant-spaghetti-san-remo-500-g'  $3.50
'/products/penne-pasta-family-pack-san-remo-750-g'  $4.50
'/products/macaroni-san-remo-500-g'  $3.50
'/products/elbow-pasta-san-remo-500-g'  $3.50
'/products/angel-hair-spaghetti-san-remo-500-g'  $3.50
'/products/idli-dosa-rice-kottackal-mayil-5-kg'  $12.50


In [113]:
# Test with a few random urls
test_urls = ["https://www.ajijo.com.au/collections/basmati-rice",
             "https://www.ajijo.com.au/collections/biscuits-cookies-arnotts",
             "https://www.ajijo.com.au/collections/banana-chips", 
             "https://www.ajijo.com.au/collections/biscuits-cookies-others",
             "https://www.ajijo.com.au/collections/century"]

data_0 = page_extractor(test_urls[0])

for url in test_urls[1:]:
    
    data = page_extractor(url)
    
    data_0 = pd.concat((data_0, data))
    
data_0.reset_index(drop=True)

Unnamed: 0,name,price_correspondance,collection,price
0,"BASMATI RICE, DEVAAYA 10 KG",'/products/devaaya-basmati-rice-10-kg',basmati-rice,$27.99
1,"Basmati Rice, Feast Rozzana, India Gate, 5kg",'/products/basmati-rice-india-gate-everyday-5kg',basmati-rice,$14.50
2,"SMALL SHELLS PASTA, SAN REMO, 500 G",'/products/small-shells-pasta-san-remo-500-g',basmati-rice,$3.50
3,"INSTANT SPAGHETTI, SAN REMO, 500 G",'/products/instant-spaghetti-san-remo-500-g',basmati-rice,$3.50
4,"PENNE PASTA ( FAMILY PACK), SAN REMO, 750 G",'/products/penne-pasta-family-pack-san-remo-75...,basmati-rice,$4.50
5,"MACARONI , SAN REMO, 500 G",'/products/macaroni-san-remo-500-g',basmati-rice,$3.50
6,"ELBOW PASTA, SAN REMO, 500 G",'/products/elbow-pasta-san-remo-500-g',basmati-rice,$3.50
7,"ANGEL HAIR SPAGHETTI, SAN REMO, 500 G",'/products/angel-hair-spaghetti-san-remo-500-g',basmati-rice,$3.50
8,"IDLI/DOSA RICE, KOTTACKAL MAYIL, 5 KG",'/products/idli-dosa-rice-kottackal-mayil-5-kg',basmati-rice,$12.50
9,ARNOTT'S TIM TAM ORIGINAL 200G,'/products/arnotts-timtam-original-200grams',biscuits-cookies-arnotts,$3.70


In [114]:
soup.find_all('a', href=True)[1].get('href')

'/collections/grains-pasta-rice'

In [126]:
all_collections = [ele.get('href') for ele in soup.find_all('a', href=True) if ("/collections/" in ele.get('href')) & ("/products/" not in ele.get('href'))]

all_collections = np.unique(all_collections)

all_urls = ["https://www.ajijo.com.au" + ele for ele in all_collections]

In [127]:
data_0 = page_extractor(all_urls[0])

for url in all_urls[1:]:
    
    data = page_extractor(url)
    
    data_0 = pd.concat((data_0, data))
    
data_0.reset_index(drop=True)

Unnamed: 0,name,price_correspondance,collection,price
0,"HALEEM MIX, AHMED",'/products/haleem-mix-ahmed',ahmed,$4.75
1,"FRUIT CHAT MASALA, AHMED, 50G",'/products/fruit-chat-masala-ahmed',ahmed,$1.80
2,"DAHI BARA MASALA, AHMED, 50G",'/products/dahi-bara-masala-ahmed',ahmed,$1.80
3,"CHICKEN MASALA, AHMED, 50G",'/products/chicken-masala-ahmed',ahmed,$1.80
4,"CHANA MASALA, AHMED, 50G",'/products/chana-masala-ahmed',ahmed,$1.80
...,...,...,...,...
1091,YOGHURT 1KG SHREE GANESHA,'/products/yoghurt-1kg-shree-ganesha',yoghurt,$5.00
1092,SHARMA KITCHEN YOGHURT 2KG,'/products/sharmas-yoghurt-2kg',yoghurt,$9.30
1093,SHARMA KITCHEN YOGHURT 1KG,'/products/sharmas-yoghurt-1kg',yoghurt,$6.00
1094,NATURAL POT SET YOGHURT FU 200G,'/products/natural-pot-set-yoghurt-200g',yoghurt,$2.30


In [130]:
len(all_urls)

# data_0.to_csv("Products and Prices (Ajijo)", index=False)

In [132]:
data_0.duplicated().sum()

0

In [34]:
url = "https://www.ajijo.com.au/collections/basmati-rice/products/devaaya-basmati-rice-10-kg"

page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

In [31]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if IE 9 ]><html class="ie9 no-js" lang="en"> <![endif]-->
<!--[if (gt IE 9)|!(IE)]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <title>
   BASMATI RICE, DEVAAYA 10 KG – AJIJO
  </title>
  <meta content="product" property="og:type"/>
  <meta content="BASMATI RICE, DEVAAYA 10 KG" property="og:title"/>
  <meta content="https://www.ajijo.com.au/products/devaaya-basmati-rice-10-kg" property="og:url"/>
  <meta content="http://cdn.shopify.com/s/files/1/0143/0306/8224/products/devaayabasmatirice10kg3_grande.jpg?v=1587996871" property="og:image"/>
  <meta content="https://cdn.shopify.com/s/files/1/0143/0306/8224/products/devaayabasmatirice10kg3_grande.jpg?v=1587996871" property="og:image:secure_url"/>
  <meta content="http://cdn.shopify.com/s/files/1/0143/0306/8224/products/devaayabasmatirice10kg2_grande.jpg?v=1587996908" property="og:image"/>
  <meta content="https://cdn.shopify.com/s/files/1/0143/0306/8224/products/devaayabasm

In [43]:
soup.find_all('div', class_="inventory")

[<div class="inventory variants-product">
 <label>Inventory policy :</label>
 <span id="variant-inventory">
 <p>This product is out of stock.</p>
 </span>
 </div>]

In [45]:
url = "https://www.ajijo.com.au/collections/aluminium-foils-wraps/products/aluminium-foil-10m-b-g"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [52]:
soup.find_all('div', class_="inventory")[0].text

'\nInventory policy :\n\n                \n                5\n                \n                 \n              This product is available!\n\n'

In [4]:
# Get item url
all_data = pd.read_csv("Products and Prices (Ajijo).csv")
url_base = "https://www.ajijo.com.au/collections/"

all_data['item_url'] = url_base + all_data['collection'] + all_data['price_correspondance']
all_data['item_url'] = all_data['item_url'].str.replace("'","")

all_data

Unnamed: 0,name,price_correspondance,collection,price,item_url
0,"HALEEM MIX, AHMED",'/products/haleem-mix-ahmed',ahmed,$4.75,https://www.ajijo.com.au/collections/ahmed/pro...
1,"FRUIT CHAT MASALA, AHMED, 50G",'/products/fruit-chat-masala-ahmed',ahmed,$1.80,https://www.ajijo.com.au/collections/ahmed/pro...
2,"DAHI BARA MASALA, AHMED, 50G",'/products/dahi-bara-masala-ahmed',ahmed,$1.80,https://www.ajijo.com.au/collections/ahmed/pro...
3,"CHICKEN MASALA, AHMED, 50G",'/products/chicken-masala-ahmed',ahmed,$1.80,https://www.ajijo.com.au/collections/ahmed/pro...
4,"CHANA MASALA, AHMED, 50G",'/products/chana-masala-ahmed',ahmed,$1.80,https://www.ajijo.com.au/collections/ahmed/pro...
...,...,...,...,...,...
1091,YOGHURT 1KG SHREE GANESHA,'/products/yoghurt-1kg-shree-ganesha',yoghurt,$5.00,https://www.ajijo.com.au/collections/yoghurt/p...
1092,SHARMA KITCHEN YOGHURT 2KG,'/products/sharmas-yoghurt-2kg',yoghurt,$9.30,https://www.ajijo.com.au/collections/yoghurt/p...
1093,SHARMA KITCHEN YOGHURT 1KG,'/products/sharmas-yoghurt-1kg',yoghurt,$6.00,https://www.ajijo.com.au/collections/yoghurt/p...
1094,NATURAL POT SET YOGHURT FU 200G,'/products/natural-pot-set-yoghurt-200g',yoghurt,$2.30,https://www.ajijo.com.au/collections/yoghurt/p...


In [94]:
# Test with a few random urls
test_urls = ["https://www.ajijo.com.au/collections/basmati-rice",
             "https://www.ajijo.com.au/collections/biscuits-cookies-arnotts",
             "https://www.ajijo.com.au/collections/banana-chips", 
             "https://www.ajijo.com.au/collections/biscuits-cookies-others",
             "https://www.ajijo.com.au/collections/century"]

inventory = []
url_list = []
data = all_data['item_url'].sample(5)
for i, url in enumerate(data):
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        inventory_info = soup.find_all('div', class_="inventory")[0].text

        inventory.append(inventory_info)
        url_list.append(url)
    
    except:
        inventory.append("Error here")
        url_list.append(url)
        
    print(i+1, "Complete, {} more to go!".format(len(data) - i - 1))

1 Complete, 4 more to go!
2 Complete, 3 more to go!
3 Complete, 2 more to go!
4 Complete, 1 more to go!
5 Complete, 0 more to go!


In [5]:
inventory = []
url_list = []
data = all_data['item_url']

for i, url in enumerate(data):
    clear_output(wait=True)
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        inventory_info = soup.find_all('div', class_="inventory")[0].text

        inventory.append(inventory_info)
        url_list.append(url)
    
    except:
        inventory.append("Error here")
        url_list.append(url)
        
    print(" {} % Complete".format(np.round((i+1)/len(data)*100 ,2)))

 100.0 % Complete


In [133]:
inventory_data = pd.DataFrame({'inventory' : inventory,
             'item_url' : url_list})

In [10]:
inventory_data['inventory'].value_counts()

\nInventory policy :\n\nThis product is available!\n\n                                                                                             362
\nInventory policy :\n\nThis product is out of stock.\n\n                                                                                          285
\nInventory policy :\n\n                \n                10\n                \n                 \n              This product is available!\n\n     64
\nInventory policy :\n\n                \n                8\n                \n                 \n              This product is available!\n\n      57
\nInventory policy :\n\n                \n                6\n                \n                 \n              This product is available!\n\n      42
\nInventory policy :\n\n                \n                9\n                \n                 \n              This product is available!\n\n      39
\nInventory policy :\n\n                \n                5\n                \n               

In [14]:
subset1 = "\nInventory policy :\n\n                \n                "
subset2 = "\n                \n                 \n              This product is available!\n\n"
inventory_data[inventory_data['inventory'].str.contains(subset1)]['inventory'].str.replace(subset1, "").str.replace(subset2, "")

1        8
2        8
3        1
4        1
5       10
        ..
1070    10
1071    10
1072    12
1074    15
1077    15
Name: inventory, Length: 433, dtype: object

In [18]:
inventory_data[inventory_data['inventory'].str.contains(subset1)]['inventory'].apply(lambda x : x.split()[3])

1        8
2        8
3        1
4        1
5       10
        ..
1070    10
1071    10
1072    12
1074    15
1077    15
Name: inventory, Length: 433, dtype: object

In [51]:
# inventory_data['inventory'].apply(lambda x : x.replace("\n", "").split(":")).value_counts()
inventory_data[~(inventory_data['inventory'].str.contains(subset1) | inventory_data['inventory'].str.contains("Error"))]['inventory'].apply(lambda x:x.replace("\n", "").split(":")[1])

0          This product is available!
6          This product is available!
11      This product is out of stock.
12      This product is out of stock.
14         This product is available!
                    ...              
1091       This product is available!
1092       This product is available!
1093       This product is available!
1094    This product is out of stock.
1095       This product is available!
Name: inventory, Length: 647, dtype: object

In [57]:
inventory_data['inventory'].str.count("\n") == 5

0        True
1       False
2       False
3       False
4       False
        ...  
1091     True
1092     True
1093     True
1094     True
1095     True
Name: inventory, Length: 1096, dtype: bool

In [72]:
inventory_present_condition = inventory_data['inventory'].str.count("\n") == 9
inventory_present_data = inventory_data[inventory_present_condition]
inventory_present_data['inventory'] = inventory_present_data['inventory'].apply(lambda x : x.split()[3])
inventory_present_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,inventory,item_url
1,8,https://www.ajijo.com.au/collections/ahmed/pro...
2,8,https://www.ajijo.com.au/collections/ahmed/pro...
3,1,https://www.ajijo.com.au/collections/ahmed/pro...
4,1,https://www.ajijo.com.au/collections/ahmed/pro...
5,10,https://www.ajijo.com.au/collections/aluminium...
...,...,...
1070,10,https://www.ajijo.com.au/collections/wheat-flo...
1071,10,https://www.ajijo.com.au/collections/wheat-flo...
1072,12,https://www.ajijo.com.au/collections/wheat-flo...
1074,15,https://www.ajijo.com.au/collections/wheat-flo...


In [89]:
inventory_absent_data = inventory_data[~inventory_present_condition]

inventory_absent_data['inventory'].str.replace("\n", "").value_counts()

Inventory policy :This product is available!       362
Inventory policy :This product is out of stock.    285
Error here                                          16
Name: inventory, dtype: int64

In [54]:
"\nInventory policy :\n\nThis product is available!\n\n".count("\n")

9

In [143]:
case_2.sum()

16

In [148]:
def extract_inventory_data(all_data):
    
    inventory = []
    url_list = []
    data = all_data['item_url']

    for i, url in enumerate(data):

        clear_output(wait=True)

        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            inventory_info = soup.find_all('div', class_="inventory")[0].text

            inventory.append(inventory_info)
            url_list.append(url)

        except:
            inventory.append("Error here")
            url_list.append(url)

        print(" {} % Complete".format(np.round((i+1)/len(data)*100 ,2)))

        inventory_data = pd.DataFrame({'inventory' : inventory,
                 'item_url' : url_list})
        
        return inventory_data
    
def clean_inventory_data(inventory_data):

        case_1 = inventory_data['inventory'].str.count("\n") == 9 
        case_2 = (inventory_data['inventory'].str.count("\n") == 5) & (inventory_data['inventory'].str.contains("available!"))
        case_3 = inventory_data['inventory'].str.contains("out of stock")
        case_4 = inventory_data['inventory'].str.contains("Error")

        inventory_data.loc[case_1, 'inventory'] = inventory_data.loc[case_1, 'inventory'].apply(lambda x : x.replace("\n","").split()[3])
        inventory_data.loc[case_2, 'inventory'] = -99 # Available but unknown
        inventory_data.loc[case_3, 'inventory'] = -199 # Out of Stock
        inventory_data.loc[case_4, 'inventory'] = -299 # No information at all

        inventory_data.inventory = inventory_data.inventory.astype(int)

        return inventory_data

SyntaxError: invalid syntax (<ipython-input-148-65d9e4904e92>, line 30)

In [147]:
inventory_data = pd.DataFrame({'inventory' : inventory,
             'item_url' : url_list})

case_1 = inventory_data['inventory'].str.count("\n") == 9 
case_2 = (inventory_data['inventory'].str.count("\n") == 5) & (inventory_data['inventory'].str.contains("available!"))
case_3 = inventory_data['inventory'].str.contains("out of stock")
case_4 = inventory_data['inventory'].str.contains("Error")

inventory_data.loc[case_1, 'inventory'] = inventory_data.loc[case_1, 'inventory'].apply(lambda x : x.replace("\n","").split()[3])
inventory_data.loc[case_2, 'inventory'] = -99 # Available but unknown
inventory_data.loc[case_3, 'inventory'] = -199 # Out of Stock
inventory_data.loc[case_4, 'inventory'] = -299 # No information at all


inventory_data.inventory = inventory_data.inventory.astype(int)

inventory_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   inventory  1096 non-null   int32 
 1   item_url   1096 non-null   object
dtypes: int32(1), object(1)
memory usage: 13.0+ KB


In [158]:
df = pd.merge(all_data, inventory_data, on='item_url')

df = df[['name', 'inventory', 'price', 'price_correspondance', 'collection', 'item_url']]

df.to_csv("AJIJO_data.csv", index=False)

In [173]:
error_urls = df[df.item_url.str.contains("page")].item_url

error_urls = error_urls.str.replace("\?page=2", "")
error_urls = error_urls.str.replace("\?page=3", "")

error_urls.values

array(['https://www.ajijo.com.au/collections/basmati-rice/products/copy-of-matta-rice-kottackal-mayil-10-kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/basmati-rice-miller-5kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/rice-1kg-india-today',
       'https://www.ajijo.com.au/collections/basmati-rice/products/pattu-premium-basmati-rice-5-kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/india-gate-premium-basmati-rice-5kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/india-gate-premium-basmati-rice-10kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/india-gate-classic-basmati-rice-5kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/india-gate-classic-basmati-rice-10kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/daawat-select-rice-1kg',
       'https://www.ajijo.com.au/collections/basmati-rice/products/daawat-everyday-basmati-r

In [181]:
bad_url_condition = all_data.item_url.str.contains("page")
all_data.loc[bad_url_condition, 'item_url'] = all_data.loc[bad_url_condition, 'item_url'].str.replace("\?page=2", "")
all_data.loc[bad_url_condition, 'item_url'] = all_data.loc[bad_url_condition, 'item_url'].str.replace("\?page=3", "")

In [174]:
inventory = []
url_list = []
data = error_urls

for i, url in enumerate(data):

    clear_output(wait=True)

    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        inventory_info = soup.find_all('div', class_="inventory")[0].text

        inventory.append(inventory_info)
        url_list.append(url)

    except:
        inventory.append("Error here")
        url_list.append(url)

    print(" {} % Complete".format(np.round((i+1)/len(data)*100 ,2)))

    new_inventory_data = pd.DataFrame({'inventory' : inventory,
             'item_url' : url_list})

 100.0 % Complete


In [176]:
clean_inventory_data(new_inventory_data)

Unnamed: 0,inventory,item_url
0,7,https://www.ajijo.com.au/collections/basmati-r...
1,-199,https://www.ajijo.com.au/collections/basmati-r...
2,5,https://www.ajijo.com.au/collections/basmati-r...
3,-199,https://www.ajijo.com.au/collections/basmati-r...
4,20,https://www.ajijo.com.au/collections/basmati-r...
5,-199,https://www.ajijo.com.au/collections/basmati-r...
6,-199,https://www.ajijo.com.au/collections/basmati-r...
7,10,https://www.ajijo.com.au/collections/basmati-r...
8,-199,https://www.ajijo.com.au/collections/basmati-r...
9,20,https://www.ajijo.com.au/collections/basmati-r...
