# Problem Statement :

Write an application to crawl an online fashion brand website, e.g. https://www.fordays.com, https://www.reformation.com or https://www.zara.com using a crawler framework such as Selenium, bs4, etc. You can use a crawl framework of your choice in Python. (YOU ONLY NEED TO SCRAPE A FEW PRODUCTS, not entire website, however, please explain your strategy to scrape the whole website, extract all the URLs and update the database automatically overtime (new products, update old products not available anymore)

this is an example of how the extracted information should be structured:

1.   display_name (str)
2.   product_material (str)
1.   color (str)
1.   size (list)
1.   price (str)
6.   product_url (str)
7.   image_links (list)
8.   brand_name (str)
1.   description (str)
2.   scrapped_date (date)
1.   low_level (str) [category of clothes: e.g. casual pants, dress]
2.   gender (str) [men, women, or kids]
13.  secondhand (bool) [is it from a second hand retailer, already worn?]

Store the data in a hosted PostGRES database.

In [1]:
#! pip install beautifulsoup4



In [1]:
import os
import re
import psycopg2
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen,Request

import sys
!{sys.executable} -m pip install psycopg2


# os.environ['host'] = /// set as your RDS endpoint 
# os.environ['username'] = /// set as your username
# os.environ['password'] = /// Set as your password


  """)




In [136]:
def get_product_details(url):
  # Empty dictionary to store the results
  product = {}

  # Getting the HTML Data from the specified url
  req = Request(url)
  page = urlopen(req)
  html = page.read().decode("utf-8")
  soup = BeautifulSoup(html, "html.parser")
  text = soup.get_text()

  # Extracting the display name of the product
  main_body = soup.find('div',{'class':'pdp-main__details max-width--xmedium'})
  title = main_body.find('div',{'class':'pdp__title'})
  display_name = str(title.h1.contents[0]).replace('\n','')

  #Extracting the price details of the product
  price_details = main_body.find('div',{'class':'price'}).find('span',{'class':'price--reduced'}).string
  price_details = price_details.replace('\n','')

  #Extracting the description of the product 
  main_details = main_body.find('div',{'class':'pdp__description display--small-up font-size--14 font-family--book margin-b--15'}).find('div',{'class':'cms-generic-copy'})
  description = str(main_details.string).replace('\n','')

  #Extracting the color of the product
  attributes = main_body.find('div',{'class':'product-attribute__list'})
  color_attr = attributes.find('label',{'class':'product-attribute__label product-attribute__label--color form-control-label'})
  color = str(color_attr.find('span',{'class':'product-attribute__selected-value'}).contents)

  #Extracting the size of the product 
  size_attr = attributes.find('div',{'class':'product-attribute product-attribute--size product-attribute--type-anchor product-attribute--last'})
  size_attr = size_attr.find('div',{'class':'product-attribute__contents flex flex-flow-wrap'})
  #Storing the sizes as a list
  sizes = []
  for child in size_attr.children:
    sizes.append(child.string.replace('\n',''))
  sizes = [sizes[i] for i in range(len(sizes)) if i % 2 == 1]

  #Extracting the fabric and details of the product
  product_details = main_body.find('div',{'class':'pdp__accordion'})
  fabric_care = product_details.find('div',{'class':'pdp__accordion-content js-pdp-care'})
  #Extracting only the fabric description which consists of a blend of different fabrics
  for info in fabric_care.contents:
    if any(chr.isdigit() for chr in info.string):
      product_material = info.string



  product_material = product_material.replace('\n','')


  product_page = soup.find('div',{'class':'pdp-main'})
  bread_crumbs = product_page.find('div',{'class':'pdp__breadcrumbs max-width--xlarge display--medium-up'})
  url_pieces = [value 
            for element in bread_crumbs.find_all('a',class_=True) 
            for value in element["href"]]
  product_url = "".join(url_pieces)
  product['display_name'] = display_name
  product['price_details'] = int(price_details.replace('$',''))
  product['description'] = description.replace(","," ")
  product['color'] = color 
  product['sizes'] = "["+" ".join(sizes)+"]"
  product['product_material'] = product_material.replace(","," ")
  product['product_url'] = product_url 

  return product



data = get_product_details("https://www.thereformation.com/products/ezrana-knit-tank/1310760BLK.html?dwvar_1310760BLK_color=BLK")


In [137]:
data

{'color': "['Black']",
 'description': 'Your arms want some attention. The Ezrana is a sleeveless  fitted top with a square neckline. It features chain link straps and can be worn day-to-night if you ask us.',
 'display_name': 'Ezrana Knit Tank',
 'price_details': 128,
 'product_material': 'Eco Rib is a medium weight  stretchy ribbed knit with a soft handfeel - 88% TENCEL™ lyocell  12% spandex.',
 'product_url': 'https://www.thereformation.com//tops/products/ezrana-knit-tank/1310760.html',
 'sizes': '[XS S M L XL]'}

In [139]:
df = pd.DataFrame([data])
df['color'] = df['color'].astype(str)
df['description'] = df['description'].astype(str)
df['display_name'] = df['display_name'].astype(str)
df['price_details'] =  df['price_details'].astype(float)
df['product_material'] = df['product_material'].astype(str)
df['product_url'] = df['product_url'].astype(str)
df['sizes'] = df['sizes'].astype(str)

In [140]:
df.to_csv('new_data.csv',index= None)

In [129]:
#!curl ifconfig.me

In [147]:



connection = psycopg2.connect(
    host = os.environ.get('host'),
    port = 5432,
    user =  os.environ.get('username'),
    password =  os.environ.get('password'),
    database='postgres'
    )
cursor=connection.cursor()

In [148]:
cursor.execute("""CREATE TABLE product_details_12(
display_name text,
price_details float,
description text,
color text,
sizes text,
product_material text,
product_url text)""")

In [149]:
connection.commit()

In [150]:
with open('new_data.csv', 'r') as row:
    next(row) 
    cursor.copy_from(row, 'product_details_12', sep=',')