# Star Jeans — Product pricing project based on competitor prices

## Business problem
- What is the best selling price for pants? 

## Questions to be answered:
- What is the best selling price for the pants?
- How many types of pants, and in which colors, should the initial product line include?
- What are the raw materials needed to make the pants?

## Data source
- H&M: https://www2.hm.com/en_us/men/products/jeans.html
- Macys: https://www.macys.com/shop/mens-clothing/mens-jeans

# Imports

In [183]:
# !pip install requests
# !pip install bs4

In [204]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np
import re

In [205]:
# Listing page that the following cells parse.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Browser-style User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# The timeout keeps the cell from hanging indefinitely on a stalled
# connection; raise_for_status() turns an HTTP error response into an
# exception here instead of letting an error page reach the parser.
page = requests.get( url, headers=headers, timeout=30 )
page.raise_for_status()

In [206]:
# Parse the downloaded HTML text with the 'html.parser' backend.
soup = BeautifulSoup( markup=page.text, features='html.parser' )

In [207]:
# First <ul> whose class attribute is 'products-listing small'; every later
# lookup in the listing cells is scoped to this element.
products = soup.find( 'ul', attrs={'class': 'products-listing small'} )

In [208]:
# <article class="hm-product-item"> elements found inside the listing.
article_tags = products.find_all( 'article', class_='hm-product-item' )

# product id and product category are read from attributes of each article
product_id = [tag.get( 'data-articlecode' ) for tag in article_tags]
product_category = [tag.get( 'data-category' ) for tag in article_tags]

# product name: text of every <a class="link"> inside the listing
name_tags = products.find_all( 'a', class_='link' )
product_name = [tag.get_text() for tag in name_tags]

# price: text of every <span class="price regular"> inside the listing
# (product_list ends the cell bound to these price elements)
product_list = products.find_all( 'span', class_='price regular' )
product_price = [tag.get_text() for tag in product_list]

In [209]:
# Each scraped list becomes one labelled row; transposing then yields one
# row per list position with the labels as column names.
showroom_columns = ['product_id', 'product_category', 'product_name','product_price']
showroom_values = [product_id, product_category, product_name,product_price]
data = pd.DataFrame( showroom_values, index=showroom_columns ).T

# scrape timestamp: the same formatted string is written to every row
scraped_at = datetime.now()
data['scrapy_datetime'] = scraped_at.strftime( '%Y-%m-%d %H:%M:%S' )

In [210]:
# Read the 'data-total' attribute of the first <h2 class="load-more-heading">;
# the value is a string and is used below as the item count.
load_more_headings = soup.find_all( 'h2', class_='load-more-heading' )
total_item = load_more_headings[0].get( 'data-total' )
total_item

'86'

In [211]:
# Item count divided by 36 (the divisor this notebook uses per page),
# rounded up; np.ceil returns a float.
item_count = int( total_item )
page_number = np.ceil( item_count / 36 )
page_number

3.0

In [212]:
# Listing URL with a page-size query parameter of page_number * 36.
# NOTE(review): no cell visible here requests url02 - the showroom table
# above was built from `url` alone; confirm whether this URL should be fetched.
page_size = int( page_number*36 )
url02 = f'{url}?page-size={page_size}'
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

# One Product

In [213]:
# # API Requests
# url = 'https://www2.hm.com/en_us/productpage.0636207010.html'
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# page = requests.get( url, headers=headers )

# # Beautiful Soup object
# soup = BeautifulSoup( page.text, 'html.parser' )



In [214]:
# # ==================== color name =================================
# product_list = soup.find_all( 'a', class_='filter-option miniature' )
# color_name = [p.get( 'data-color' ) for p in product_list]

# # product id
# product_id = [p.get( 'data-articlecode' ) for p in product_list]
# df_color = pd.DataFrame( [product_id, color_name] ).T
# df_color.columns = ['product_id', 'color_name']

In [215]:
# # generate style id + color id
# df_color['style_id'] = df_color['product_id'].apply( lambda x: x[:-3] )
# df_color['color_id'] = df_color['product_id'].apply( lambda x: x[-3:] )

In [216]:
# # ==================== composition =================================
# product_composition_list = soup.find_all( 'div',class_='pdp-description-list-item' )
# product_composition = [list( filter( None, p.get_text().split( '\n' ) ) ) for p in product_composition_list]

# # rename dataframe
# df_composition = pd.DataFrame( product_composition ).T
# df_composition.columns = df_composition.iloc[0]

# # delete first row
# df_composition = df_composition.iloc[1:].fillna( method='ffill' )

# # generate style id + color id
# df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3])
# df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:])

# # merge data color + decomposition
# data_sku = pd.merge( df_color, df_composition[['style_id', 'Fit','Composition']], how='left', on='style_id' )

# Multiple Products

In [217]:
# Preview the first rows of `data` before the per-product detail requests.
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-20 12:43:12
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-09-20 12:43:12
2,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-20 12:43:12
3,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-20 12:43:12
4,985197003,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-20 12:43:12


In [218]:
# Request one product page per row of `data`, extract the colour variants and
# the description attributes from it, and join the collected details back onto
# the showroom table as `data_raw`.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# unique columns for all products
aux = []
cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame( columns=cols )

# Per-product frames are collected in a list and concatenated once after the
# loop; calling pd.concat on a growing DataFrame every iteration re-copies all
# previously collected rows.
details_frames = []

# Iterate over the article codes themselves (no dependence on the index
# labels of `data`). Loop-local names are distinct from the listing cells'
# `url`, `page`, `soup` and `product_id`, so those are not overwritten.
for article_code in data['product_id']:

    # API Requests (timeout so a stalled connection cannot hang the loop)
    product_url = 'https://www2.hm.com/en_us/productpage.' + article_code + '.html'
    product_page = requests.get( product_url, headers=headers, timeout=30 )

    # Beautiful Soup object
    product_soup = BeautifulSoup( product_page.text, 'html.parser' )

    # ==================== color name =================================
    product_list = product_soup.find_all( 'a', class_='filter-option miniature' )
    color_name = [p.get( 'data-color' ) for p in product_list]

    # article code of each colour option
    color_product_id = [p.get( 'data-articlecode' ) for p in product_list]
    df_color = pd.DataFrame( [color_product_id, color_name] ).T
    df_color.columns = ['product_id', 'color_name']

    # generate style id + color id (the last three characters are split off
    # as the color id, the rest is the style id)
    df_color['style_id'] = df_color['product_id'].apply( lambda x: x[:-3] )
    df_color['color_id'] = df_color['product_id'].apply( lambda x: x[-3:] )

    # ==================== composition =================================
    product_composition_list = product_soup.find_all( 'div', class_='pdp-description-list-item' )
    product_composition = [list( filter( None, p.get_text().split( '\n' ) ) ) for p in product_composition_list]

    # rename dataframe: the first row holds the attribute names
    df_composition = pd.DataFrame( product_composition ).T
    df_composition.columns = df_composition.iloc[0]

    # delete first row and forward-fill the gaps
    # (.ffill() replaces the deprecated fillna( method='ffill' ))
    df_composition = df_composition.iloc[1:].ffill()

    # guarantee the same number of columns
    df_composition = pd.concat( [df_pattern, df_composition], axis=0 )

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3] )
    df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:] )
    aux.extend( df_composition.columns.tolist() )

    # merge data color + composition
    data_sku = pd.merge( df_color, df_composition[['style_id', 'Fit','Composition', 'Size', 'Product safety']], how='left', on='style_id' )

    # all details products
    details_frames.append( data_sku )

# With no products, df_details is an empty frame, as in the original cell.
df_details = pd.concat( details_frames, axis=0 ) if details_frames else pd.DataFrame()

# Join Showroom data + details
# NOTE(review): the merge key is style_id only, so a showroom row is repeated
# for every df_details row sharing that style_id - confirm this is intended.
data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )
data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit','Composition', 'Size', 'Product safety']],
                    how='left', on='style_id' )


In [219]:
# ## Lambda method (study notes)

# ## product price - remove $
# df_details['product_id'] = df_details['product_id'].apply( lambda x: x.replace( '$ ','' ) if pd.notnull( x ) else x )

# ## product price - remove $ all columns
# df_details['product_id'] = df_details.apply( lambda x: x['product_id'].replace( '$ ', '') if pd.notnull( x['product_id'] ) else x, axis=1 )

# # ## product price - remove $ - both conditional
# # df_details['product_id'] = df_details[['product_id', 'color_name']].apply( lambda x: x['product_id'].replace('$ ', '') if (pd.notnull(x['product_id']) ) & (x['color_name']=='Gray') else x, axis=1 )

# # # product price - remove $ - sequential conditional
# df_details['product_id'] = df_details[['product_id', 'color_name']].apply(lambda x: x if pd.isnull( x['product_id'] ) else x['product_id'].replace( '$ ', '')if x['color_name'] == 'Gray' else x, axis=1 )

In [232]:
# Re-import the modules this cell uses and load the dataset from CSV.
import re
import pandas as pd

# NOTE(review): the path is an empty string, so read_csv has no file to open;
# fill in the CSV path before running this cell.
data = pd.read_csv( '' )



# # product id
# data = data.dropna( subset=['product_id'] )
# data['product_id'] = data['product_id'].astype( int )

# # product name
# data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ','_' ).lower().lower() )



# data['product_price'] = data['product_price'].apply( lambda x: x.replace( '$ ','' ) ).astype( float )


# # scrapy datetime
# data['scrapy_datetime'] = pd.to_datetime( data['scrapy_datetime'],format='%Y-%m-%d %H:%M:%S' )

# # style id
# data['style_id'] = data['style_id'].astype( int )

# # color id
# data['color_id'] = data['color_id'].astype( int )

# data['color_name'] = data['color_name'].apply( lambda x: x.replace( ' ', '_' ).replace( '/', '_' ).lower() 
#                                               if pd.notnull( x ) else x )

# #fit
# data['fit'] = data['fit'].apply( lambda x: x.replace( ' ', '_' ).lower() if pd.notnull( x ) else x )

# # size number
# data['size_number'] = data['size_number'].apply( lambda x: re.search( '\d{3}cm', x ).group(0) if pd.notnull( x ) else x )
# data['size_number'] = data['size_number'].apply( lambda x: re.search( '\d+', x).group(0) if pd.notnull( x )else x )

# # size model
# data['size_model'] = data['size'].str.extract( '(\d+/\\d+)' )
# data = data.drop( columns=['size', 'product safety'], axis=1 )

# composition
# NOTE(review): no composition handling follows this header in the cell; it
# only previews the first rows of `data`.
data.head()

KeyError: 'size_number'

In [201]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-20 12:37:13,985197,1
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-09-20 12:37:13,985159,1
2,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-20 12:37:13,690449,51
3,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-20 12:37:13,690449,22
4,985197003,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-20 12:37:13,985197,3
