# Star Jeans _ Product pricing project based on competition

## Business problem
- What is the best selling price for pants? 

## Questions to be answered:
- What is the best selling price for the pants?
- How many types of pants and their colors for the initial product?
- What are the raw materials needed to make the pants?

## Data source
- H&M: https://www2.hm.com/en_us/men/products/jeans.html
- Macys: https://www.macys.com/shop/mens-clothing/mens-jeans

# Imports

In [1]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np
import re
import sqlite3 
from sqlalchemy import create_engine

# Loading data

# Data Collection

In [2]:
# Parameters
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# URL
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Request to URL
page = requests.get( url, headers=headers )

# Beautiful soup object
soup = BeautifulSoup( page.text, 'html.parser' )

#=============Proucts data=======================
products = soup.find( 'ul', class_='products-listing small' )

product_list = products.find_all( 'article', class_='hm-product-item')

# product id
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# product category
product_category = [p.get( 'data-category' ) for p in product_list]

# product name
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]

# price
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]

data = pd.DataFrame( [product_id, product_category, product_name,product_price] ).T
data.columns = ['product_id', 'product_category', 'product_name','product_price']

# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )

In [3]:
data.shape

(36, 5)

# Data collect by product

In [4]:
# # empty dataframe
df_compositions = pd.DataFrame()

# # unique columns for all products
aux = []

df_pattern = pd.DataFrame(columns=['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size'] )

for i in range(len(data)):
    
# # API Requests
#         url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
        url = 'https://www2.hm.com/en_us/productpage.'+'0985197005'+'.html'
        print(url)
        
        page = requests.get( url, headers=headers )

        # Beautiful Soup object
        soup = BeautifulSoup( page.text, 'html.parser' )

        # ==================== color name =================================
        product_list = soup.find_all( 'a', class_='filter-option miniature active')+soup.find_all( 'a', class_='filter-option miniature' )
        color_name = [p.get( 'data-color' ) for p in product_list]

        # product id
        product_id = [p.get( 'data-articlecode' ) for p in product_list]
        
        df_color = pd.DataFrame( [product_id, color_name] ).T
        df_color.columns = ['product_id', 'color_name']
        
        for j in len(df_color):
    

# # ==================== composition =================================

                product_composition_list = soup.find_all( 'div',class_='pdp-description-list-item' )
                product_composition = [list( filter( None, p.get_text().split( '\n' ) ) ) for p in product_composition_list]

                # reaname dataframe
                df_composition = pd.DataFrame( product_composition ).T
                df_composition.columns = df_composition.iloc[0]

                # delete first row
                df_composition = df_composition.iloc[1:].fillna( method='ffill' )

                # remove pocket lining, shell and lining
                df_composition['Composition']=df_composition['Composition'].replace('Pocket lining: ','',regex=True)
                df_composition['Composition']=df_composition['Composition'].replace('Shell: ','',regex=True)
                df_composition['Composition']=df_composition['Composition'].replace('Lining: ','',regex=True)


        #         # garantee the same number of columns
                df_composition = pd.concat( [df_pattern, df_composition], axis=0 )



        # rename columns

                df_composition.columns=['product_id','composition','fit','product safety','size']


        #         keep new colors if its shows up
                aux = aux + df_composition.columns.tolist()

        #         # merge data color + composition
                df_composition = pd.merge(df_composition, df_color, how='left', on='product_id' )

        #         # all details products
                df_compostions = pd.concat([df_compositions,df_composition, data_sku], axis=0 )


        #         # generate style id + color id
        #         df_composition['style_id'] = df_composition['Art. No.'].apply( lambda x: x[:-3] )
        #         df_composition['color_id'] = df_composition['Art. No.'].apply( lambda x: x[-3:] )

# # Join Showroom data + details
# data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
# data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )
# data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit','Composition', 'Size', 'Product safety']], how = 'left', on='style_id')




# data_raw.to_csv('Datasets/data_raw_star_jeans.csv', index=False)

https://www2.hm.com/en_us/productpage.0985197005.html


TypeError: 'int' object is not iterable

In [5]:
df_composition

NameError: name 'df_composition' is not defined

# Data Cleaning

In [6]:
data = pd.read_csv('Datasets/data_raw_star_jeans.csv')

# product id
data = data.dropna( subset=['product_id'] )
data['product_id'] = data['product_id'].astype( int )

# product name
data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ','_' ).lower() )

# product price
data['product_price'] = data['product_price'].apply( lambda x: x.replace( '$ ','' ) ).astype( float )

# scrapy datetime
data['scrapy_datetime'] = pd.to_datetime( data['scrapy_datetime'],format='%Y-%m-%d %H:%M:%S' )

# style id
data['style_id'] = data['style_id'].astype( int )

# color id
data['color_id'] = data['color_id'].astype( int )

# color name
data['color_name'] = data['color_name'].apply( lambda x: x.replace( ' ', '_' ).replace( '/', '_' ).lower() 
                                              if pd.notnull( x ) else x )

# fit
data['Fit'] = data['Fit'].apply( lambda x: x.replace( ' ', '_' ).lower() if pd.notnull( x ) else x )

# size number
data['size_number'] = data['Size'].apply( lambda x: re.search( '\d{3}cm', x ).group(0)
                                         if pd.notnull( x ) else x )

data['size_number'] = data['Size'].apply( lambda x: re.search( '\d+', x ).group(0) if pd.notnull( x ) else x )

# size model
data['size_model'] = data['Size'].str.extract( '(\d+/\\d+)' )

# composition

### drop Pocket lining,Lining, Shell

data=data[~data['Composition'].str.contains('Pocket lining:',na=False)]
data=data[~data['Composition'].str.contains('Lining:',na=False)]
data=data[~data['Composition'].str.contains('Shell:',na=False)]
data=data[~data['Composition'].str.contains('Pocket:',na=False)]

# Drop duplicates
data=data.drop_duplicates(subset=['product_id', 'product_category', 'product_name', 'product_price',
      'scrapy_datetime', 'style_id', 'color_id', 'color_name', 'Fit'],keep='last')

#Reset index
data=data.reset_index(drop=True)


# break composition by comma
df1=data['Composition'].str.split(',', expand=True)


# ##############################Creating a collumn for each matterials#####################################

# cotton/polyester/Elastano/Elasterell

df_ref= pd.DataFrame(index=np.arange(len(data)),columns=['cotton','polyester','elastane','elasterell'])

# Cotton
df_cotton=df1[0]
df_cotton.name = 'cotton'

df_ref= pd.concat([df_ref,df_cotton],axis=1)
df_ref=df_ref.iloc[:,~df_ref.columns.duplicated(keep='last')] # elimina e mantem a ultima coluna 
df_ref['cotton']=df_ref['cotton'].fillna('Cotton 0%')
    
# Polyester

df_polyester= df1.loc[df1[1].str.contains('Polyester',na=True),1]
df_polyester.name = 'polyester'
df_ref= pd.concat([df_ref,df_polyester],axis=1)
df_ref=df_ref.iloc[:,~df_ref.columns.duplicated(keep='last')] # elimina e mantem a ultima coluna 
df_ref['polyester']=df_ref['polyester'].fillna('Polyester 0%')

# # Elastano
df_elastane= df1.loc[df1[1].str.contains('Elastane',na=True),1]
df_elastane.name = 'elastane'

df_elastane=df_elastane.combine_first(df1[2])
df_ref= pd.concat([df_ref,df_elastane],axis=1)
df_ref=df_ref.iloc[:,~df_ref.columns.duplicated(keep='last')] # elimina e mantem a ultima coluna 
df_ref['elastane']=df_ref['elastane'].fillna('Elastane 0%')


# # Elasterell
df_elasterell= df1.loc[df1[1].str.contains('Elasterell',na=True),1]
df_elasterell.name = 'elasterell'

df_ref= pd.concat([df_ref,df_elasterell],axis=1)
df_ref=df_ref.iloc[:,~df_ref.columns.duplicated(keep='last')] # elimina e mantem a ultima coluna 
df_ref['elasterell']=df_ref['elasterell'].fillna('Elasterell 0%')


# # #Final join
data=pd.concat([data, df_ref], axis=1)
data['cotton']=data['cotton'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)
data['elastane']=data['elastane'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)
data['polyester']=data['polyester'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)
data['elasterell']=data['elasterell'].apply(lambda x: int(re.search('\d+',x).group(0))/100 if pd.notnull(x) else x)

#Drop columns
data = data.drop( columns=['Size', 'Product safety', 'Composition'], axis=1 )
data.drop_duplicates()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,size_number,size_model,cotton,polyester,elastane,elasterell
0,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,light_denim_blue_trashed,skinny_fit,187,32/32,0.98,0.00,0.02,0.00
1,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,denim_blue,skinny_fit,187,32/32,0.98,0.00,0.02,0.00
2,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,light_denim_blue,skinny_fit,187,32/32,0.98,0.00,0.02,0.00
3,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,black_washed_out,skinny_fit,187,32/32,0.98,0.00,0.02,0.00
4,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,white,skinny_fit,187,32/32,0.98,0.00,0.02,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,720504010,men_jeans_skinny,skinny_jeans,24.99,2021-10-08 11:55:50,720504,10,light_blue,skinny_fit,,,0.90,0.08,0.02,0.00
115,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-10-08 11:55:50,1004476,5,black_no_fade_black,slim_fit,,,0.90,0.00,0.02,0.08
116,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-10-08 11:55:50,1004476,5,light_denim_blue,slim_fit,182,31/32,0.90,0.00,0.02,0.08
117,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-10-08 11:55:50,1004476,5,dark_denim_blue,slim_fit,182,31/32,0.90,0.00,0.02,0.08


In [7]:
data.to_csv('Datasets/products_hm_cleaned.csv', index=False)

# Data Base Sqlite

In [26]:
data = pd.read_csv('Datasets/products_hm_cleaned.csv')

In [27]:
query_showroom_schema = '''
    CREATE TABLE showroom (
        product_id        INTEGER, 
        product_category  TEXT, 
        product_name      TEXT,
        product_price     REAL,
        scrapy_datetime   TEXT, 
        style_id          INTEGER, 
        color_id          TEXT, 
        color_name        TEXT, 
        Fit               TEXT,
        size_number       REAL, 
        size_model        TEXT, 
        cotton            REAL, 
        polyester         REAL, 
        elastane          REAL,
        elasterell        REAL
  
    )
    
'''

In [28]:
#connect to data base 

conn = sqlite3.connect ('hm_db.sqlite')
conn.execute(query_showroom_schema)
conn.commit()
conn.close()

In [29]:
conn=create_engine('sqlite:///hm_db.sqlite',echo=False)

In [30]:
query = """
    SELECT* FROM showroom
"""
df = pd.read_sql_query( query, conn )

In [31]:
# insert data to table
data.to_sql( 'showroom', con=conn, if_exists='append', index=False )

In [32]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,size_number,size_model,cotton,polyester,elastane,elasterell
0,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,light_denim_blue_trashed,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
1,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,denim_blue,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
2,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,light_denim_blue,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
3,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,black_washed_out,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
4,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 11:55:50,690449,51,white,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
