### Imports

In [125]:
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
import config
import datetime as dt
import re
import googletrans
from googletrans import Translator
from nltk import PorterStemmer
import inflection as inf
ps = PorterStemmer()
import os
from PIL import Image
import requests
from pathlib import Path
from io import BytesIO


import warnings
warnings.filterwarnings('ignore') 

pd.options.display.max_columns = 300
pd.options.display.max_rows = 300
pd.options.display.max_colwidth = 400

In [29]:
dbtype = config.database_new['dbtype']
user = config.database_new['user']
password = config.database_new['password']
ip = config.database_new['ip']
port = config.database_new['port']
name = config.database_new['name']

engine = create_engine(f'{dbtype}://{user}:{password}@{ip}:{port}/{name}')

### Importing the dataset

In [30]:
query = """SELECT image, product_type FROM public.competitor_products"""

In [161]:
df = pd.read_sql_query(query, engine)

In [162]:
print(f'The shape of the Dataset is: {df.shape}')

The shape of the Dataset is: (22889, 2)


### Planification

In this notebook we will organize a dataset that contains only product type and image for us to do a classification dataset.
We don't know how well the articles are classified inside of shopify because the variable that they use is a text box open for whatever the store wants to put in there. 
Luckily, Zalando has a dataset that contains 60,000 classified pictures with 10 different classes, that is very useful for us to imputate some of the unknown classes in our dataset.

Since we are trying to classify the pictures, first we have to clean and group different product types

#### All the categories are in a certain language and we want to translate them to english, for this, we use the google translator module.

In [163]:
translator = Translator()

translations = {}
unique_elements = df['product_type'].unique()
for element in unique_elements:
    translations[element] = translator.translate(element).text

In [164]:
def replacing(x, maestro, df):
    for key in maestro:
        df[x] = df[x].str.replace(key, maestro[key])
        
def replace_rootwords(word, column, df):
    df.loc[df[column].str.contains(word), column] = word
    
def clean_word(word, column, df, clean_word):
    df.loc[df[column]==word, column] = clean_word

In [165]:
replacing('product_type', translations, df) #Translate all words to english

df['product_type'] = df['product_type'].str.lower() #lower all the words to avoid capitalized buckets

df['product_type'] = df['product_type'].apply(lambda x: ' '.join([inf.singularize(item) for item in x.split()])) #singularize words to group them all

word_list = ['sneaker', 't-shirt', 'sweatshirt', 'hoodie', 'dress'] #avoid single brand usage of the word for us to group in that category. Not going to use in all words to avoid unnecessary "overfitting"
for word in word_list:
    replace_rootwords(word, 'product_type', df)
    
df.loc[df['product_type'].str.contains('women')|df['product_type'].str.contains('woman'), 'product_type'] = df['product_type'].str.replace(' - ', '').str.replace(' women', '').str.replace('women ', '').str.replace('women', '').str.replace('woman', '')
df.loc[df['product_type'].str.contains('men')|df['product_type'].str.contains('man'), 'product_type'] = df['product_type'].str.replace(' - ', '').str.replace(' men', '').str.replace('men ', '').str.replace(' man', '').str.replace('man ', '')
df.loc[df['product_type'].str.contains('unisex'), 'product_type'] = df['product_type'].str.replace(' - ', '').str.replace('unisex', '')

dirty_t = ['tshirt', 't shirt', 'tee-shirt', 'tee', 'tee shirt']
for word in dirty_t:
    clean_word(word, 'product_type', df, 't-shirt')
dirty_sweat = ['sweat', 'sweatmen']
for word in dirty_sweat:
    clean_word(word, 'product_type', df, 'sweatshirt')

In [166]:
df['product_type'].value_counts()

sneaker                                          7506
t-shirt                                          2217
sweatshirt                                        913
shoe                                              684
shirt                                             671
sweater                                           529
dress                                             503
bottom                                            431
hoodie                                            414
sandal                                            395
boot                                              383
jumper                                            349
pant                                              328
footwear                                          328
outerwear                                         274
high-heeled sandal                                265
                                                  253
top                                               225
short                       

Now let's right a little bit of code that downloads every image of the dataset so that we have it in local.
I'm really new at image storing so I will store them all in a folder called images and then maybe will optimize the storing of the images.
I'll show two ways to store images, one in the same dataframe with ByteIO

#### Dataframe saved images

In [None]:
Using a web user agent we can request all the images and save them with BytesIo library in the same dataframe. The issue with this 

In [None]:
pictures = [None] * df.shape[0]

df.insert(df.shape[1], "pictures", pictures, True)

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/xxx.xx (KHTML, like Gecko) Chrome/xx.x.xxxx.xxx Safari/xxx.xx'}

for i in range(df.shape[0]):
    r = requests.get(df.iloc[i,0], headers=headers)
    r.raise_for_status()
    df.iloc[i,2] = Image.open(BytesIO(r.content))

In [158]:
df['filename'] = df.index.astype('string') + '.jpg'

Unnamed: 0,image,product_type,pictures,filename
4683,https://cdn.shopify.com/s/files/1/0003/6270/9002/products/TSHIRT_NOT_WELCOME_GREY_008_03_FULLRES_1_67093bd4-552d-4f17-b0ed-6d3f91cfeb12.jpg?v=1573129967,t-shirt,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2000x2000 at 0x1CBD0BA4EC8>,4683.jpg
6768,https://cdn.shopify.com/s/files/1/1514/1888/products/The_Brubaker_23-1-20112969.jpg?v=1592839138,american,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1333x2000 at 0x1CBD0CF5788>,6768.jpg
10820,https://cdn.shopify.com/s/files/1/0251/4544/2350/products/zapatillas-mujer-mayfair-05.jpg?v=1598615762,sneaker,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1500x1995 at 0x1CBD0C4FF08>,10820.jpg
8181,https://cdn.shopify.com/s/files/1/0037/8925/8841/products/17_d699f47c-111d-4eb4-8fb5-5d56db2405b5.jpg?v=1582144753,jumpsuit,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1920x2880 at 0x1CBD0BA48C8>,8181.jpg
3359,https://cdn.shopify.com/s/files/1/0122/2724/8185/products/OBEX_BC_SPIN_AVIP_RIGHT.jpg?v=1574068457,helmet,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2500x2500 at 0x1CBD0CF5A48>,3359.jpg
...,...,...,...,...
21420,https://cdn.shopify.com/s/files/1/1846/7645/products/mim-shoes-sandalia-cerdena-dorado-11950909816926.jpg?v=1579066791,high-heeled sandal,,21420.jpg
16201,https://cdn.shopify.com/s/files/1/0251/4544/2350/products/zapatillas-nino-vienna-kids-5.jpg?v=1598619398,sneaker,,16201.jpg
14540,https://cdn.shopify.com/s/files/1/0251/4544/2350/products/zapatillas-mujer-new-york-5.jpg?v=1598618621,sneaker,,14540.jpg
8748,https://cdn.shopify.com/s/files/1/0209/1522/products/hoodies-macaroni-hoodie-2_7a44097f-85e6-41e0-9bf8-4988eb2edb3e.jpg?v=1601506358,hoody,,8748.jpg


In [168]:
pictures = [None] * df.shape[0]

df.insert(df.shape[1], "pictures", pictures, True)

headers = {agent}

for i in range(df.shape[0]):
    r = requests.get(df.iloc[i,0], headers=headers)
    r.raise_for_status()
    df.iloc[i,2] = Image.open(BytesIO(r.content))

Unnamed: 0,image,product_type,filename
0,https://cdn.shopify.com/s/files/1/0015/0942/5197/products/26048-PINK-3.jpg?v=1604060161,blazer,0.jpg
1,https://cdn.shopify.com/s/files/1/0015/0942/5197/products/26048-PINK-3.jpg?v=1604060161,blazer,1.jpg
2,https://cdn.shopify.com/s/files/1/0015/0942/5197/products/26048-PINK-3.jpg?v=1604060161,blazer,2.jpg
3,https://cdn.shopify.com/s/files/1/0015/0942/5197/products/26048-PINK-3.jpg?v=1604060161,blazer,3.jpg
4,https://cdn.shopify.com/s/files/1/0015/0942/5197/products/24933-BEIGE-3.jpg?v=1604055392,cap,4.jpg
...,...,...,...
22884,https://cdn.shopify.com/s/files/1/0762/7039/products/back-strap-camel-sandal-alohas-138384.jpg?v=1578543593,sandal,22884.jpg
22885,https://cdn.shopify.com/s/files/1/0762/7039/products/back-strap-camel-sandal-alohas-138384.jpg?v=1578543593,sandal,22885.jpg
22886,https://cdn.shopify.com/s/files/1/0762/7039/products/back-strap-camel-sandal-alohas-138384.jpg?v=1578543593,sandal,22886.jpg
22887,https://cdn.shopify.com/s/files/1/0762/7039/products/back-strap-camel-sandal-alohas-138384.jpg?v=1578543593,sandal,22887.jpg
