In [2]:
import pandas as pd
import gzip
import json
import seaborn as sns
import numpy as np
import time
import html
import re
from tqdm import trange
import os
from typing import *
from tqdm import tqdm, trange
import urllib.request
from PIL import Image
import cv2 as cv
from matplotlib import pyplot as plt

### Предобработка датасета, 

In [3]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

        
def getPandasDataFrame(path, limit=None):
    i = 0
    df = {}
    try:
        for d in tqdm(parse(path)):
            df[i] = d
            i += 1
            if limit is not None and i >= limit:
                break
    except KeyboardInterrupt:
        pass
    return pd.DataFrame.from_dict(df, orient='index')

In [7]:
%%time
meta_electronics = getPandasDataFrame('meta_Electronics.json.gz', limit=None)

786445it [12:38, 1037.12it/s]


Wall time: 12min 47s


In [92]:
meta_electronics.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Electronics, Camera &amp; Photo, Video Survei...",,[The following camera brands and models have b...,,Genuine Geovision 1 Channel 3rd Party NVR IP S...,[],,GeoVision,"[Genuine Geovision 1 Channel NVR IP Software, ...","[>#3,092 in Tools &amp; Home Improvement &gt; ...",[],Camera &amp; Photo,,"January 28, 2014",$65.00,11300000,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
1,"[Electronics, Camera &amp; Photo]",,[This second edition of the Handbook of Astron...,,"Books ""Handbook of Astronomical Image Processi...",[0999470906],,33 Books Co.,[Detailed chapters cover these fundamental top...,"[>#55,933 in Camera &amp; Photo (See Top 100 i...","[0943396670, 1138055360, 0999470906]",Camera &amp; Photo,,"June 17, 2003",,43396828,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Electronics, eBook Readers &amp; Accessories,...",,[A zesty tale. (Publishers Weekly)<br /><br />...,,One Hot Summer,"[0425167798, 039914157X]",,Visit Amazon's Carolina Garcia Aguilera Page,[],"3,105,177 in Books (",[],Books,,,$11.49,60009810,[],[],
3,"[Electronics, eBook Readers & Accessories, eBo...",,[],,Hurray for Hattie Rabbit: Story and pictures (...,"[0060219521, 0060219580, 0060219394]",,Visit Amazon's Dick Gackenbach Page,[],"2,024,298 in Books (","[0060219521, 0060219475, 0060219394]",Books,,,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602,[],[],
4,"[Electronics, eBook Readers & Accessories, eBo...",,[&#8220;sex.lies.murder.fame. is brillllli&#82...,,sex.lies.murder.fame.: A Novel,[],,Visit Amazon's Lolita Files Page,[],"3,778,828 in Books (",[],Books,,,$13.95,60786817,[],[],


### Подготовим датасет

In [87]:
meta_electronics = meta_electronics.drop_duplicates('title')
meta_electronics.shape

(742022, 19)

In [88]:
# функция для обработки pandas-df батчами
def map_by_batches(df, func, batch_size = 2000):
    batch_results = []
    for i in trange(df.shape[0] // batch_size + 1):
        l = batch_size * i
        r = min(df.shape[0], l + batch_size)
        batch_results.append(func(df[l : r]))
    return pd.concat(batch_results)

In [93]:
# парсим категорию и ранг из строки 
def extract_rank_and_category(data):
    rank_str = re.compile(">#[\d,]* in ")
    
    ranks = []
    categories = []

    description = []
    title = []
    brand = []
    feature = []
    price = []
    images = []
    imagesHighRes = []
    asin = []

    def extract(string):
        result = rank_str.match(string)
        if not result:
            return -1, ""
        rank = result.group(0)[2:-4]
        category = string[result.span()[1]:] 
        return int(rank.replace(',', '')), html.unescape(category)
    try:
        for i in range(data.shape[0]):
            string = data.iloc[i]['rank']
            if not isinstance(string, str):
                if not string:
                    continue
                else:
                    string_list = string
                    for string in string_list:
                        rank, category = extract(string)
                        if rank == -1:
                            continue
                        ranks.append(rank)
                        categories.append(category)
                        description.append(data.iloc[i]['description'])
                        title.append(data.iloc[i]['title'])
                        brand.append(data.iloc[i]['brand'])
                        feature.append(data.iloc[i]['feature'])
                        price.append(data.iloc[i]['price'])
                        asin.append(data.iloc[i]['asin'])
                        images.append(data.iloc[i]['imageURL'])
                        imagesHighRes.append(data.iloc[i]['imageURLHighRes'])
            else:
                rank, category = extract(string)
                if rank == -1:
                    continue
                ranks.append(rank)
                categories.append(category)
                description.append(data.iloc[i]['description'])
                title.append(data.iloc[i]['title'])
                brand.append(data.iloc[i]['brand'])
                feature.append(data.iloc[i]['feature'])
                price.append(data.iloc[i]['price'])
                asin.append(data.iloc[i]['asin'])
                images.append(data.iloc[i]['imageURL'])
                imagesHighRes.append(data.iloc[i]['imageURLHighRes'])
    except KeyboardInterrupt:
        pass

    df = pd.DataFrame({'rank':np.array(ranks), 'category':np.array(categories), 
                                      'description':np.array(description), 'title':np.array(title),
                                      'brand':np.array(brand), 'feature':np.array(feature),
                                      'price':np.array(price),
                                      'asin': np.array(asin),
                                      'images': np.array(images),
                                      'imagesHighRes': np.array(imagesHighRes),
                                     }, dtype=object)
    return df

In [94]:
meta_electronics_with_prices = meta_electronics[meta_electronics['price'] != '']
meta_electronics_with_prices.shape

(291809, 19)

In [95]:
def simple_filter(df):
    df = df[df['price'] != '']
    df = df.drop_duplicates('title')
    print(df.shape)
    return df

In [100]:
%%time
meta_Arts_Crafts_and_Sewing = getPandasDataFrame('meta_Arts_Crafts_and_Sewing.json.gz', limit=None)
meta_Arts_Crafts_and_Sewing = simple_filter(meta_Arts_Crafts_and_Sewing)

302988it [01:22, 3683.56it/s]


(182572, 19)
Wall time: 1min 27s


In [101]:
meta_electronics_clear = map_by_batches(meta_electronics_with_prices, extract_rank_and_category, batch_size = 20000)

  'description':np.array(description), 'title':np.array(title),
  'brand':np.array(brand), 'feature':np.array(feature),
  'images': np.array(images),
  'imagesHighRes': np.array(imagesHighRes),
100%|██████████| 15/15 [03:29<00:00, 13.99s/it]


In [102]:
meta_Arts_Crafts_and_Sewing = map_by_batches(meta_Arts_Crafts_and_Sewing, extract_rank_and_category, batch_size = 20000)

  'description':np.array(description), 'title':np.array(title),
  'brand':np.array(brand), 'feature':np.array(feature),
  'images': np.array(images),
  'imagesHighRes': np.array(imagesHighRes),
100%|██████████| 10/10 [02:37<00:00, 15.78s/it]


In [157]:
metadata_clear = pd.concat([meta_Arts_Crafts_and_Sewing, meta_electronics_clear])

In [159]:
metadata_clear.shape

(879794, 10)

In [160]:
metadata_clear.head()

Unnamed: 0,rank,category,description,title,brand,feature,price,asin,images,imagesHighRes
0,9616321,Home & Kitchen (See Top 100 in Home & Kitchen),[],Pinkie Tm girl flower Handmade soap silicone m...,,[],$35.00,7121277158,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
1,11120,"Arts, Crafts & Sewing > Craft Supplies > Soap ...",[],Pinkie Tm girl flower Handmade soap silicone m...,,[],$35.00,7121277158,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
2,1953912,Home & Kitchen (See Top 100 in Home & Kitchen),[],Pinkie Tm 3D blooming chrysanthemums Flower So...,,[],$35.00,7121280027,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
3,2574,"Arts, Crafts & Sewing > Craft Supplies > Soap ...",[],Pinkie Tm 3D blooming chrysanthemums Flower So...,,[],$35.00,7121280027,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,761776,Kitchen & Dining (See Top 100 in Kitchen & Din...,[moldsize:L7.2xW3.7xH4.8cm(L2.83xW1.46xH1.89in...,Pinkie Tm Rabbit animal silicone soap mold for...,pinkie,"[Made in 100% pure silicone,softness and comfo...",$7.99,7121281821,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [163]:
# ! pip install beautifulsoup4
from bs4 import BeautifulSoup

def get_filtered_sentence(sentence):
    cleantext = BeautifulSoup(str(sentence).replace("[", "").replace("'", "").replace("]", ""), "html.parser").text
    return cleantext

In [164]:
metadata_clear['description'] = metadata_clear['description'].progress_apply(lambda x: get_filtered_sentence(str(x)))

100%|██████████| 879794/879794 [03:28<00:00, 4222.69it/s]


In [165]:
metadata_clear['title'] = metadata_clear['title'].progress_apply(lambda x: get_filtered_sentence(str(x)))

100%|██████████| 879794/879794 [01:47<00:00, 8161.81it/s]


In [166]:
metadata_clear['feature'] = metadata_clear['feature'].progress_apply(lambda x: get_filtered_sentence(str(x)))

100%|██████████| 879794/879794 [02:33<00:00, 5747.55it/s]


In [167]:
metadata_clear['brand'] = metadata_clear['brand'].progress_apply(lambda x: get_filtered_sentence(str(x)))

100%|██████████| 879794/879794 [01:54<00:00, 7700.74it/s]


In [168]:
from price_parser import Price

def parse_price(price):
    res = 0
    try:
        res = Price.fromstring(str(price)).amount_float
    except Exception:
        res = -1
    return res

In [169]:
metadata_clear['price'] = metadata_clear['price'].progress_apply(lambda x: parse_price(x))

100%|██████████| 879794/879794 [00:30<00:00, 29266.40it/s]


In [170]:
metadata_clear = metadata_clear.dropna()

In [171]:
metadata_clear.shape

(878653, 10)

Создаём уникальные id для категорий

In [172]:
categories_list = metadata_clear['category'].values.tolist()

In [173]:
len(set(categories_list))

2682

In [174]:
from collections import defaultdict
  
temp = defaultdict(lambda: len(temp))
categories_ids = [temp[ele] for ele in categories_list]
len(set(categories_ids))

2682

In [175]:
metadata_clear['category_id'] = np.array(categories_ids)

In [176]:
metadata_clear['rank'].describe()

count     878653
unique    316786
top            1
freq         448
Name: rank, dtype: int64

Уберём очень большие ранги

In [177]:
metadata_clear = metadata_clear[metadata_clear['rank'] <= 1000000]

In [178]:
metadata_parsed = metadata_clear.copy()

In [179]:
metadata_parsed['rank'] = metadata_parsed['rank'].astype(float)

Модели ранжирования в TF Ranking подразумевают, что чем больше скор, тем лучше, поэтому нужна функция, которая адекватно переводит ранги из отсортированных по возрастанию в отсортированные по убыванию, в идеале - ещё в небольшом диапазоне, для вычислительной стабильности

In [180]:
metadata_parsed['rank_scaled'] = np.log10(metadata_parsed['rank'])

In [181]:
metadata_parsed.head()

Unnamed: 0,rank,category,description,title,brand,feature,price,asin,images,imagesHighRes,category_id,rank_scaled
1,11120.0,"Arts, Crafts & Sewing > Craft Supplies > Soap ...",,Pinkie Tm girl flower Handmade soap silicone m...,,,35.0,7121277158,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,1,4.046105
3,2574.0,"Arts, Crafts & Sewing > Craft Supplies > Soap ...",,Pinkie Tm 3D blooming chrysanthemums Flower So...,,,35.0,7121280027,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,1,3.410609
4,761776.0,Kitchen & Dining (See Top 100 in Kitchen & Din...,moldsize:L7.2xW3.7xH4.8cm(L2.83xW1.46xH1.89inc...,Pinkie Tm Rabbit animal silicone soap mold for...,pinkie,"Made in 100% pure silicone,softness and comfor...",7.99,7121281821,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,2,5.881827
5,636.0,"Arts, Crafts & Sewing > Craft Supplies > Ceram...",moldsize:L7.2xW3.7xH4.8cm(L2.83xW1.46xH1.89inc...,Pinkie Tm Rabbit animal silicone soap mold for...,pinkie,"Made in 100% pure silicone,softness and comfor...",7.99,7121281821,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,3,2.803457
7,7303.0,"Arts, Crafts & Sewing > Craft Supplies > Soap ...",,Pinkie Tm Womens Wallets handbag purse shaped ...,,,35.0,7121282534,[],[],1,3.863501


In [182]:
metadata_parsed['rank_scaled'].describe()

count    818780.000000
mean          4.139722
std           1.089932
min           0.000000
25%           3.392873
50%           4.195124
75%           4.989192
max           5.999997
Name: rank_scaled, dtype: float64

In [183]:
metadata_parsed['rank_scaled'] = -metadata_parsed['rank_scaled'] + metadata_parsed['rank_scaled'].max() 

In [184]:
metadata_parsed['rank_scaled'].describe()

count    818780.000000
mean          1.860274
std           1.089932
min           0.000000
25%           1.010805
50%           1.804872
75%           2.607124
max           5.999997
Name: rank_scaled, dtype: float64

In [185]:
df = metadata_parsed.groupby('category_id')['title'].count() > 40

In [186]:
df = df.reset_index()

In [187]:
category_ids = list(df[df['title'] == True].category_id)

In [188]:
metadata_parsed = metadata_parsed[metadata_parsed['category_id'].isin(category_ids)]

In [189]:
metadata_parsed.shape

(805088, 12)

In [191]:
%%time
metadata_parsed.to_csv('metadata_parsed_with_images.csv', index=False)

Wall time: 29.4 s


Работа с изображениями

In [4]:
%%time
metadata_test = pd.read_csv('metadata_parsed_with_images.csv')

CPU times: total: 8.55 s
Wall time: 8.83 s


In [5]:
metadata_test.shape

(805088, 12)

In [6]:
metadata_test = metadata_test.fillna('')

In [7]:
import ast

In [8]:
%%time
metadata_test['images'] = metadata_test['images'].apply(lambda x: ast.literal_eval(x))

CPU times: total: 9.88 s
Wall time: 9.91 s


In [9]:
%%time
metadata_test['imagesHighRes'] = metadata_test['imagesHighRes'].apply(lambda x: ast.literal_eval(x))

CPU times: total: 9.91 s
Wall time: 9.91 s


функция скачивает изображение и сохраняет по пути path

In [12]:
import traceback
from functools import partial
def get_image(urls, path='images_0/'):
    import traceback
    import urllib.request
    try:
        if len(urls) == 0:
            return None
        url = urls[0]
        urllib.request.urlretrieve(url, path + url[49:])
    except Exception:
        print(f"Exception: {traceback.format_exc()}")
    return None

In [21]:
%%time
import pandas as pd
import mapply

mapply.init(
    n_workers=-1
)

# _ = part.images.mapply(partial(get_image, path='images_0/'))

CPU times: total: 15.6 ms
Wall time: 48.1 ms


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
part = metadata_test.iloc[500000:600000]
part.imagesHighRes.mapply(partial(get_image, path=f'images_high_res_small/'))

 30%|███       | 17/56 [06:09<13:23, 20.61s/it]  