In [None]:
import numpy as np
import pandas as pd
import lxml.html
import json
import math

## POST Получить категории

In [None]:
# Parse the saved categories response.  The document is read inside the
# `with` block so the file handle is closed as soon as parsing is done,
# instead of staying open for the lifetime of the kernel.
with open('/home/abezpalov/data/treolan/categories_data.xml', 'rb') as file:
    tree = lxml.html.parse(file)

In [None]:
def fix_category_name(text):
    """Return the part of ``text`` that follows its last '. ' separator.

    Text that does not contain the separator is returned unchanged.
    """
    _, _, tail = text.rpartition('. ')
    return tail

# Convert the format: one row per <category> element, one column per XML
# attribute of that element.
data_list = [dict(item.items()) for item in tree.xpath('.//category')]

# Clean the data: rename the source columns and pass every category name
# through fix_category_name.
df = pd.DataFrame(data_list).rename(
    columns={'id': 'key', 'parentid': 'parent_key', 'sortindex': 'order'}
)
df['name'] = df['name'].apply(fix_category_name)

In [None]:
# Display the categories table built in the previous cell.
df

## POST Получить каталог

In [None]:
# Parse the saved catalogue response.  The document is read inside the
# `with` block so the file handle is closed as soon as parsing is done,
# instead of staying open for the lifetime of the kernel.
with open('/home/abezpalov/data/treolan/products_data.xml', 'rb') as file:
    tree = lxml.html.parse(file)

In [None]:
# Convert the format: one row per <position> element, carrying the id of the
# <category> element it sits under plus all of its own XML attributes.
data_list = []
for category_item in tree.xpath('.//category'):
    category_key = category_item.xpath('./@id')[0]
    data_list.extend(
        {'category_key': category_key, **dict(position.items())}
        for position in category_item.xpath('./position')
    )
df = pd.DataFrame(data_list)
df

In [None]:
# vendor_df: every distinct (vendor-id, vendor) pair found in the catalogue,
# exposed under the column names `key` and `name`.
vendors_df = (
    df[['vendor-id', 'vendor']]
    .drop_duplicates()
    .rename(columns={'vendor-id': 'key', 'vendor': 'name'})
)
vendors_df

In [None]:
# product_df

def get_product_metadata(row):
    """Serialize the row's 'articul' and 'prid' values as a JSON object string."""
    metadata = {field: row[field] for field in ('articul', 'prid')}
    return json.dumps(metadata)

def get_part_number(x):
    """Strip surrounding whitespace and known trailing markers from an article.

    Each suffix is tested exactly once, in the order listed, against the
    value as it stands at that moment, so more than one suffix can be removed
    in a single call (e.g. '..' followed by '.').
    """
    suffixes = ('..', '.', '_', '_-AFTER_DEMO', '-AFTER_DEMO', '_DEMO', '-DEMO', '-NC1', '-NC2', '_CTO1', '-NC1-001',
                '-NC2-001', '-NC2-002', '-NC3-001', '-NC3-002', '-NNC-001', '-NNC-002', '-NNC-003', '-NNC-004',
                '-NNC-005', '-NNC-006', '-NNC-007', '-NNC-008', '-NNC-009', '-NNC-011')
    part_number = x.strip()
    for suffix in suffixes:
        if part_number.endswith(suffix):
            part_number = part_number[:len(part_number) - len(suffix)]
    return part_number

def get_product_name(row):
    """Return the product's display name, preferring the 'rusdescr' value.

    ``row['rusdescr']`` is used only when it is a non-blank string.  A missing
    attribute reaches this function as NaN (that is how pandas fills keys
    absent from some of the source dicts), not as None, so an ``is not None``
    test would hand NaN back as the product name.  Anything that is not a
    usable string falls back to ``row['name']``.
    """
    russian_name = row['rusdescr']
    if isinstance(russian_name, str) and russian_name.strip():
        return russian_name
    return row['name']

def get_out_of_trade(x):
    """Return True when ``x`` equals "X", otherwise False."""
    return bool(x == "X")

def get_warranty(x):
    """Translate a warranty description into a number of months.

    The value is converted to text, lower-cased, stripped, cleared of dots
    and has runs of spaces collapsed to one before it is looked up.
    Descriptions that are not in the table yield None.
    """
    months_by_description = {
        'нет гарантии': 0,
        '1 год': 12,
        '2 года': 24,
        '3 года': 36,
        '3 года - коммутатор, 1 год - трансиверы': 36,
        '4 года': 48,
        '5 лет': 60,
        '6 лет': 72,
        '7 лет': 84,
        '10 лет': 120,
        '1 месяц': 1,
        '3 месяца': 3,
        '6 месяцев': 6,
        '12 месяцев': 12,
        '18 месяцев': 18,
        '36 месяцев': 36,
    }
    description = str(x).lower().strip().replace('.', '')
    while '  ' in description:
        description = description.replace('  ', ' ')
    return months_by_description.get(description)

def get_min_of_quantity(x):
    """Return ``int(x)`` when ``x`` is exactly a ``str``; otherwise return 1."""
    if type(x) is str:
        return int(x)
    return 1

def get_traceable(x):
    """Return True when ``x`` equals the string '1', otherwise False."""
    return bool(x == '1')

def get_length(row):
    """Return the larger of the row's 'length_' and 'width_' values, divided by 100.

    Both values must be exactly ``str``; otherwise None is returned.
    (The division by 100 presumably converts centimetres to metres --
    TODO confirm against the data source.)
    """
    if type(row['length_']) is not str or type(row['width_']) is not str:
        return None
    return max(float(row['length_']), float(row['width_'])) / 100

def get_width(row):
    """Return the smaller of the row's 'length_' and 'width_' values, divided by 100.

    Both values must be exactly ``str``; otherwise None is returned.
    (The division by 100 presumably converts centimetres to metres --
    TODO confirm against the data source.)
    """
    if type(row['length_']) is not str or type(row['width_']) is not str:
        return None
    return min(float(row['length_']), float(row['width_'])) / 100

def get_height(row):
    """Return the row's 'height' value as a float divided by 100.

    None is returned when the value is not exactly a ``str``.
    """
    if type(row['height']) is not str:
        return None
    return float(row['height']) / 100

def get_volume(row):
    """Return length * width * height for the row.

    None is returned when the multiplication raises TypeError (for example
    when one of the dimensions is None).
    """
    try:
        footprint = row['length'] * row['width']
        volume = footprint * row['height']
    except TypeError:
        return None
    return volume

def get_unit_key(row):
    """Return 'м' when the row's 'multiplicity' is greater than 1, else 'шт'."""
    if row['multiplicity'] > 1:
        return 'м'
    return 'шт'

def get_promo(row):
    """Return True when the row's 'uchmark' equals the string "2", else False."""
    return bool(row['uchmark'] == "2")

# Build the products table from the raw catalogue rows in `df`.
# Start from a copy of just the source columns the steps below need.
products_df = df[['id', 'category_key', 'vendor-id', 'prid', 'articul', 'name', 'rusdescr', 'gp', 'outoftrade', 'multiplicity', 'width',
                  'length', 'height', 'brutto', 'istraceable', 'addinfo', 'uchmark']].copy()
# Derived columns.  Statement order matters: several helpers read columns that
# later statements overwrite, rename or drop.
products_df['metadata'] = products_df.apply(get_product_metadata, axis=1)
products_df['part_number'] = products_df['articul'].apply(get_part_number)
products_df['name'] = products_df.apply(get_product_name, axis=1)
products_df['eol'] = products_df['outoftrade'].apply(get_out_of_trade)
products_df['warranty'] = products_df['gp'].apply(get_warranty)
# 'multiplicity' is converted to an integer first; the unit key and both
# quantity columns are then derived from the converted value.
products_df['multiplicity'] = products_df['multiplicity'].apply(get_min_of_quantity)
products_df['unit_key'] = products_df.apply(get_unit_key, axis=1)
products_df['min_of_quantity'] = products_df['multiplicity']
products_df['step_of_quantity'] = products_df['multiplicity']
products_df['traceable'] = products_df['istraceable'].apply(get_traceable)
# Move the raw length/width values aside so the 'length' and 'width' names can
# hold the converted values: get_length stores the larger of the two raw
# values and get_width the smaller.
products_df = products_df.rename(columns={'length': 'length_', 'width': 'width_'})
products_df['length'] = products_df.apply(get_length, axis=1)
products_df['width'] = products_df.apply(get_width, axis=1)
# get_height reads the raw 'height' value and the result replaces it in place.
products_df['height'] = products_df.apply(get_height, axis=1)
# get_volume multiplies the converted length, width and height set just above.
products_df['volume'] = products_df.apply(get_volume, axis=1)
products_df['promo'] = products_df.apply(get_promo, axis=1)

# Rename the remaining source columns, then drop the raw inputs that are no
# longer needed.
products_df = products_df.rename(columns={'id': 'key',
                                          'vendor-id': 'vendor_key',
                                          'gp': 'warranty_description',
                                          'brutto': 'weight'})
products_df = products_df.drop(['articul', 'prid', 'addinfo', 'rusdescr', 'outoftrade', 'multiplicity', 'istraceable',
                                'length_', 'width_', 'uchmark'], axis=1)
products_df

In [None]:
# Barcodes: one GTIN row per product that actually carries a GTIN.
# Positions without a `gtin` attribute hold NaN in `df`, and NaN is truthy, so
# a plain truthiness test would emit barcode rows whose value is NaN.  Only
# non-empty strings are accepted here.
barcodes_data_list = [
    {'product_key': product_key, 'value': gtin}
    for product_key, gtin in zip(df['id'], df['gtin'])
    if isinstance(gtin, str) and gtin
]

# Naming the columns explicitly keeps them present even when no product has
# a GTIN at all.
barcodes_df = pd.DataFrame(barcodes_data_list, columns=['product_key', 'value'])
barcodes_df = barcodes_df.assign(form="GTIN")
barcodes_df

In [None]:
def get_unconditional(x):
    """Return True when the article ``x`` ends with one of the listed markers."""
    suffixes = ('_-AFTER_DEMO', '-AFTER_DEMO', '_DEMO', '-DEMO', '-NC1', '-NC2', '_CTO1', '-NC1-001',
                '-NC2-001', '-NC2-002', '-NC3-001', '-NC3-002', '-NNC-001', '-NNC-002', '-NNC-003', '-NNC-004',
                '-NNC-005', '-NNC-006', '-NNC-007', '-NNC-008', '-NNC-009', '-NNC-011')
    # str.endswith accepts a tuple and reports whether any member matches.
    return x.endswith(suffixes)

def get_price(x):
    """Convert a raw price to float; return NaN for falsy or sub-0.01 values."""
    if not x:
        return np.nan
    price = float(x)
    return np.nan if price < 0.01 else price

def get_quantity(x):
    """Convert a raw availability value into an integer quantity.

    * None or a float NaN        -> 0
    * text containing 'много'    -> 10
    * text containing '<'        -> half of the stated number, rounded down
    * anything else              -> the number left after removing '+', '>'
                                    and '*'
    """
    if x is None:
        return 0
    if type(x) is float and math.isnan(x):
        return 0
    if 'много' in x:
        return 10
    if '<' in x:
        return int(x.replace('<', '')) // 2

    # No '<' can be present past this point, so only the other markers
    # need to be removed.
    for marker in ('+', '>', '*'):
        x = x.replace(marker, '')
    return int(x)

def get_quantity_great_than(x):
    """Return True when the raw availability value contains '+', '>' or 'много'.

    None and float NaN yield False.
    """
    if x is None or (type(x) is float and math.isnan(x)):
        return False
    return any(marker in x for marker in ('+', '>', 'много'))

def get_party_metadata(row):
    """Serialize the row's 'prid', 'ntdate' and 'ntstatus' values as a JSON object string."""
    metadata = {field: row[field] for field in ('prid', 'ntdate', 'ntstatus')}
    return json.dumps(metadata)

# Build the "parties" table: the shared per-product pricing columns first,
# then one copy of them per location (warehouse stock and goods in transit).
parties_df = df[['id', 'prid', 'articul', 'price', 'currency', 'recommendedprice', 'recommendedcurrency', 'dprice',
                 'uchmark', 'sale', 'freenom', 'freeptrans', 'ntdate', 'ntstatus', 'multiplicity']].copy()

parties_df = parties_df.rename(columns={'id': 'product_key',
                                        'currency': 'currency_key_in',
                                        'recommendedcurrency': 'currency_key_out'})
parties_df['unconditional'] = parties_df['articul'].apply(get_unconditional)
# price_in is taken from 'dprice', price_out from 'recommendedprice'.
parties_df['price_in'] = parties_df['dprice'].apply(get_price)
parties_df['price_out'] = parties_df['recommendedprice'].apply(get_price)
parties_df['metadata'] = parties_df.apply(get_party_metadata, axis=1)
# 'multiplicity' must be converted before get_unit_key compares it with 1.
parties_df['multiplicity'] = parties_df['multiplicity'].apply(get_min_of_quantity)
parties_df['unit_key'] = parties_df.apply(get_unit_key, axis=1)

# Drop the raw inputs; 'freenom' and 'freeptrans' are kept for now because
# the per-location quantities below are derived from them.
parties_df = parties_df.drop(['articul', 'prid', 'price', 'recommendedprice', 'dprice', 'ntdate', 'ntstatus',
                              'uchmark', 'sale', 'multiplicity'], axis=1)

# Warehouse stock: quantities come from the 'freenom' column.
stock_df = parties_df.copy()
stock_df['quantity'] = stock_df['freenom'].apply(get_quantity)
stock_df['quantity_great_than'] = stock_df['freenom'].apply(get_quantity_great_than)
stock_df = stock_df.assign(location_key="Склад")

# Goods in transit: quantities come from the 'freeptrans' column.  The column
# is read from transit_df itself rather than from stock_df, so this section
# does not rely on the two frames sharing the same index.
transit_df = parties_df.copy()
transit_df['quantity'] = transit_df['freeptrans'].apply(get_quantity)
transit_df['quantity_great_than'] = transit_df['freeptrans'].apply(get_quantity_great_than)
transit_df = transit_df.assign(location_key="Транзит")

# Stack both locations, remove the raw availability columns and keep only
# rows whose quantity is above 0.1.
parties_df = pd.concat([stock_df, transit_df])
parties_df = parties_df.drop(['freenom', 'freeptrans'], axis=1)
parties_df = parties_df[parties_df['quantity'] > 0.1]

parties_df

## POST Получить информацию о продукте

In [None]:
# Parse the saved product-content response.  The document is read inside the
# `with` block so the file handle is closed as soon as parsing is done,
# instead of staying open for the lifetime of the kernel.
with open('/home/abezpalov/data/treolan/0cf122fa-898a-4bf6-9c68-6b6f315eabde_content_data.xml', 'rb') as file:
    tree = lxml.html.parse(file)

In [None]:
# Property groups: one row per <propertygroup> element, one column per XML
# attribute.  Each element is added exactly once; appending inside the
# attribute loop would repeat the same row once per attribute.
data_list = list()
for item in tree.xpath('.//propertygroup'):
    row_ = dict()
    for key in item.keys():
        row_[key] = item.get(key)
    data_list.append(row_)

groups_df = pd.DataFrame(data_list)

groups_df = groups_df.rename(columns={'id': 'key', 'sort': 'order'})
groups_df = groups_df.drop(['level'], axis=1)

groups_df

In [None]:
# Properties: one row per <property> element, carrying the id of the
# <propertygroup> it sits under plus all of its own XML attributes.
data_list = [
    {'group_key': group_item.get('id'), **dict(item.items())}
    for group_item in tree.xpath('.//propertygroup')
    for item in group_item.xpath('./property')
]

df = pd.DataFrame(data_list)

# Parameter definitions: the identifying columns under their target names.
parameters_df = (
    df[['group_key', 'id', 'name', 'sort']]
    .rename(columns={'id': 'key', 'sort': 'order'})
)
parameters_df

In [None]:
# Parameter values: the value of every property, keyed by its group and by
# the property id (exposed as `parameter_key`).
values_df = (
    df[['group_key', 'id', 'value']]
    .rename(columns={'id': 'parameter_key'})
)

values_df

In [None]:
def get_image_order(x):
    """Return 0 when ``x`` equals the string '1', otherwise 1."""
    if x == '1':
        return 0
    return 1

# Images: one row per <row> element under <picturelink>, one column per XML
# attribute.
data_list = [dict(item.items()) for item in tree.xpath('.//picturelink/row')]

images_df = pd.DataFrame(data_list)
# The sort order is derived from the 'main' attribute via get_image_order
# before 'main' itself is dropped.
images_df['order'] = images_df['main'].apply(get_image_order)
images_df = (
    images_df
    .rename(columns={'link': 'source_url'})
    .drop(columns=['imagesize', 'imagetype', 'isoldimage', 'main'])
)

images_df

In [None]:
# Vendor
# Take the first `webaddress` attribute found on a <row> under
# <vendorwebaddress>; the [0] raises IndexError when the document has none.

vendor_url = tree.xpath('.//vendorwebaddress/row/@webaddress')[0]
vendor_url