In [2]:
import requests
import json
import time
from lxml import html
from pprint import pprint
import pandas as pd
import os
import arff
import numpy as np

In [3]:
from urllib.parse import urljoin


def get_links(url, number_of_pages, timeout=30, delay=1):
    """Collect product page URLs from a paginated catalog listing.

    Requests ``{url}?page=N`` for N in 1..number_of_pages, looks up every
    product card on the page and resolves the card's photo link against the
    shop's base URL.

    Args:
        url: Listing URL without the ``page`` query parameter.
        number_of_pages: How many listing pages to walk, starting at page 1.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause after each page request.

    Returns:
        Sorted list of unique absolute product URLs.
    """
    product_links = set()
    # The context manager closes the session (and its pooled connections)
    # even when a request raises.
    with requests.Session() as session:
        for page in range(1, number_of_pages + 1):
            # Without a timeout a stalled server would hang the cell forever.
            response = session.get(f"{url}?page={page}", timeout=timeout)
            # Error responses and empty bodies are skipped instead of parsed:
            # an empty document makes the HTML parser raise.
            if response.ok and response.content:
                tree = html.fromstring(response.content)
                cards = tree.xpath('//div[contains(@class, "catalog-2-level-product-card")]')
                for card in cards:
                    hrefs = card.xpath('.//a[contains(@class, "product-card-photo__link")]/@href')
                    if hrefs:
                        product_links.add(urljoin('https://online.metro-cc.ru', hrefs[0]))
            time.sleep(delay)
    # A set has no stable iteration order; sorting keeps repeated runs identical.
    return sorted(product_links)


In [4]:
os.makedirs('images', exist_ok=True)
def parse_parametres(chockolate_url, timeout=30):
    """Scrape one product page into a flat dict and save its photo.

    Args:
        chockolate_url: Absolute URL of the product page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        Dict with 'url', 'article', 'name' and 'price' followed by one entry
        per attribute listed on the page, or None when the page cannot be
        fetched or lacks an article, a name or a price.

    Side effects:
        Writes the product photo to ``images/<article>.jpg`` when the page
        exposes an image URL and the download succeeds.
    """
    # The context manager closes the session even if a request raises.
    with requests.Session() as session:
        response = session.get(chockolate_url, timeout=timeout)
        if not response.ok or not response.content:
            return None
        tree = html.fromstring(response.content)

        article_nodes = tree.xpath('//p[@class="product-page-content__article"]/text()')
        name_nodes = tree.xpath('//meta[@itemprop="name"]/@content')
        price_elements = tree.xpath('//meta[@itemprop="price"]/@content')
        # Any missing core field makes the record unusable; report it the same
        # way a missing price was already reported instead of raising IndexError.
        if not (article_nodes and name_nodes and price_elements):
            return None

        data = {
            'url': chockolate_url,
            'article': article_nodes[0].replace('Артикул:', '').strip(),
            'name': name_nodes[0].strip(),
            'price': price_elements[0].strip(),
        }

        image_nodes = tree.xpath('//meta[@itemprop="image"]/@content')
        if image_nodes:
            image_response = session.get(image_nodes[0].strip(), timeout=timeout)
            # Only store real image payloads, not the body of an error response.
            if image_response.ok:
                # The article text comes from the remote page; keep only
                # characters that cannot change the target directory.
                safe_article = ''.join(
                    ch if ch.isalnum() or ch in '-_' else '_' for ch in data['article']
                )
                # Re-created here so the function also works when called
                # from a different working directory than the cell above.
                os.makedirs('images', exist_ok=True)
                with open(f'images/{safe_article}.jpg', 'wb') as f:
                    f.write(image_response.content)

    attribute_items = tree.xpath('//li[contains(@class, "product-attributes__list-item")]')
    for item in attribute_items:
        name_element = item.xpath('.//span[contains(@class, "product-attributes__list-item-name-text")]//text()')
        value_element = item.xpath('.//span[contains(@class, "product-attributes__list-item-value")]//text()')

        # Attributes with an empty name or value carry no information.
        if name_element and value_element:
            key = ' '.join(name_element).strip()
            value = ' '.join(value_element).strip()
            data[key] = value
    return data

In [66]:
# Walk the first 10 listing pages of the chocolate bar category and collect
# the product page URLs found there.
links = get_links(
    url="https://online.metro-cc.ru/category/sladosti_/shokolad-batonchiki",
    number_of_pages=10,
)

In [None]:
# Scrape every collected product page and persist the raw table.
all_data = []
for link in links:
    data = parse_parametres(link)
    # parse_parametres returns None for products it cannot describe (e.g. no
    # price on the page); a None among the dict records would break the
    # DataFrame construction below, so such products are skipped.
    if data is None:
        continue
    all_data.append(data)

df = pd.DataFrame(all_data)
df.to_csv('data_extra.tsv', index=False, sep='\t', encoding='utf-8')

In [5]:
# Reload the raw scrape and strip the service columns that are not used as
# features; columns absent from the file are simply ignored.
df = pd.read_csv('data_extra.tsv', sep='\t')
cols_to_drop = [
    label
    for label in ('url', 'article', 'Энергетическая ценность, ккал')
    if label in df
]
df = df.drop(columns=cols_to_drop)
df.to_csv(path_or_buf='data.tsv', sep='\t', index=False, encoding='utf-8')

In [6]:
# Declared type of every feature column. A Python type marks a scalar
# feature; an empty list marks a categorical feature whose allowed values are
# filled in from the data below.
data_types = {
    'name': str,
    'price': float,
    'Бренд': [],
    'Страна-производитель': [],
    'Тип': [],
    'Вид': [],
    'Без сахара': bool,
    'Тип упаковки': [],
    'Начинка': [],
    'Количество штук в упаковке': int,
    'Белки, г': float,
    'Жиры, г': float,
    'Углеводы, г': float,
    'Ширина упаковки, см': float,
    'Высота упаковки, см': float,
    'Длина упаковки, см': float,
    'Вес, объем': int,
}
# Only values of existing keys are replaced, so the dict does not change size
# while it is being iterated.
for name, type_name in data_types.items():
    if name == 'Начинка' or name == 'Вид':
        # These cells hold several categories joined with '; '.
        final_list = set()
        for col in df[name].dropna():
            final_list.update(str(col).split('; '))
        # Sorted because set iteration order for strings differs between
        # interpreter runs, which made the exported headers non-reproducible.
        data_types[name] = sorted(final_list)
    elif type_name == []:
        data_types[name] = list(df[name].dropna().unique())

pprint(data_types)

{'name': <class 'str'>,
 'price': <class 'float'>,
 'Без сахара': <class 'bool'>,
 'Белки, г': <class 'float'>,
 'Бренд': ['ALPEN GOLD',
           'MALTESERS',
           'RITTER SPORT',
           'KINDER SURPRISE',
           'ПОБЕДА ВКУСА',
           'MILKA',
           'MARS',
           'STROBAR',
           'БАБАЕВСКИЙ',
           'HAMLET',
           'N NATURE',
           'RIOBA',
           'SNICKERS',
           'MILKY WAY',
           'АЛЕНКА',
           'ULKER',
           'VILLARS',
           'KINDER MAXI',
           'LINDT',
           'RED',
           'NUTS',
           'MERCI',
           'TWIX',
           'АПРИОРИ',
           'BUCHERON',
           'DOVE',
           'KINDER',
           'АККОНД',
           'BOUNTY',
           'СТЕП',
           'KINDER BUENO',
           'ВОЗДУШНЫЙ',
           'L.O.L.',
           'SCHOGETTEN',
           'MR.CHOKKY',
           'BELGIAN',
           'РОССИЯ ЩЕДРАЯ ДУША',
           'TOBLERONE',
           'KITKAT',
      

In [7]:
def create_json_data(data_types, df):
    """Build a JSON-ready description of the table: a typed header plus rows.

    Args:
        data_types: Mapping of column name to either a Python type or a list
            of allowed category values.
        df: Table holding (at least) every column named in ``data_types``.

    Returns:
        Dict with two keys: "header", a list of per-feature descriptions, and
        "data", a list of row dicts in which missing values are None.
    """
    scalar_labels = ((str, "text"), (bool, "boolean"), (int, "integer"))

    def describe(feature, kind):
        # The two multi-valued columns are recognised by name, not by type.
        entry = {"feature_name": feature}
        if feature in ('Начинка', 'Вид'):
            entry["type"] = "multicategory"
            entry["values"] = kind
            return entry
        if type(kind) is list:
            entry["type"] = "category"
            entry["values"] = kind
            return entry
        # Anything that is not str/bool/int falls back to "numeric".
        entry["type"] = next(
            (label for py_type, label in scalar_labels if kind == py_type),
            "numeric",
        )
        return entry

    def clean(cell):
        # Missing values become None so they serialise as JSON null.
        return None if pd.isna(cell) else cell

    header = [describe(feature, kind) for feature, kind in data_types.items()]
    data = [
        {feature: clean(row[feature]) for feature in data_types}
        for _, row in df.iterrows()
    ]
    return {"header": header, "data": data}

# Serialise the typed table; ensure_ascii=False keeps the Cyrillic text readable.
j = create_json_data(data_types, df)
with open("data.json", mode="w", encoding='UTF-8') as json_file:
    json.dump(j, json_file, ensure_ascii=False, indent=3)

In [8]:
def create_arff_data(data_types, df):
    """Build the dict structure that ``arff.dump`` serialises.

    Args:
        data_types: Mapping of column name to either a Python type or a list
            of allowed category values. Its key order defines the attribute
            order of the ARFF file.
        df: Table with the feature columns.

    Returns:
        Dict with "relation", "attributes" (list of ``(name, type)`` tuples)
        and "data" (list of rows whose values follow the attribute order).
    """
    attribute_names = list(data_types)
    arff_data = {
        "relation": "data",
        "attributes": [],
        # Rows must line up with the attribute list one-to-one. Taking
        # df.values directly used the frame's own column order and count,
        # which misaligns values whenever the frame is ordered differently or
        # holds extra columns. reindex selects exactly the declared columns in
        # the declared order; a declared column missing from df becomes NaN.
        "data": df.reindex(columns=attribute_names).values.tolist()
    }
    for name, type_name in data_types.items():
        if type_name == str:
            arff_type = "STRING"
        elif type_name == float or type_name == int:
            arff_type = "NUMERIC"
        elif type_name == bool:
            # Boolean features are stored as yes/no labels in the table.
            arff_type = ['Да', 'Нет']
        else:
            # Category lists are passed through as the nominal value set.
            arff_type = type_name
        arff_data["attributes"].append((name, arff_type))
    return arff_data

# Export the same table in ARFF format.
a = create_arff_data(data_types, df)
with open('data.arff', mode='w', encoding='UTF-8') as arff_file:
    arff.dump(a, arff_file)

In [9]:
# Reload the cleaned table and report which columns still contain gaps.
df = pd.read_csv('data.tsv', sep='\t')
columns_with_gaps = df.columns[df.isna().any().to_numpy()].tolist()
print("Пропущенные значения в колонках: ", columns_with_gaps)


Пропущенные значения в колонках:  ['Бренд', 'Вес, объем', 'Начинка', 'Длина упаковки, см']


In [10]:
def fill_empty_values(frame=None, seed=42):
    """Fill the known gaps of the product table in place.

    * 'Длина упаковки, см': missing values of single-item packs are drawn
      from a normal distribution (mean 0.8, std 0.1); the column is then
      rounded to 2 decimals. Gaps of multi-item packs are left untouched.
    * 'Вес, объем': missing values are replaced by the column mean, then the
      column is rounded and cast to int.
    * 'Начинка': missing values become 'Шоколад'.

    Args:
        frame: Table to modify. Defaults to the notebook-level ``df``.
        seed: Seed for the random draws, so repeated runs produce the same
            imputed values. Pass None for non-deterministic draws.

    Returns:
        None; the table is modified in place.
    """
    target = df if frame is None else frame
    # A seeded local generator keeps the exported data reproducible without
    # touching NumPy's global random state.
    rng = np.random.default_rng(seed)

    length_col = 'Длина упаковки, см'
    mask = (target['Количество штук в упаковке'] == 1) & (target[length_col].isna())
    target.loc[mask, length_col] = rng.normal(0.8, 0.1, int(mask.sum()))
    target[length_col] = target[length_col].round(2)

    weight_col = 'Вес, объем'
    target[weight_col] = target[weight_col].fillna(target[weight_col].mean())
    target[weight_col] = target[weight_col].round().astype(int)

    target['Начинка'] = target['Начинка'].fillna('Шоколад')

In [11]:
def process_data(frame=None, type_map=None):
    """Return a model-ready copy of the table.

    Numeric columns (declared as int or float) are coerced to numbers and
    min-max scaled to [0, 1]. Categorical columns (declared as a list of
    values) are replaced by one indicator column per category, named
    ``"<column>;<category>"``; cells may hold several categories joined with
    '; '. Every other column is copied unchanged.

    Args:
        frame: Source table. Defaults to the notebook-level ``df``.
        type_map: Mapping of column name to declared type. Defaults to the
            notebook-level ``data_types``.

    Returns:
        A new DataFrame; the source table is not modified.
    """
    source = df if frame is None else frame
    kinds = data_types if type_map is None else type_map

    df_new = source.copy()
    for col in source.columns:
        # Columns without a declared type are passed through instead of
        # raising KeyError.
        kind = kinds.get(col)
        if kind == int or kind == float:
            numeric = pd.to_numeric(df_new[col], errors='coerce')
            low = numeric.min()
            span = numeric.max() - low
            shifted = numeric - low
            # A constant column has zero span; dividing would turn every
            # value into NaN, so it is mapped to 0.0 instead.
            df_new[col] = shifted / span if span != 0 else shifted.astype(float)
        elif type(kind) is list:
            # Missing cells become '' so they produce no indicator at all.
            dummies = df_new[col].fillna('').str.get_dummies(sep='; ')
            dummies.columns = [f"{col};{category}" for category in dummies.columns]
            df_new = pd.concat([df_new.drop(columns=[col]), dummies], axis=1)
    return df_new

In [12]:
# Order matters: gaps are filled in df first, then the filled table is
# scaled / one-hot encoded and written out.
fill_empty_values()
df_final = process_data()
df_final.to_csv(path_or_buf='data.csv', index=False)