In [None]:
import warnings
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import polars as pl
import glob
from pathlib import Path
from tqdm import tqdm
import math
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:

# Paths
orders_path = "./data/ml_ozon_recsys_train_final_apparel_orders_data/*.parquet"
tracker_path = "./data/ml_ozon_recsys_train_final_apparel_tracker_data/*.parquet"
output_dir = Path("./data/train_data")
# parents=True: also create ./data when it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

# Lazy view of the orders, reduced to ONE row per (user_id, item_id).
#
# The tracker events are left-joined against this frame further down. A left
# join emits one output row per matching right-hand row, so a pair that occurs
# in several order rows would duplicate every tracker event of that pair and
# inflate the per-action counts. Collapsing the orders first prevents that.
#
# A pair is labelled "delivered_orders" when at least one of its orders was
# delivered and "canceled_orders" otherwise; `encode_actions` maps every
# non-delivered status to 0, so the encoded label is unaffected by the collapse.
#
# NOTE(review): the plan stays lazy, so it is re-evaluated for every tracker
# chunk; if the orders fit in memory, materialising it once would be cheaper.
df_orders = (
    pl.scan_parquet(orders_path)
    .select(["user_id", "item_id", "last_status"])
    .with_columns([
        pl.col("user_id").cast(pl.Int64),
        pl.col("item_id").cast(pl.Int64),
    ])
    # NOTE(review): status literal kept byte-for-byte; confirm its spelling
    # matches the values stored in the dataset.
    .filter(pl.col("last_status") != "proccesed_orders")
    .group_by(["user_id", "item_id"])
    .agg(
        (pl.col("last_status") == "delivered_orders").any().alias("_any_delivered")
    )
    .with_columns(
        pl.when(pl.col("_any_delivered"))
        .then(pl.lit("delivered_orders"))
        .otherwise(pl.lit("canceled_orders"))
        .alias("last_status")
    )
    .drop("_any_delivered")
)


tracker_files = sorted(glob.glob(tracker_path))
chunk_size = 1  # number of tracker parquet files processed per loop iteration

def encode_actions(df):
    """Expand ``action_type`` into indicator columns and binarise ``last_status``.

    For every known action name an ``action_type_<name>`` column is added that
    holds the result of ``action_type == <name>`` cast to Int32.
    ``last_status`` is replaced by an Int8 flag: 1 for ``"delivered_orders"``,
    0 for ``"canceled_orders"`` and for everything else. The ``action_type``
    column itself is dropped from the result.
    """
    action_names = (
        "view_description",
        "to_cart",
        "page_view",
        "favorite",
        "unfavorite",
        "review_view",
        "remove",
    )

    # One equality indicator per action, in the order listed above.
    indicator_exprs = [
        (pl.col("action_type") == name).cast(pl.Int32).alias(f"action_type_{name}")
        for name in action_names
    ]

    status = pl.col("last_status")
    delivered_flag = (
        pl.when(status == "canceled_orders").then(0)
        .when(status == "delivered_orders").then(1)
        .otherwise(0)
        .cast(pl.Int8)
        .alias("last_status")
    )

    encoded = df.with_columns(indicator_exprs + [delivered_flag])
    return encoded.drop("action_type")

# Aggregate the tracker events chunk by chunk: each iteration reads
# `chunk_size` tracker files, attaches the order status, counts the actions
# per (user_id, item_id) and writes one parquet file with the result.
action_count_cols = [
    "action_type_view_description",
    "action_type_to_cart",
    "action_type_page_view",
    "action_type_favorite",
    "action_type_unfavorite",
    "action_type_review_view",
    "action_type_remove",
]
# Loop-invariant: total number of chunks (ceiling division).
n_chunks = (len(tracker_files) + chunk_size - 1) // chunk_size

for chunk_idx, start in enumerate(range(0, len(tracker_files), chunk_size)):
    files_chunk = tracker_files[start:start + chunk_size]
    print(f"Processing chunk {chunk_idx + 1}/{n_chunks}")

    chunk_df = (pl.scan_parquet(files_chunk)
                .with_columns([
                    pl.col("user_id").cast(pl.Int64),
                    pl.col("item_id").cast(pl.Int64),
                ]))

    # Left join keeps every tracker event, with or without a matching order.
    temp = chunk_df.join(df_orders, on=["item_id", "user_id"], how="left")
    temp = encode_actions(temp)

    temp_agg = temp.group_by(["user_id", "item_id"]).agg([
        pl.col(action_count_cols).sum(),
        pl.col("last_status").first(),
    ])

    output_file = output_dir / f"agg_chunk_{chunk_idx}.parquet"
    # Write straight from the lazy plan (same call the later cells use) instead
    # of `collect(streaming=True)` followed by `write_parquet`, which builds the
    # whole aggregated chunk in memory first.
    temp_agg.sink_parquet(output_file)
    print(f"Saved {output_file}")


# Lazy plan that merges the per-chunk aggregates into one row per
# (user_id, item_id). Nothing is executed here: `final_result` is only a query.
_chunk_glob = str(output_dir / "*.parquet")
all_chunks = pl.scan_parquet(_chunk_glob)

_pair_keys = ["user_id", "item_id"]
_count_cols = [
    "action_type_view_description",
    "action_type_to_cart",
    "action_type_page_view",
    "action_type_favorite",
    "action_type_unfavorite",
    "action_type_review_view",
    "action_type_remove",
]

final_result = all_chunks.group_by(_pair_keys).agg(
    [
        pl.col(_count_cols).sum(),         # add up the per-chunk action counts
        pl.col("last_status").first(),     # keep the first status seen for the pair
    ]
)



Processing chunk 1/200


  temp_agg.collect(streaming=True).write_parquet(output_file)


Saved data\train_data\agg_chunk_0.parquet
Processing chunk 2/200
Saved data\train_data\agg_chunk_1.parquet
Processing chunk 3/200
Saved data\train_data\agg_chunk_2.parquet
Processing chunk 4/200
Saved data\train_data\agg_chunk_3.parquet
Processing chunk 5/200
Saved data\train_data\agg_chunk_4.parquet
Processing chunk 6/200
Saved data\train_data\agg_chunk_5.parquet
Processing chunk 7/200
Saved data\train_data\agg_chunk_6.parquet
Processing chunk 8/200
Saved data\train_data\agg_chunk_7.parquet
Processing chunk 9/200
Saved data\train_data\agg_chunk_8.parquet
Processing chunk 10/200
Saved data\train_data\agg_chunk_9.parquet
Processing chunk 11/200
Saved data\train_data\agg_chunk_10.parquet
Processing chunk 12/200
Saved data\train_data\agg_chunk_11.parquet
Processing chunk 13/200
Saved data\train_data\agg_chunk_12.parquet
Processing chunk 14/200
Saved data\train_data\agg_chunk_13.parquet
Processing chunk 15/200
Saved data\train_data\agg_chunk_14.parquet
Processing chunk 16/200
Saved data\tr

: 

In [None]:
# Merge all per-chunk aggregates into one table and split it into at most
# `n_files` parquet files of (almost) equal row count.
chunks_dir = Path("./data/train_data")
output_dir = Path("./data/train_matrix_dir")
# parents=True: also create ./data when it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

all_files = sorted(glob.glob(str(chunks_dir / "*.parquet")))

all_data = pl.scan_parquet(all_files)

agg_data = (
    all_data
    .group_by(["user_id","item_id"])
    .agg([
        pl.col([
            "action_type_view_description",
            "action_type_to_cart",
            "action_type_page_view",
            "action_type_favorite",
            "action_type_unfavorite",
            "action_type_review_view",
            "action_type_remove"
        ]).sum(),
        pl.col("last_status").first()
    ])
)

# NOTE(review): newer Polars releases may expect `collect(engine="streaming")`
# here - confirm against the installed version.
df_collected = agg_data.collect(streaming=True)

n_files = 20
total_rows = df_collected.height
rows_per_file = math.ceil(total_rows / n_files)

for i in range(n_files):
    start = i * rows_per_file
    # Because of the ceiling division the rows can run out before `n_files`
    # slices were produced (always the case for an empty table). Stop then,
    # instead of writing empty files and printing ranges whose start exceeds
    # their end.
    if start >= total_rows:
        break
    end = min(start + rows_per_file, total_rows)
    df_chunk = df_collected[start:end]
    df_chunk.write_parquet(output_dir / f"agg_10_{i+1}.parquet")
    print(f"Saved file {i+1} with rows {start}-{end}")

In [None]:
# Regroup the parquet files into a more compact form: every `n` input files
# are re-aggregated per (user_id, item_id) and written out as one file.

# NOTE(review): no cell visible in this notebook writes ./data/grouped3 -
# confirm where this directory comes from.
chunks_dir = Path("./data/grouped3")

all_files = sorted(glob.glob(str(chunks_dir / "*.parquet")))
if not all_files:
    # Without this check the loop below runs zero times and the cell appears
    # to succeed although nothing was written.
    raise FileNotFoundError(f"No parquet files found in {chunks_dir}")

out_path = Path("./data/grouped4")
# parents=True: also create ./data when it does not exist yet.
out_path.mkdir(parents=True, exist_ok=True)
n = 2  # number of input files merged into each output file

for i in tqdm(range(0, len(all_files), n)):
    batch = all_files[i:i+n]

    # Scan all parquet files of the batch at once as a single lazy frame.
    df = pl.scan_parquet(batch)
    agg_data = (
        df
        .group_by(["user_id","item_id"])
        .agg([
            pl.col([
                "action_type_view_description",
                "action_type_to_cart",
                "action_type_page_view",
                "action_type_favorite",
                "action_type_unfavorite",
                "action_type_review_view",
                "action_type_remove"
            ]).sum(),
            pl.col("last_status").first()
        ])
    )
    out_file = out_path / f"grouped_batch_{i//n + 1}.parquet"
    # The query is executed here and written directly to `out_file`.
    agg_data.sink_parquet(out_file)

100%|██████████| 5/5 [15:27<00:00, 185.53s/it]


: 

In [6]:
# "page_view" "to_cart" "review_view" "unfavorite" "remove"    "favorite" "view_description"
#     1           10         5           -10           -5          8              3
#   0.361511  2.101614  0.022923       0.077276    -1.448458     -0.024284      -0.004144

# Weight of each aggregated action-count column in the rating. Insertion order
# matters: `cols` and `actions_coefs` are both derived from this mapping, so
# they stay aligned position by position.
_action_weights = {
    "action_type_page_view": 0.361511,
    "action_type_to_cart": 2.101614,
    "action_type_review_view": 0.022923,
    "action_type_unfavorite": 0.077276,
    "action_type_remove": -1.448458,
    "action_type_favorite": -0.024284,
    "action_type_view_description": -0.004144,
}
cols = list(_action_weights)
actions_coefs = np.array(list(_action_weights.values()))

In [26]:
def gen_matrix_R(train_data_path, output_path, actions_coefs, columns, n_parquets = 10):
    """Turn aggregated action counts into a single ``rating`` column per file.

    Every ``*.parquet`` file found in ``train_data_path`` is scanned lazily, a
    Float32 ``rating`` column equal to ``sum(coef * count_column)`` is added,
    the count columns and ``last_status`` are dropped, and the result is
    written to ``output_path`` as ``1.parquet``, ``2.parquet``, ... following
    the sorted order of the input files.

    Parameters
    ----------
    train_data_path : str or Path
        Directory holding the aggregated parquet files.
    output_path : str or Path
        Directory the rating files are written to; created if missing.
    actions_coefs : sequence of numbers
        One weight per entry of ``columns``, in the same order.
    columns : sequence of str
        Names of the count columns combined into the rating.
    n_parquets : int, optional
        Not used by the function; kept so existing calls keep working.

    Raises
    ------
    ValueError
        If ``columns`` is empty or its length differs from ``actions_coefs``.
    """
    columns = list(columns)
    coefs = [float(coef) for coef in actions_coefs]
    # zip() would silently ignore surplus entries and yield a wrong rating,
    # so reject mismatched inputs before anything is read or written.
    if len(coefs) != len(columns):
        raise ValueError(
            f"actions_coefs has {len(coefs)} entries but columns has {len(columns)}"
        )
    if not columns:
        raise ValueError("columns must contain at least one column name")

    all_files = sorted(glob.glob(str(Path(train_data_path) / "*.parquet")))
    print(all_files)
    out_dir = Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Weighted sum of the count columns, built once and reused for every file.
    rating = sum(pl.col(col) * coef for coef, col in zip(coefs, columns))

    # Output files are numbered from 1 in the order of the sorted inputs.
    for file_no, file in enumerate(tqdm(all_files), start=1):
        data = (
            pl.scan_parquet(file)
            .with_columns([rating.cast(pl.Float32).alias("rating")])
            .drop(columns)
            .drop("last_status")
        )
        data.sink_parquet(out_dir / f"{file_no}.parquet")


In [27]:
# Forward slashes keep the paths portable; the previous backslash literals
# relied on invalid escape sequences ("\d", "\g", "\m") and only formed a
# directory path on Windows.
gen_matrix_R(Path("./data/grouped4"), Path("./data/matrix_R_coord"), actions_coefs, cols)

['data\\grouped4\\grouped_batch_1.parquet', 'data\\grouped4\\grouped_batch_2.parquet', 'data\\grouped4\\grouped_batch_3.parquet', 'data\\grouped4\\grouped_batch_4.parquet', 'data\\grouped4\\grouped_batch_5.parquet']


100%|██████████| 5/5 [01:09<00:00, 13.81s/it]
