In [1]:
import pandas
import numpy as np
import sklearn as sk

from sklearn.preprocessing import OneHotEncoder
import time

%matplotlib inline

In [2]:
def log_progress(sequence, every=10):
    from ipywidgets import IntProgress
    from IPython.display import display

    progress = IntProgress(min=0, max=len(sequence), value=0)
    display(progress)
    
    for index, record in enumerate(sequence):
        if index % every == 0:
            progress.value = index
        yield record

# Препроцессинг

Изначальный датасет очень большой и некоторые операции не возможны на моем ноутбуке в связи с нехваткой оперативной памяти, поэтому поделим датасет на `n` маленьких

In [4]:
data = pandas.read_csv("train.csv")
data.head()

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
1,714,915655,34,1,4
2,316,262696,42,1,2
3,52,354280,4,1,10
4,581,218912,14,1,10


In [4]:
n_batches = 5
users = data["user_id"].unique()
users_batches = (users[i::n_batches] for i in range(n_batches))

for i, batch in enumerate(users_batches):
    i+=1
    print("batch %s from %s" % (i, n_batches))
    data[data["user_id"].isin(batch)].to_csv("data/raw/data_%s.csv" % i, index=False)

batch 1 from 5
batch 2 from 5
batch 3 from 5
batch 4 from 5
batch 5 from 5


# Пейплайн предобработки датасета

### Подготовка и обучение пейплайна

Считаем один блок данных, и протестируем на нем пейплайн.

In [2]:
dataset_name = "data/raw/data_1.csv"
data = pandas.read_csv(dataset_name)
data.head()

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
1,590,1029729,63,1,9
2,703,861305,16,1,9
3,803,733016,40,1,7
4,219,448027,1,1,6


### OneHotEncoder
Закодируем все просмотры пользователя в некоторый вектор интересов, для этого обучим OHE на категориях и подкатегориях товаров.

In [3]:
ohe = OneHotEncoder()

ohe.fit(data[["id1","id2"]])

id1_columns = ["id1_%s" % i for i in range(data["id1"].unique().shape[0])]
id2_columns = ["id2_%s" % i for i in range(data["id2"].unique().shape[0])] 

x_column_names = ["user_id"] + id1_columns + id2_columns
column_names = x_column_names + ["target"]

### Векторизация просмотров пользователя

Это функция для получения веткора интересов пользователя.
По своей сути вектор отображает вероятность, что человек заинтересуется определенной темой, а так же вероятность того, что он заинтересуется подтемой определенной темы

In [4]:
def vectorize_user(log):
    t = time.time()
    
    # Преобразуем все id в one hot
    values = ohe.transform(log[["id1", "id2"]])
    
    t = time.time()
    #суммируем все вектора и получаем вектор истории просмотров
    values = np.array(values.sum(axis=0))[0].astype("float")
    
    t = time.time()
    values[17:] = values[17:]/values[17:].sum()
    values[:17] = values[:17]/values[:17].sum()
    
    user_id = np.array([log["user_id"].get_values()[0]])
    vector = np.concatenate([user_id, values])
    
    user_vector = pandas.DataFrame(columns=x_column_names, data=[vector])
    
        
    return user_vector
    

Пример векторизации интересов пользователя:

In [5]:
data_ = data[data["user_id"] == 464300]
vectorize_user(data_)

Unnamed: 0,user_id,id1_0,id1_1,id1_2,id1_3,id1_4,id1_5,id1_6,id1_7,id1_8,...,id2_112,id2_113,id2_114,id2_115,id2_116,id2_117,id2_118,id2_119,id2_120,id2_121
0,464300.0,0.0,0.010563,0.193662,0.035211,0.105634,0.003521,0.021127,0.03169,0.267606,...,0.0,0.0,0.0,0.0,0.007042,0.024648,0.003521,0.0,0.056338,0.0


### Получение target значений

В качестве таргет значений мы будем предсказывать товары которые купит человек на следующей неделе, основываясь просмотрах товаров в прошлом.

Эта функция позволяет разделить данные на два блока: блок для векторизации истории, блок для получения таргет значений.
Если нет информации о следующей неделе, то возвращается _None_.

In [36]:
def split_x_y(log):
    split_date = log["date"].max()-7
    x_log, y_log = log[log["date"]<split_date], log[log["date"]>split_date]
    
    if not x_log.shape[0] or not y_log.shape[0]: return None, None
    
    return x_log, y_log

In [7]:
x_log, y_log = split_x_y(data_)
x_log.head()

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
6,806,464300,108,1,4
42,335,464300,96,1,8
51,33,464300,70,1,15
133,875,464300,4,1,10


In [8]:
y_log.head()

Unnamed: 0,id3,user_id,id2,date,id1
8873756,704,464300,34,51,4
8878447,409,464300,11,51,3
8885764,528,464300,19,51,8
8890311,545,464300,4,51,10
8908166,13,464300,120,51,2


Функция позволяет получить target значения, id товаров, которые купит человек.

In [9]:
def get_target_values(log):
    return log["id3"].unique()

In [10]:
get_target_values(split_x_y(data_)[1])

array([704, 409, 528, 545,  13,  29, 495, 171, 601, 217,  37, 327, 113,
        31, 575, 134, 599, 506, 451, 122, 204, 602, 241, 336, 324, 841])

### Получение датасетадля пользователя

Функция позволяет получить итоговый датасет для одного пользователя

In [11]:
def process_user_log(log):
    t = time.time()
    
    x_log, y_log = split_x_y(log)
    if x_log is None or y_log is None: return None
    
    user_vector = vectorize_user(x_log)
    target_values = get_target_values(y_log)
    
    user_data = pandas.concat([user_vector for i in range(target_values.size)])
    user_data["target"] = target_values
    
    return user_data

In [12]:
process_user_log(data_).head()

Unnamed: 0,user_id,id1_0,id1_1,id1_2,id1_3,id1_4,id1_5,id1_6,id1_7,id1_8,...,id2_113,id2_114,id2_115,id2_116,id2_117,id2_118,id2_119,id2_120,id2_121,target
0,464300.0,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,704
0,464300.0,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,409
0,464300.0,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,528
0,464300.0,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,545
0,464300.0,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,13


In [43]:
def get_dataset(log):
    start_time = time.time()
    
    users = log["user_id"].unique()
    
    dataset = pandas.DataFrame(columns=column_names)
    for user in users:
        user_dataset = process_user_log(log[log["user_id"] == user])
        if user_dataset is None:
            continue
        dataset = pandas.concat([dataset, user_dataset])
    
    dataset["user_id"] = dataset["user_id"].astype("int")
    print("Result time:", time.time()-start_time)
    return dataset

In [44]:
get_dataset(data[data["user_id"].isin(data["user_id"].unique()[:500])]).head()

Result time: 4.337202548980713


Unnamed: 0,user_id,id1_0,id1_1,id1_2,id1_3,id1_4,id1_5,id1_6,id1_7,id1_8,...,id2_113,id2_114,id2_115,id2_116,id2_117,id2_118,id2_119,id2_120,id2_121,target
0,464300,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,704
0,464300,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,409
0,464300,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,528
0,464300,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,545
0,464300,0.0,0.013453,0.147982,0.026906,0.080717,0.004484,0.026906,0.040359,0.300448,...,0.0,0.0,0.0,0.008969,0.013453,0.004484,0.0,0.040359,0.0,13


# Обработка датасета

In [45]:
dataset_name = "data/raw/data_1.csv"
data = pandas.read_csv(dataset_name)
data.head()

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
1,590,1029729,63,1,9
2,703,861305,16,1,9
3,803,733016,40,1,7
4,219,448027,1,1,6


In [46]:
user_id = data["user_id"].unique()[:15000]
data = data[data["user_id"].isin(user_id)]

In [47]:
train_data = get_dataset(data)

Result time: 647.1725318431854


In [48]:
train_data.to_csv("./data/train/data_1.csv", index=False)

In [49]:
data[data["user_id"] == 350455]

Unnamed: 0,id3,user_id,id2,date,id1
44,215,350455,23,1,7
93,385,350455,112,1,11
158,392,350455,92,1,10
385,685,350455,14,1,10
521,662,350455,51,1,9
606,800,350455,14,1,10
749,434,350455,96,1,8
830,730,350455,77,1,15
995,546,350455,51,1,9
1031,590,350455,63,1,9
