In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt

In [2]:
items = pd.read_csv("../../Data/items.csv", sep="|")
orders = pd.read_csv("../../Data/orders_before_dec.csv", sep="|")
orders["date"] = pd.to_datetime(orders["date"])

In [3]:
item_orders = pd.merge(orders, items, how="left", on="itemID")
item_orders["purchaseMonth"] = item_orders["date"].dt.month
item_orders["purchaseWeek"] = item_orders["date"].apply(lambda d: (d.day-1) // 7 + 1)
item_orders["purchaseWeek"] = item_orders["purchaseWeek"].replace(to_replace=5, value=0)
item_orders.head()

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,purchaseMonth,purchaseWeek
0,2020-06-01,38769,3477,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]",6,1
1,2020-06-01,42535,30474,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]",6,1
2,2020-06-01,42535,15833,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]",6,1
3,2020-06-01,42535,20131,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,...",6,1
4,2020-06-01,42535,4325,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882...",6,1


In [4]:
# Create column containing all buy dates per user-item combination
product_buy_dates = item_orders.groupby(["userID", "itemID"])["date"].apply(list).reset_index()

# Column names for all dates
size = len(max(product_buy_dates["date"], key=len))
columns = [word+str(number) for word, number in zip(["date_"] * size, range(size))]

# Split dates to own columns
split_dates = pd.DataFrame(product_buy_dates["date"].tolist(), columns=columns)

# Merge 
product_buy_dates = pd.concat([product_buy_dates, split_dates], axis=1)

# Drop list column
product_buy_dates = product_buy_dates.drop("date", axis=1)

In [5]:
product_buy_diffs = product_buy_dates.copy(deep=True)

for i in range(3, len(product_buy_dates.columns[2:])+2):
    new_column = "t_"+ str(i-3)
    product_buy_diffs[new_column] = (product_buy_diffs.iloc[:, i] - product_buy_diffs.iloc[:, i-1]).dt.days
    
product_buy_diffs = product_buy_diffs.drop(columns=columns)
product_buy_diffs = product_buy_diffs.fillna(999) #Needs to be adressed


In [6]:
user_item_data =  product_buy_diffs[["userID", "itemID"]].copy(deep=True)

user_item_data["max days"] = product_buy_diffs[product_buy_diffs.columns[2:]].max(axis=1)
user_item_data["min days"] = product_buy_diffs[product_buy_diffs.columns[2:]].min(axis=1)
user_item_data["avg_days"] = product_buy_diffs[product_buy_diffs.columns[2:]].mean(axis=1)
user_item_data["variance"] = product_buy_diffs[product_buy_diffs.columns[2:]].var(axis=1)

user_item_data = pd.merge(user_item_data, item_orders.drop(["date", "categories"], axis=1), on=["userID", "itemID"], how="left")

In [7]:
submission_dec = pd.read_csv("../../Data/test_dec.csv", sep="|")
submission_dec["purchaseMonth"] = 12

submission_dec = pd.merge(submission_dec, user_item_data.drop(["purchaseMonth", "purchaseWeek"], axis=1), on=["userID", "itemID"], how="left")
submission_dec = submission_dec.drop_duplicates(["userID", "itemID"])
submission_dec

Unnamed: 0,userID,itemID,prediction,purchaseMonth,max days,min days,avg_days,variance,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5
0,0,20664,2,12,999.0,126.0,959.318182,34642.227273,1,408,4,0,284,0,66
2,0,28231,2,12,999.0,999.0,999.000000,0.000000,1,193,4,3,468,3,108
3,13,2690,4,12,999.0,68.0,914.818182,74242.727273,1,406,4,3,491,0,66
6,15,1299,4,12,999.0,81.0,957.272727,38305.636364,1,1056,4,0,474,-1,108
8,15,20968,4,12,999.0,52.0,955.954545,40764.045455,1,1315,4,0,444,0,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23512,46118,20106,4,12,999.0,31.0,868.590909,112855.300866,1,1111,4,0,491,0,66
23516,46124,19677,4,12,999.0,26.0,867.500000,114739.595238,1,1006,4,0,491,3,154
23520,46125,12878,0,12,999.0,164.0,961.045455,31692.045455,1,1111,4,0,491,0,66
23522,46127,7963,2,12,999.0,38.0,869.363636,111519.956710,2,1111,4,0,485,3,154


In [8]:
user_item_target = user_item_data["purchaseWeek"]
user_item_body = user_item_data.drop(columns="purchaseWeek")

submission_target = submission_dec["prediction"]
submission_body = submission_dec.drop(columns="prediction")

In [9]:
# Temporal change of working directory to import function
import os
os.chdir("../../Helper_Functions")
from evaluation_pipeline import evaluate_models
# Needs to be current working directory
os.chdir("../EDA/Alex")

In [10]:
labels = evaluate_models(user_item_body, user_item_target, submission_body, submission_target)

Unnamed: 0_level_0,score,time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LogisticRegression,0.30526,53.217201
GaussianNB,0.30312,0.441861
KNeighborsClassifier,0.311204,8.49433
DecisionTreeClassifier,0.321904,12.529545
RandomForestClassifier,0.306068,329.322745
