In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

In [111]:
# Both source files are pipe-separated. The order dates arrive as text and are
# converted to datetimes right after loading.
items = pd.read_csv("../Data/items.csv", sep="|")
orders = pd.read_csv("../Data/orders_before_dec.csv", sep="|").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [112]:
# Preview the first rows of the item table loaded from items.csv.
items.head()

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]"
1,28640,1366,10,1,537,0,101,
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3..."
3,21399,1090,10,1,511,0,0,[3270]
4,8504,768,4,1,484,0,66,[2470]


In [113]:
# Preview the first rows of the order table (one row per date / user / item).
orders.head()

Unnamed: 0,date,userID,itemID,order
0,2020-06-01,38769,3477,1
1,2020-06-01,42535,30474,1
2,2020-06-01,42535,15833,1
3,2020-06-01,42535,20131,1
4,2020-06-01,42535,4325,1


In [114]:
# Attach the item attributes to every order row. A left join keeps all orders,
# including those whose itemID has no match in the item table.
item_orders = orders.merge(items, on="itemID", how="left")
item_orders.head()

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,2020-06-01,38769,3477,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]"
1,2020-06-01,42535,30474,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]"
2,2020-06-01,42535,15833,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]"
3,2020-06-01,42535,20131,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,..."
4,2020-06-01,42535,4325,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882..."


In [115]:
# Print the first and the last order date covered by the data.
order_dates = item_orders["date"]
print(order_dates.min())
print(order_dates.max())

2020-06-01 00:00:00
2020-11-30 00:00:00


## Cutoff method
As seen here:
* https://towardsdatascience.com/predicting-next-purchase-day-15fae5548027  
* https://github.com/evansdoe/online_retail/blob/main/online_retail.ipynb
 

In [116]:
# NOTE: this whole cell is commented out, so running it defines neither
# `pre_cutoff` nor `post_cutoff`. Any output still displayed under it comes
# from an earlier execution, when the code was active.

# # Cut off at between october and november
# cutoff = pd.Timestamp(2020, 10, 31)

# pre_cutoff = item_orders[item_orders["date"] <= cutoff]
# post_cutoff = item_orders[item_orders["date"] > cutoff]

# print(pre_cutoff["date"].min())
# print(pre_cutoff["date"].max())

# print(post_cutoff["date"].min())
# print(post_cutoff["date"].max())

# print(len(item_orders) == len(pre_cutoff) + len(post_cutoff))

2020-06-01 00:00:00
2020-10-31 00:00:00
2020-11-01 00:00:00
2020-11-30 00:00:00
True


In [117]:
# NOTE: disabled cell; a fully commented cell prints nothing, so any output
# shown under it is left over from an earlier run.
# pre_cutoff.head()

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,2020-06-01,38769,3477,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]"
1,2020-06-01,42535,30474,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]"
2,2020-06-01,42535,15833,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]"
3,2020-06-01,42535,20131,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,..."
4,2020-06-01,42535,4325,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882..."


In [118]:
# NOTE: this whole cell is commented out (cutoff approach, currently unused).
# It depends on `pre_cutoff` / `post_cutoff`, which are only created by code
# that is commented out as well. Any output still displayed under it comes from
# an earlier execution.

# # Create dataframe of user item combinations
# user_item_view = pd.DataFrame(np.unique(pre_cutoff[["userID", "itemID"]], axis=0))
# user_item_view.columns = ["userID", "itemID"]

# # Create dataframe with userID, itemID and last purchase in pre_cutoff
# last_purchase = pre_cutoff.groupby(["userID", "itemID"])["date"].max().reset_index()
# last_purchase.columns = ["userID", "itemID", "PrePurchaseDate"]

# # Create dataframe with userID, itemID and first purchase in post_cutoff
# first_purchase = post_cutoff.groupby(["userID", "itemID"])["date"].min().reset_index()
# first_purchase.columns = ["userID", "itemID", "PostPurchaseDate"]

# # Merge both datasets
# purchase_dates = pd.merge(last_purchase, first_purchase, on=["userID", "itemID"], how="left")

# # Calculate the time difference in days:
# purchase_dates["NextPurchaseDay"] = (purchase_dates["PostPurchaseDate"] - purchase_dates["PrePurchaseDate"]).dt.days

# # Merge with user_item_view
# user_item_view = pd.merge(user_item_view, purchase_dates, on=["userID", "itemID"], how="left")[["userID", "itemID", "NextPurchaseDay"]]

# # Fill NaN values (no purchase after the cutoff) with the sentinel 999
# user_item_view.fillna(999, inplace=True)

# user_item_view.head()

Unnamed: 0,userID,itemID,NextPurchaseDay
0,0,1505,999.0
1,0,12468,999.0
2,0,12505,999.0
3,0,15083,999.0
4,0,20664,999.0


## Mean of first order week of every month

In [119]:
# Show the merged order/item frame again as a reminder of the available columns.
item_orders.head()

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,2020-06-01,38769,3477,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]"
1,2020-06-01,42535,30474,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]"
2,2020-06-01,42535,15833,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]"
3,2020-06-01,42535,20131,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,..."
4,2020-06-01,42535,4325,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882..."


In [120]:
# Build a frame with one row per (userID, itemID) pair and one column per
# calendar month. Each month column holds the week of the month in which the
# pair was first ordered during that month, or NaN if it was not ordered then.
order_by_week_per_month = pd.DataFrame(
    np.unique(item_orders[["userID", "itemID"]], axis=0),
    columns=["userID", "itemID"],
)

# Month name of every order row, computed once instead of once per loop pass.
# `months` keeps the names in order of first appearance; they double as the
# column labels and are reused by later cells.
month_names = item_orders["date"].dt.strftime("%B")
months = month_names.unique().tolist()

for month in months:
    # First order date of every user-item pair within the current month
    first_purchase = (
        item_orders[month_names == month]
        .groupby(["userID", "itemID"])["date"]
        .min()
        .reset_index()
    )

    # Week of the month: days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3, 22-28 -> 4, 29-31 -> 5
    week_of_month = (first_purchase["date"].dt.day - 1) // 7 + 1

    # Week 5 is encoded as 0 to match the target definition. The remapping is
    # applied to the week values only: replacing 5 on the whole frame would
    # also turn a userID or itemID equal to 5 into 0 and break the merge keys.
    first_purchase[month] = week_of_month.replace(5, 0)

    # Append the month as a new column; pairs without an order in it get NaN
    order_by_week_per_month = pd.merge(
        order_by_week_per_month,
        first_purchase[["userID", "itemID", month]],
        on=["userID", "itemID"],
        how="left",
    )


In [121]:
# One row per user-item pair and one column per month; NaN marks months in
# which the pair was not ordered.
order_by_week_per_month.head()

Unnamed: 0,userID,itemID,June,July,August,September,October,November
0,0,1505,,,,1.0,,
1,0,9325,,,,,,3.0
2,0,12468,,,1.0,,,
3,0,12505,,,3.0,,,
4,0,15083,,,1.0,,,


In [122]:
# Target = rounded mean of the first-order weeks over all months, ignoring
# months without an order (NaN) and months whose week was encoded as 0.
monthly_weeks = order_by_week_per_month[months]
order_by_week_per_month["target"] = (
    monthly_weeks.where(monthly_weeks != 0).mean(axis=1).round(0)
)

# Remaining NaNs (missing months, or a target with no non-zero week) become 0.
order_by_week_per_month = order_by_week_per_month.fillna(0)
order_by_week_per_month.head()

Unnamed: 0,userID,itemID,June,July,August,September,October,November,target
0,0,1505,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0,9325,0.0,0.0,0.0,0.0,0.0,3.0,3.0
2,0,12468,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0,12505,0.0,0.0,3.0,0.0,0.0,0.0,3.0
4,0,15083,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [123]:
# Show how many user-item pairs fall into each target class.
order_by_week_per_month["target"].value_counts()

2.0    172742
3.0    152997
4.0    149590
1.0    138172
0.0     53800
Name: target, dtype: int64

In [124]:
# Load the December submission file. Judging by its name, the "prediction"
# column already holds the true labels; the scoring cells use this frame as
# the gold standard. Unlike the other inputs it is read with the default
# comma separator.
submission_dec = pd.read_csv("../Data/submission_dec_with_labels.csv")
submission_dec

Unnamed: 0,userID,itemID,prediction
0,0,20664,1
1,0,28231,3
2,13,2690,1
3,15,1299,2
4,15,20968,2
...,...,...,...
9740,46118,20106,1
9741,46124,19677,2
9742,46125,12878,1
9743,46127,7963,1


In [125]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

# Balance the target classes by randomly re-sampling rows of the rarer classes.
ros = RandomOverSampler(random_state=42)

# NOTE(review): the only features are the raw userID / itemID numbers, which a
# linear model treats as ordinary numeric magnitudes -- confirm this is intended.
X, y = ros.fit_resample(
    order_by_week_per_month[["userID", "itemID"]],
    order_by_week_per_month["target"],
)

estimator = LogisticRegression()
estimator.fit(X, y)

# Predict the order week for every user-item pair of the December submission.
labels = estimator.predict(submission_dec[["userID", "itemID"]])

print(len(labels))
print(np.unique(labels))

# Build the prediction frame as a new object. Assigning a column to the
# two-column selection of `submission_dec` raises SettingWithCopyWarning.
pred = submission_dec[["userID", "itemID"]].assign(prediction=labels)

9745
[0. 3. 4.]


In [126]:
def count_points(pred, gold):
    """Return the total number of points ``pred`` earns against ``gold``.

    Parameters
    ----------
    pred, gold : pandas.DataFrame
        Frames with the columns ``userID``, ``itemID`` and ``prediction``.
        Additional columns are allowed as long as ``prediction`` is the only
        non-key column name the two frames share.

    Returns
    -------
    int
        Sum of the per-row points. The frames are inner-joined on
        ``(userID, itemID)``, so pairs present in only one of them earn
        nothing; if no pair is shared at all the result is 0.
    """
    df = pd.merge(pred, gold, on=['userID', 'itemID'], suffixes=('_pred', '_gold'))
    if df.empty:
        # With zero rows, apply(axis=1) has no row to score and does not yield
        # a per-row Series, so the column assignment below would fail.
        return 0
    df['points'] = df.apply(_compute_points_for_row, axis=1)
    return df['points'].sum()

def _compute_points_for_row(row):
    """Score one merged row holding ``prediction_pred`` and ``prediction_gold``.

    Returns 3 for a correctly predicted order week, 1 for a correctly
    predicted "no order" (0), 1 if an order was predicted and did happen but
    in a different week, and 0 otherwise.
    """
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

In [127]:
# 'pred' and 'gold' are DataFrames with columns ['userID', 'itemID', 'prediction'].
# The labelled December submission acts as the gold standard.
gold = submission_dec

# Normalise the achieved points by the points of a perfect prediction.
max_points = count_points(gold, gold)
points = count_points(pred, gold)
score = points / max_points

print(score)
print(max_points)

0.24816024697562553
27857


## Month as a feature


In [128]:
# One row per unique user-item combination, carrying the date of its first
# order. The groupby already yields every combination exactly once, sorted by
# (userID, itemID).
first_orders = item_orders.groupby(["userID", "itemID"])["date"].min().reset_index()

# Week of the month of that first order:
# days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3, 22-28 -> 4, 29-31 -> 5
week_of_month = (first_orders["date"].dt.day - 1) // 7 + 1

orders_with_month = pd.DataFrame({
    "userID": first_orders["userID"],
    "itemID": first_orders["itemID"],
    "month": first_orders["date"].dt.month,
    # Week 5 is encoded as 0. Only the week values are remapped: replacing 5
    # on the whole frame would also rewrite any userID, itemID or month equal
    # to 5.
    "purchaseWeek": week_of_month.replace(5, 0),
})

orders_with_month.head()

Unnamed: 0,userID,itemID,month,purchaseWeek
0,0,1505,9,1
1,0,9325,11,3
2,0,12468,8,1
3,0,12505,8,3
4,0,15083,8,1


In [129]:
# Show how many user-item pairs fall into each purchase-week class.
orders_with_month["purchaseWeek"].value_counts()

2    154164
3    151921
4    151695
1    151206
0     58315
Name: purchaseWeek, dtype: int64

In [130]:
# Copy of the December submission with a constant month feature (12 = December).
submission_dec_with_month = submission_dec.assign(month=12)
submission_dec_with_month

Unnamed: 0,userID,itemID,prediction,month
0,0,20664,1,12
1,0,28231,3,12
2,13,2690,1,12
3,15,1299,2,12
4,15,20968,2,12
...,...,...,...,...
9740,46118,20106,1,12
9741,46124,19677,2,12
9742,46125,12878,1,12
9743,46127,7963,1,12


In [131]:
# Balance the purchase-week classes by randomly re-sampling the rarer ones.
ros = RandomOverSampler(random_state=42)

X, y = ros.fit_resample(
    orders_with_month[["userID", "itemID", "month"]],
    orders_with_month["purchaseWeek"],
)

estimator = LogisticRegression()
estimator.fit(X, y)

# NOTE(review): every submission row has month == 12, while the training data
# only covers the months present in `item_orders`, so the model extrapolates
# on this feature -- confirm this is intended.
feature_columns = ["userID", "itemID", "month"]
labels = estimator.predict(submission_dec_with_month[feature_columns])

print(len(labels))
print(np.unique(labels))

# Build the prediction frame as a new object. Assigning a column to the
# column selection of `submission_dec_with_month` raises SettingWithCopyWarning.
pred = submission_dec_with_month[feature_columns].assign(prediction=labels)

9745
[0 4]


In [132]:
# 'pred' and 'gold' are DataFrames with columns ['userID', 'itemID', 'prediction'].
# The labelled December submission acts as the gold standard.
gold = submission_dec

# Normalise the achieved points by the points of a perfect prediction.
max_points = count_points(gold, gold)
points = count_points(pred, gold)
score = points / max_points

print(score)
print(max_points)

0.26248339735075565
27857
