In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt

In [2]:
items = pd.read_csv("../../Data/items.csv", sep="|")
orders = pd.read_csv("../../Data/orders_before_dec.csv", sep="|")
orders["date"] = pd.to_datetime(orders["date"])

In [3]:
item_orders = pd.merge(orders, items, how="left", on="itemID")
item_orders["purchaseMonth"] = item_orders["date"].dt.month
item_orders["purchaseWeek"] = item_orders["date"].apply(lambda d: (d.day-1) // 7 + 1)
item_orders["purchaseWeek"] = item_orders["purchaseWeek"].replace(to_replace=5, value=0)
item_orders.head()

Unnamed: 0,date,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,purchaseMonth,purchaseWeek
0,2020-06-01,38769,3477,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]",6,1
1,2020-06-01,42535,30474,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]",6,1
2,2020-06-01,42535,15833,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]",6,1
3,2020-06-01,42535,20131,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,...",6,1
4,2020-06-01,42535,4325,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882...",6,1


In [4]:
item_orders["purchaseWeek"].value_counts()

4    177751
2    177272
3    174602
1    168798
0     68708
Name: purchaseWeek, dtype: int64

In [5]:
# Create column containing all buy dates per user-item combination
product_buy_dates = item_orders.groupby(["userID", "itemID"])["date"].apply(list).reset_index()

# Column names for all dates
size = len(max(product_buy_dates["date"], key=len))
columns = [word+str(number) for word, number in zip(["date_"] * size, range(size))]

# Split dates to own columns
split_dates = pd.DataFrame(product_buy_dates["date"].tolist(), columns=columns)

# Merge 
product_buy_dates = pd.concat([product_buy_dates, split_dates], axis=1)

# Drop list column
product_buy_dates = product_buy_dates.drop("date", axis=1)

In [6]:
product_buy_diffs = product_buy_dates.copy(deep=True)

for i in range(3, len(product_buy_dates.columns[2:])+2):
    new_column = "t_"+ str(i-3)
    product_buy_diffs[new_column] = (product_buy_diffs.iloc[:, i] - product_buy_diffs.iloc[:, i-1]).dt.days
    
product_buy_diffs = product_buy_diffs.drop(columns=columns)
product_buy_diffs = product_buy_diffs.fillna(999) #Needs to be adressed


In [7]:
product_buy_diffs

Unnamed: 0,userID,itemID,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_12,t_13,t_14,t_15,t_16,t_17,t_18,t_19,t_20,t_21
0,0,1505,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
1,0,9325,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
2,0,12468,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
3,0,12505,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
4,0,15083,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667296,46137,2667,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
667297,46137,20209,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
667298,46137,28343,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
667299,46137,28900,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0


In [8]:
user_item_data =  product_buy_diffs[["userID", "itemID"]].copy(deep=True)

user_item_data["max days"] = product_buy_diffs[product_buy_diffs.columns[2:]].max(axis=1)
user_item_data["min days"] = product_buy_diffs[product_buy_diffs.columns[2:]].min(axis=1)
user_item_data["avg_days"] = product_buy_diffs[product_buy_diffs.columns[2:]].mean(axis=1)
user_item_data["variance"] = product_buy_diffs[product_buy_diffs.columns[2:]].var(axis=1)

user_item_data = pd.merge(user_item_data, item_orders.drop(["date", "categories"], axis=1), on=["userID", "itemID"], how="left")

In [9]:
user_item_data

Unnamed: 0,userID,itemID,max days,min days,avg_days,variance,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,purchaseMonth,purchaseWeek
0,0,1505,999.0,999.0,999.0,0.0,1,286,4,0,82,0,144,9,1
1,0,9325,999.0,999.0,999.0,0.0,1,107,6,0,308,3,17,11,3
2,0,12468,999.0,999.0,999.0,0.0,1,1288,10,0,421,3,3,8,1
3,0,12505,999.0,999.0,999.0,0.0,1,53,4,3,474,0,-1,8,3
4,0,15083,999.0,999.0,999.0,0.0,1,286,4,0,82,0,144,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767126,46137,2667,999.0,999.0,999.0,0.0,1,1301,4,0,148,0,144,9,3
767127,46137,20209,999.0,999.0,999.0,0.0,1,703,4,0,291,0,44,8,2
767128,46137,28343,999.0,999.0,999.0,0.0,1,842,10,1,503,0,137,8,2
767129,46137,28900,999.0,999.0,999.0,0.0,2,1156,10,0,137,3,87,8,2


In [10]:
submission_dec = pd.read_csv("../../Data/test_dec.csv", sep="|")
submission_dec["purchaseMonth"] = 12

submission_dec = pd.merge(submission_dec, user_item_data.drop(["purchaseMonth", "purchaseWeek"], axis=1), on=["userID", "itemID"], how="left")
submission_dec = submission_dec.drop_duplicates(["userID", "itemID"])
submission_dec

Unnamed: 0,userID,itemID,prediction,purchaseMonth,max days,min days,avg_days,variance,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5
0,0,20664,2,12,999.0,126.0,959.318182,34642.227273,1,408,4,0,284,0,66
2,0,28231,2,12,999.0,999.0,999.000000,0.000000,1,193,4,3,468,3,108
3,13,2690,4,12,999.0,68.0,914.818182,74242.727273,1,406,4,3,491,0,66
6,15,1299,4,12,999.0,81.0,957.272727,38305.636364,1,1056,4,0,474,-1,108
8,15,20968,4,12,999.0,52.0,955.954545,40764.045455,1,1315,4,0,444,0,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23512,46118,20106,4,12,999.0,31.0,868.590909,112855.300866,1,1111,4,0,491,0,66
23516,46124,19677,4,12,999.0,26.0,867.500000,114739.595238,1,1006,4,0,491,3,154
23520,46125,12878,0,12,999.0,164.0,961.045455,31692.045455,1,1111,4,0,491,0,66
23522,46127,7963,2,12,999.0,38.0,869.363636,111519.956710,2,1111,4,0,485,3,154


In [11]:
user_item_target = user_item_data["purchaseWeek"]
user_item_body = user_item_data.drop(columns="purchaseWeek")

submission_target = submission_dec["prediction"]
submission_body = submission_dec.drop(columns="prediction")

In [12]:
user_item_body

Unnamed: 0,userID,itemID,max days,min days,avg_days,variance,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,purchaseMonth
0,0,1505,999.0,999.0,999.0,0.0,1,286,4,0,82,0,144,9
1,0,9325,999.0,999.0,999.0,0.0,1,107,6,0,308,3,17,11
2,0,12468,999.0,999.0,999.0,0.0,1,1288,10,0,421,3,3,8
3,0,12505,999.0,999.0,999.0,0.0,1,53,4,3,474,0,-1,8
4,0,15083,999.0,999.0,999.0,0.0,1,286,4,0,82,0,144,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767126,46137,2667,999.0,999.0,999.0,0.0,1,1301,4,0,148,0,144,9
767127,46137,20209,999.0,999.0,999.0,0.0,1,703,4,0,291,0,44,8
767128,46137,28343,999.0,999.0,999.0,0.0,1,842,10,1,503,0,137,8
767129,46137,28900,999.0,999.0,999.0,0.0,2,1156,10,0,137,3,87,8


In [13]:
submission_body

Unnamed: 0,userID,itemID,purchaseMonth,max days,min days,avg_days,variance,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5
0,0,20664,12,999.0,126.0,959.318182,34642.227273,1,408,4,0,284,0,66
2,0,28231,12,999.0,999.0,999.000000,0.000000,1,193,4,3,468,3,108
3,13,2690,12,999.0,68.0,914.818182,74242.727273,1,406,4,3,491,0,66
6,15,1299,12,999.0,81.0,957.272727,38305.636364,1,1056,4,0,474,-1,108
8,15,20968,12,999.0,52.0,955.954545,40764.045455,1,1315,4,0,444,0,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23512,46118,20106,12,999.0,31.0,868.590909,112855.300866,1,1111,4,0,491,0,66
23516,46124,19677,12,999.0,26.0,867.500000,114739.595238,1,1006,4,0,491,3,154
23520,46125,12878,12,999.0,164.0,961.045455,31692.045455,1,1111,4,0,491,0,66
23522,46127,7963,12,999.0,38.0,869.363636,111519.956710,2,1111,4,0,485,3,154


In [14]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from collections import Counter

print('Original dataset shape %s' % Counter(user_item_target))

ros = RandomOverSampler(random_state=42)

X, y = ros.fit_resample(user_item_body, 
                        user_item_target)

print('Original dataset shape %s' % Counter(y))

estimator = LogisticRegression()

estimator.fit(X, y)

labels = estimator.predict(submission_body)

print(len(labels))
print(np.unique(labels))

pred = submission_dec[["userID", "itemID"]].copy(deep=True)
pred["prediction"] = labels

Original dataset shape Counter({4: 177751, 2: 177272, 3: 174602, 1: 168798, 0: 68708})
Original dataset shape Counter({1: 177751, 3: 177751, 2: 177751, 0: 177751, 4: 177751})
9745
[0 1 3 4]


Feature names must be in the same order as they were in fit.



In [15]:
pred["prediction"].value_counts()

4    4095
0    3720
1    1852
3      78
Name: prediction, dtype: int64

In [16]:
def count_points(pred, gold):
    df = pd.merge(pred, gold, on=['userID', 'itemID'], suffixes=('_pred', '_gold'))
    df['points'] = df.apply(_compute_points_for_row, axis=1)
    return df['points'].sum()

def _compute_points_for_row(row):
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

In [17]:
# 'pred' and 'gold' are DataFrames with columns ['userID', 'itemID', 'prediction']
pred, gold = pred, submission_dec[["userID", "itemID", "prediction"]]

points = count_points(pred, gold)
max_points = count_points(gold, gold)
score = points / max_points

print(score)
print(max_points)

0.3334105519198471
17267


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [19]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 4)]

# Number of features to consider every split
max_features = ["auto", "sqrt"]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {"n_estimators": n_estimators, 
                "max_features": max_features,
                "max_depth": max_depth,
                "min_samples_split": min_samples_split,
                "min_samples_leaf": min_samples_leaf,
                "bootstrap": bootstrap}

random_grid

{'n_estimators': [200, 800, 1400, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 35, 60, 85, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [2, 4],
 'bootstrap': [True, False]}

In [20]:
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier(random_state=42)

# # Random search of parameters, using 5 fold cross validation, 
# # search across 100 different combinations
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs=2)

# # Fit the random search model
# rf_random.fit(X, y)

# rf_random.best_params_

In [21]:
rf = RandomForestClassifier(n_estimators=1400, max_depth=110, min_samples_leaf=4, 
                            max_features="sqrt", min_samples_split=5, random_state=42)

rf.fit(X, y)

labels = rf.predict(submission_body)

print(len(labels))
print(np.unique(labels))

pred = submission_dec[["userID", "itemID"]].copy(deep=True)
pred["prediction"] = labels

Feature names must be in the same order as they were in fit.



9745
[0 1 2 3 4]


In [22]:
# 'pred' and 'gold' are DataFrames with columns ['userID', 'itemID', 'prediction']
pred, gold = pred, submission_dec[["userID", "itemID", "prediction"]]

points = count_points(pred, gold)
max_points = count_points(gold, gold)
score = points / max_points

print(score)
print(max_points)

0.40470261191868884
17267


In [24]:
pred.to_csv("prediction_dec.csv", sep="|")