In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import sparse



### We work with the reduced training set

In [2]:
# Load the reduced training set from a CSV file in the working directory.
# The cells below read its 'user_id', 'date', 'id1', 'id2' and 'id3' columns.
df_train = pd.read_csv('reduced_training_data.csv')

### The method to create predictions

In [4]:
import xgboost as xgb
from scipy import sparse

In [5]:
%%time
def create_top_5_predictions(user_indices, 
                             product_categories, 
                             X_train, 
                             y_train, 
                             X_test, 
                             previously_seen_items,
                             max_depth_regression_tree = 3,
                             verbose = False,
                             sparse_computation = False,
                            ):
    """Predict, for every user, the five best-scoring product categories not seen before.

    One ``xgb.XGBRegressor`` is fitted per entry of ``product_categories`` (on ``X_train``
    against the matching column of ``y_train``) and evaluated on ``X_test``. Categories
    flagged in ``previously_seen_items`` are excluded from the ranking, then the five
    highest-scoring remaining categories are returned per user, best first.

    Keyword arguments:
    user_indices -- One user id per row of X_test, in the same row order; used as the
    'user_id' column of the result
    product_categories -- A list of the id3s for which predictions should be made
    X_train -- Feature DataFrame used for fitting (converted with ``.values``)
    y_train -- Target DataFrame with one column per entry of product_categories and one
    row per row of X_train
    X_test -- Feature DataFrame, needs to have the same number of columns as X_train
    previously_seen_items -- DataFrame of shape (len(user_indices), len(product_categories)),
    aligned positionally with both; entries > 0 mark categories that must not be recommended
    (the caller uses the three weeks before the prediction period)
    max_depth_regression_tree -- ``max_depth`` passed to every XGBRegressor
    verbose -- If True, print progress for every id3 divisible by 50 (requires numeric id3s)
    sparse_computation -- If True, feed scipy CSR matrices instead of dense arrays to xgboost

    Returns:
    DataFrame with a default integer index and the columns
    'user_id', 'id3_1', ..., 'id3_5', 'score'. 'id3_1' has the highest predicted value.
    'score' is the sum of the five selected predictions; a previously seen category that
    had to be used as filler (fewer than five unseen categories) contributes 0.

    Raises:
    ValueError -- if fewer than five product categories are given
    """
    n_top = 5
    n_users = len(user_indices)
    n_categories = len(product_categories)
    if n_categories < n_top:
        raise ValueError("At least %d product categories are required, got %d"
                         % (n_top, n_categories))

    if sparse_computation:
        # This option seems to be much faster but we need enough
        # data for internal consistency
        train_features = sparse.csr_matrix(X_train.values)
        test_features = sparse.csr_matrix(X_test.values)
    else:
        train_features = X_train.values
        test_features = X_test.values

    # One column of predictions per product category, filled model by model.
    predictions = np.zeros((n_users, n_categories))
    for position, id_to_predict in enumerate(product_categories):
        if verbose and id_to_predict % 50 == 0:
            print("Currently predicting: %d " % id_to_predict)

        ### If you want to build a different predictor, change this here
        ### Maybe build a decision tree classifier
        model = xgb.XGBRegressor(max_depth=max_depth_regression_tree)
        model.fit(train_features, y_train[id_to_predict])
        predictions[:, position] = model.predict(test_features)

    # Exclude previously seen items with -inf instead of multiplying by 0:
    # a regressor can return negative values, and a seen item at 0 would
    # otherwise outrank every unseen item with a negative prediction.
    already_seen = previously_seen_items.values > 0
    ranking_scores = np.where(already_seen, -np.inf, predictions)

    # argpartition only guarantees WHICH five columns are the largest, not their
    # order, so the selection is sorted afterwards (best prediction first).
    top_positions = np.argpartition(ranking_scores, -n_top, axis=1)[:, -n_top:]
    row_selector = np.arange(n_users)[:, np.newaxis]
    top_scores = ranking_scores[row_selector, top_positions]
    best_first = np.argsort(-top_scores, axis=1, kind='stable')
    top_positions = top_positions[row_selector, best_first]
    top_scores = top_scores[row_selector, best_first]

    # Masked fillers count as 0 in the score, as they did with the 0/1 mask.
    score = np.where(np.isneginf(top_scores), 0.0, top_scores).sum(axis=1)

    ### SUPER IMPORTANT: convert array positions of products into product categories!
    category_lookup = np.asarray(product_categories)
    result = pd.DataFrame(category_lookup[top_positions],
                          columns=['id3_1', 'id3_2', 'id3_3', 'id3_4', 'id3_5'])
    result.insert(0, 'user_id', np.asarray(user_indices))
    result['score'] = score

    return result

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.06 µs


### For the creation of the feature matrix

In [6]:
# This method creates a feature matrix with user ids as rows and product ids as columns.
# Each entry counts the number of different days on which the user (row index) has looked at the id3 (column index).
# This method can be used to create training data. As some user_ids or id3s may not be present in different
# timeframes, indices or columns can be given in order to achieve consistency.
def create_feature_matrix(df_train, 
                          day_begin, 
                          day_end, 
                          indices=None, 
                          columns=None,
                          column_name = 'id3'):
    """Count, per user and category, the distinct days with activity in a date window.

    Keyword arguments:
    df_train -- DataFrame with the columns 'user_id', 'date' and ``column_name``
    day_begin -- First day of the window (inclusive)
    day_end -- Last day of the window (inclusive)
    indices -- Optional user ids that must appear as rows; users without activity in the
    window get an all-zero row. Users present in the window are kept even if not listed.
    columns -- Optional category ids that must appear as columns; categories without
    activity in the window get an all-zero column
    column_name -- Column of df_train whose values become the matrix columns

    Returns:
    DataFrame indexed by user id with one column per category, sorted along both axes.
    Each entry is the number of distinct 'date' values for that user/category pair, 0 if none.
    """
    in_window = (df_train['date'] >= day_begin) & (df_train['date'] <= day_end)
    feature_matrix = pd.pivot_table(df_train[in_window], values='date',
                                    columns=column_name, index='user_id',
                                    aggfunc='nunique')

    # Pad with the requested users / categories. Reindexing onto the union keeps every
    # label already present and avoids concatenating all-NaN object frames, which
    # depends on deprecated dtype inference and can turn the matrix into object dtype.
    if indices is not None:
        feature_matrix = feature_matrix.reindex(index=feature_matrix.index.union(indices))
    if columns is not None:
        feature_matrix = feature_matrix.reindex(columns=feature_matrix.columns.union(columns))

    # Missing user/category combinations mean "no activity".
    feature_matrix = feature_matrix.fillna(0)
    # Sort rows and columns so matrices from different windows line up positionally.
    feature_matrix = feature_matrix.sort_index(axis=0).sort_index(axis=1)

    return feature_matrix

In [8]:
def create_extended_feature_matrix(df_train, 
                                   day_begin, 
                                   day_end, 
                                   indices, 
                                   columns_id1,
                                   columns_id2,
                                   columns_id3):
    """Build one wide feature matrix from the 'id1', 'id2' and 'id3' columns.

    A separate matrix is created with ``create_feature_matrix`` for each of the three
    columns (same data, date window and ``indices``), and the three are concatenated
    side by side in the order id1, id2, id3.

    NOTE(review): the column labels are the raw ids of each level, so labels will
    repeat in the result if the same value occurs in more than one of id1/id2/id3.
    """
    level_specs = (
        ('id1', columns_id1),
        ('id2', columns_id2),
        ('id3', columns_id3),
    )
    per_level_matrices = [
        create_feature_matrix(df_train, day_begin, day_end, indices, level_columns, level_name)
        for level_name, level_columns in level_specs
    ]

    return pd.concat(per_level_matrices, axis=1)

In [None]:
def create_sampled_training_dataset(df_train,
                                    split_days, 
                                    train_period_length,
                                    target_period_length,
                                    indices, 
                                    columns_id1,
                                    columns_id2,
                                    columns_id3,
                                    sample_fraction=0.1):
    """Stack training data from several split days and draw a random sample of its rows.

    For every ``day`` in ``split_days`` the features cover the days
    ``day - train_period_length`` .. ``day - 1`` and the targets cover
    ``day`` .. ``day + target_period_length - 1`` (both ends inclusive).

    Keyword arguments:
    df_train -- DataFrame with the columns 'user_id', 'date', 'id1', 'id2' and 'id3'
    split_days -- Iterable of days that separate feature window and target window
    train_period_length -- Number of days in each feature window
    target_period_length -- Number of days in each target window
    indices -- User ids that must appear as rows for every split day
    columns_id1, columns_id2, columns_id3 -- Category ids that must appear as columns
    sample_fraction -- Fraction of the stacked rows to keep

    Returns:
    Tuple ``(X_sampled, y_sampled)`` with the same rows in the same (original) order.
    X holds the extended id1/id2/id3 features, y holds the id3 targets.

    Raises:
    ValueError -- if split_days is empty

    NOTE(review): the previous body did not parse (positional argument after keyword
    arguments) and referenced an undefined name ``columns``. The extended feature
    matrix is used for X because the signature takes all three column lists, matching
    how the submission cell builds X_train / y_train -- confirm this was the intent.
    """
    feature_parts = []
    target_parts = []
    for day in split_days:
        X_tmp = create_extended_feature_matrix(df_train = df_train,
                                               day_begin = day - train_period_length,
                                               day_end = day - 1,
                                               indices = indices,
                                               columns_id1 = columns_id1,
                                               columns_id2 = columns_id2,
                                               columns_id3 = columns_id3)
        y_tmp = create_feature_matrix(df_train = df_train,
                                      day_begin = day,
                                      day_end = day + target_period_length - 1,
                                      indices = indices,
                                      columns = columns_id3,
                                      column_name = 'id3')
        # Both matrices also contain users outside `indices` that were active in
        # their own window; align the targets to the feature rows so that row i of
        # X and row i of y always describe the same user.
        y_tmp = y_tmp.reindex(X_tmp.index, fill_value=0)

        feature_parts.append(X_tmp)
        target_parts.append(y_tmp)

    if not feature_parts:
        raise ValueError("split_days must contain at least one day")

    # Concatenate once instead of growing the frames inside the loop.
    X_all = pd.concat(feature_parts)
    y_all = pd.concat(target_parts)

    # User ids repeat across split days, so rows are sampled by position rather
    # than by label; sorting restores the original row order.
    sampled_positions = (pd.Series(np.arange(X_all.shape[0]))
                         .sample(frac=sample_fraction)
                         .sort_values()
                         .values)

    X_sampled = X_all.iloc[sampled_positions]
    y_sampled = y_all.iloc[sampled_positions]

    return X_sampled, y_sampled

# The submission

In [9]:
%%time
# First day of the period to predict, and first day of the target window used for training.
day_split_test = 62
day_split_train = 55

# Length (in days) of the feature window and of the "already seen" look-back window.
prediction_period = 9
seen_lookback_days = 21
max_depth_regression_tree = 6

# Number of highest-scoring users written to the submission file.
# NOTE(review): presumably the size required for the submission -- confirm.
n_users_to_submit = 53979

# Create the output directory up front so a missing folder cannot make the
# final to_csv fail after the (multi-hour) model fitting has finished.
output_path = 'predictions/Prediction_AK_final_1.csv'
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Sample the users to make everything a bit faster.
# A fraction of 1 keeps every user; the fixed seed makes smaller fractions reproducible.
user_sample_fraction = 1
sampled_user = pd.Series(df_train['user_id'].unique()).sample(frac=user_sample_fraction,
                                                              random_state=0)
df_user_sampled = df_train[df_train['user_id'].isin(sampled_user)]


columns_id1 = sorted(df_train['id1'].unique())
columns_id2 = sorted(df_train['id2'].unique())
columns_id3 = sorted(df_train['id3'].unique())
user_indices = sorted(df_user_sampled['user_id'].unique())
user_indices_all = sorted(df_train['user_id'].unique())


# Not used in this cell; kept in case other cells rely on the name.
prediction_score = []

# Built from the sampled users so that its rows line up with X_test and with the
# user ids passed to create_top_5_predictions, also when user_sample_fraction < 1.
previously_seen_items = create_feature_matrix(df_train = df_user_sampled, 
                                              day_begin = day_split_test - seen_lookback_days, 
                                              day_end = day_split_test - 1,
                                              indices = user_indices, 
                                              columns = columns_id3,
                                              column_name = 'id3'
                                             )


print("Now preparing the training")
X_train = create_extended_feature_matrix(df_train = df_user_sampled, 
                                        day_begin = day_split_train - prediction_period, 
                                        day_end = day_split_train - 1,
                                        indices = user_indices, 
                                        columns_id1 = columns_id1,
                                        columns_id2 = columns_id2,
                                        columns_id3 = columns_id3)   

# IMPORTANT, we only want to predict the third category
# Target window: days day_split_train .. day_split_train + 6 (inclusive), i.e. 7 days.
y_train = create_feature_matrix(df_train = df_user_sampled,
                               day_begin = day_split_train,
                               day_end = day_split_train + 6,
                               indices = user_indices, 
                               columns = columns_id3,
                               column_name = 'id3')
X_test = create_extended_feature_matrix(df_train = df_user_sampled, 
                                       day_begin = day_split_test - prediction_period, 
                                       day_end = day_split_test - 1,
                                       indices = user_indices, 
                                       columns_id1 = columns_id1,
                                       columns_id2 = columns_id2,
                                       columns_id3 = columns_id3)
print("Now computing")
# user_indices (not user_indices_all) labels the rows: predictions are made on X_test,
# which only contains the sampled users.
result = create_top_5_predictions(user_indices,
                                  columns_id3,
                                  X_train,
                                  y_train,
                                  X_test,
                                  previously_seen_items,
                                  max_depth_regression_tree = max_depth_regression_tree,
                                  verbose = True,
                                  sparse_computation = True,
                                 )

print("Now printing")
to_submit = result.nlargest(n=n_users_to_submit, columns='score')
to_submit.to_csv(output_path, 
                 columns=['user_id', 'id3_1', 'id3_2', 'id3_3', 'id3_4', 'id3_5'], 
                 index=False)

Now preparing the training
Now computing
Currently predicting: 0 
Currently predicting: 50 
Currently predicting: 100 
Currently predicting: 150 
Currently predicting: 200 
Currently predicting: 250 
Currently predicting: 300 
Currently predicting: 350 
Currently predicting: 400 
Currently predicting: 450 
Currently predicting: 500 
Currently predicting: 550 
Currently predicting: 600 
Currently predicting: 650 
Currently predicting: 700 
Currently predicting: 750 
Currently predicting: 800 
Currently predicting: 850 
Currently predicting: 900 
Now printing
CPU times: user 5h 29min 31s, sys: 5min 49s, total: 5h 35min 21s
Wall time: 5h 36min 13s
