In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

## Load data

In [None]:
# Article -> image lookup table. Every column gets an explicit dtype; the text
# columns (including article_id) are read as str so pandas does not try to
# interpret them as numbers, and image_exists is read as a boolean flag.
article_map = pd.read_csv(
    'article_image_mapping.csv',
    dtype=dict(
        article_id=str,
        product_type_name=str,
        directory=str,
        filename=str,
        image_exists=bool,
    ),
)

# Keep only the articles whose image_exists flag is True.
article_map = article_map.loc[article_map['image_exists']]

In [None]:
# Transaction log with all column dtypes fixed up front. t_dat is kept as a
# plain string here and is converted to datetime in a later cell.
data = pd.read_csv(
    'data/transactions_train.csv',
    dtype=dict(
        t_dat=str,
        customer_id=str,
        article_id=str,
        price=float,
        sales_channel_id=int,
    ),
)

# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Keep only records of products which have images.
# A membership test (semi-join) is used rather than an inner merge: it can
# never multiply transaction rows if an article_id occurs more than once in
# article_map, and it avoids materialising the mapping columns only to drop
# them again. reset_index gives the same fresh 0..n-1 index a merge would.
data = (
    data.loc[
        data['article_id'].isin(article_map['article_id']),
        ['t_dat', 'customer_id', 'article_id'],
    ]
    .reset_index(drop=True)
)

## Get number of customers

In [None]:
# Number of distinct customer_id values among the remaining transactions.
data['customer_id'].nunique()

## Get the latest 3 transactions of each customer

In [None]:
# Convert the t_dat strings to datetime values so the rows can be ordered by date.
data['t_dat'] = pd.to_datetime(data.t_dat)

In [None]:
# Order the transactions chronologically.
# A stable algorithm is requested explicitly because the default quicksort is
# not stable: rows sharing the same t_dat would otherwise end up in an
# arbitrary relative order, making the per-customer tail(3) selection below
# depend on sort internals instead of the original row order.
data = data.sort_values('t_dat', kind='mergesort')

In [None]:
# (row count, column count) of the filtered, date-sorted transactions.
(len(data.index), len(data.columns))

In [None]:
# Take the last three rows of every customer from the date-sorted frame, then
# arrange the rows so that each customer's rows sit together.
# The customer sort must be stable: the default quicksort may reorder rows
# that share a customer_id, which would scramble the chronological order that
# is later joined into articles_string. The sort is also done without
# inplace=True so the cell builds latest3 in one expression.
latest3 = (
    data.groupby('customer_id')
    .tail(3)
    .reset_index(drop=True)
    .sort_values('customer_id', kind='mergesort')
)
latest3.head()

In [None]:
# Number of distinct customers present in latest3.
latest3['customer_id'].nunique()

In [None]:
# For every row, store the comma-joined article_ids of all rows belonging to
# the same customer (the value is repeated on each of that customer's rows).
latest3['articles_string'] = (
    latest3.groupby('customer_id')['article_id'].transform(','.join)
)

In [None]:
# Number of rows each customer has in latest3, repeated on each of that
# customer's rows. Despite the column name, this is the article count, not the
# number of commas in articles_string.
# The built-in 'size' transform replaces a per-group Python lambda calling
# len(), giving the same counts without a Python call per customer.
latest3['commacount'] = latest3.groupby('customer_id')['article_id'].transform('size')

In [None]:
# Retain only the customers that contributed exactly three rows.
latest3 = latest3.loc[latest3['commacount'] == 3]

In [None]:
# Collapse the repeated per-row values into one row per
# (customer_id, articles_string) pair, keeping the first occurrence.
latest3_new = latest3.loc[:, ['customer_id', 'articles_string']].drop_duplicates(keep='first')

In [None]:
# Number of distinct customers in the de-duplicated result.
latest3_new['customer_id'].nunique()

In [None]:
# Write the result to latest3.csv, leaving out the DataFrame index.
latest3_new.to_csv(path_or_buf='latest3.csv', index=False)