In [1]:
import pandas as pd

class DataParser:
  def __init__(self, filepaths):
    assert isinstance(filepaths, dict)
    assert 'articles' in filepaths
    assert 'customers' in filepaths
    assert 'transactions' in filepaths
    self._filepaths = filepaths
    self._raw = {}
    self._data = {}
    self._encoders = {
      'articles': self.encode_articles,
      'customers': self.encode_customers,
      'transactions': self.encode_transactions,
    }
  
  def load_data(self, index):
    assert index in ['articles', 'customers', 'transactions']
    self._raw[index] = pd.read_csv(self._filepaths[index])
  
  def encode_data(self, index):
    assert index in ['articles', 'customers', 'transactions']
    assert index in self._raw
    self._encoders[index]()

  def create_association(self, index, id, name):  
    association_dict = self._raw[index][[id, name]]
    association_dict = association_dict.set_index([id]).to_dict()[name]
    return pd.DataFrame.from_dict(association_dict, orient='index', columns=['name'])
  
  def encode_articles(self):
    selectors = [col for col in self._raw['articles'].columns if any(['id' in col, 'code' in col, 'no' in col])]
    self._data['articles'] = self._raw['articles'].loc[:, selectors].copy(deep=True)

    self._data['articles']['index_code'] = self._data['articles']['index_code'].astype('category')
    self._data['articles']['index_code'] = self._data['articles']['index_code'].cat.codes
  
  def fill_customers(self):
    self._raw['customers']['age'].fillna(0.0, inplace=True)
    self._raw['customers']['club_member_status'].fillna('NONE', inplace=True)
    self._raw['customers']['Active'].fillna(0.0, inplace=True)
    self._raw['customers']['FN'].fillna(0.0, inplace=True)
    self._raw['customers']['fashion_news_frequency'].fillna('NONE', inplace=True)
    self._raw['customers']['fashion_news_frequency'] = self._raw['customers']['fashion_news_frequency'].str.replace('NONE', 'None')
  
  def encode_customers(self):
    self.fill_customers()
    self._data['customers'] = self._raw['customers'].copy(deep=True)

    self._data['customers']['fashion_news_frequency'] = self._data['customers']['fashion_news_frequency'].astype('category')
    self._data['customers']['club_member_status'] = self._data['customers']['club_member_status'].astype('category')
    self._data['customers']['Active'] = self._data['customers']['Active'].astype('category')
    self._data['customers']['FN'] = self._data['customers']['FN'].astype('category')
    self._data['customers']['club_member_status'] = self._data['customers']['club_member_status'].cat.codes
    self._data['customers']['fashion_news_frequency'] = self._data['customers']['fashion_news_frequency'].cat.codes
  
  def encode_transactions(self):
    self._data['transactions'] = self._raw['transactions'].copy(deep=True)

  def get_data(self, index) -> pd.DataFrame:
    assert index in ['articles', 'customers', 'transactions']
    if index not in self._raw and index not in self._data:
      self.load_data(index)
      print(f'{index.upper()} loaded...')
    if index not in self._data:
      self.encode_data(index)
      del self._raw[index]
    return self._data[index]

In [2]:
import numpy as np

In [3]:
filepaths = {
    'articles': '../../data/articles.csv',
    'customers': '../../data/customers.csv',
    'transactions': '../../data/transactions_train.csv'
}

In [4]:
parser = DataParser(filepaths)

In [5]:
articles = parser.get_data('articles')

ARTICLES loaded...


In [6]:
customers = parser.get_data('customers')

CUSTOMERS loaded...


In [7]:
transactions = parser.get_data('transactions')

TRANSACTIONS loaded...


In [8]:
random_article_idx = np.random.randint(0, articles.shape[0], 7)
random_article_idx

array([95302, 86844, 57456, 49805, 17244, 88368, 55466])

In [9]:
selected_articles = articles.loc[random_article_idx, :]
selected_articles = selected_articles.reset_index().drop(columns=['index'])
selected_articles

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,865977019,865977,273,1010014,9,4,5,8716,7,4,77,1005
1,825173002,825173,254,1010016,22,2,8,1522,0,1,15,1010
2,716868002,716868,94,1010016,10,3,9,3828,2,1,64,1020
3,695359001,695359,245,1010016,17,2,13,1616,0,1,11,1003
4,573323004,573323,265,1010001,9,4,5,1344,3,2,53,1013
5,832505004,832505,306,1010016,17,4,13,3710,1,1,61,1017
6,710727003,710727,274,1010016,73,4,2,6565,5,4,41,1006


In [10]:
article_mask = transactions['article_id'].isin(selected_articles['article_id'])
selected_article_transactions = transactions[article_mask]

In [11]:
users_mask = transactions['customer_id'].isin(selected_article_transactions['customer_id'].unique()) & ~transactions['article_id'].isin(selected_articles['article_id'])
selected_user_transactions = transactions[users_mask]

In [12]:
selected_transactions = selected_article_transactions.append(selected_user_transactions, ignore_index=True)

In [13]:
articles_mask = articles['article_id'].isin(selected_transactions['article_id'].unique()) & ~articles['article_id'].isin(selected_articles['article_id'].unique())
selected_articles = selected_articles.append(articles[articles_mask])

In [14]:
customers_mask = customers['customer_id'].isin(selected_article_transactions['customer_id'].unique())
selected_customers = customers[customers_mask]

In [15]:
print(selected_transactions['article_id'].unique().shape, selected_transactions['customer_id'].unique().shape)
print(selected_articles['article_id'].unique().shape, selected_customers['customer_id'].unique().shape)

(44435,) (1849,)
(44435,) (1849,)


In [16]:
import turicreate as tc

In [17]:
del parser
del transactions
del articles
del customers

In [18]:
selected_transactions = selected_transactions.sort_values(by='t_dat')
train_mask, test_mask = ~selected_transactions['t_dat'].isin(selected_transactions['t_dat'].unique()[-7:]), selected_transactions['t_dat'].isin(selected_transactions['t_dat'].unique()[-7:])
train_data, test_data = selected_transactions[train_mask], selected_transactions[test_mask]

selected_transactions.shape, train_data.shape, test_data.shape


((194179, 5), (192692, 5), (1487, 5))

In [19]:
X = train_data.groupby(['customer_id', 'article_id'])[['t_dat']].count().reset_index().rename(columns={'t_dat': 'purchase_count'})
X = X.merge(X.groupby(['customer_id'])['purchase_count'].sum().reset_index().rename(columns={'purchase_count': 'total'}), how='outer', on='customer_id')
X['weights'] = (X['purchase_count'] / X['total']) * 100

In [20]:
X_test = test_data.groupby(['customer_id', 'article_id'])[['t_dat']].count().reset_index().rename(columns={'t_dat': 'purchase_count'})
X_test = X_test.merge(X_test.groupby(['customer_id'])['purchase_count'].sum().reset_index().rename(columns={'purchase_count': 'total'}), how='outer', on='customer_id')
X_test['weights'] = (X_test['purchase_count'] / X_test['total']) * 100

In [21]:
train = tc.SFrame(data=X[['customer_id', 'article_id', 'weights']].to_dict(orient='list'))
articles = tc.SFrame(data=selected_articles.to_dict(orient='list'))
customers = tc.SFrame(data=selected_customers.to_dict(orient='list'))

In [22]:
test = tc.SFrame(data=X_test[['customer_id', 'article_id', 'weights']].to_dict(orient='list'))

In [23]:
train.shape, articles.shape, customers.shape, test.shape

((162156, 3), (44435, 12), (1849, 7), (1328, 3))

## Model 1

In [24]:
model = tc.recommender.item_similarity_recommender.create(train, user_id='customer_id', item_id='article_id', target='weights', user_data=customers, item_data=articles, only_top_k=12, similarity_type='cosine', target_memory_usage=4589934592)

In [25]:
predictions = model.recommend(customers['customer_id'], 12)

In [26]:
predictions.print_rows(24)

+-------------------------------+------------+----------------------+------+
|          customer_id          | article_id |        score         | rank |
+-------------------------------+------------+----------------------+------+
| 001a07f8ad065d7ed4d560bd87... | 653538001  |  0.5234657067518967  |  1   |
| 001a07f8ad065d7ed4d560bd87... | 578997005  | 0.47645922349049497  |  2   |
| 001a07f8ad065d7ed4d560bd87... | 879166005  | 0.32102065819960374  |  3   |
| 001a07f8ad065d7ed4d560bd87... | 872013002  | 0.31920281740335316  |  4   |
| 001a07f8ad065d7ed4d560bd87... | 695325020  | 0.29276072061978853  |  5   |
| 001a07f8ad065d7ed4d560bd87... | 866613001  |  0.2893846401801476  |  6   |
| 001a07f8ad065d7ed4d560bd87... | 847237002  |  0.2884314243610089  |  7   |
| 001a07f8ad065d7ed4d560bd87... | 849648001  |  0.2865508978183453  |  8   |
| 001a07f8ad065d7ed4d560bd87... | 794191003  |  0.2821351427298326  |  9   |
| 001a07f8ad065d7ed4d560bd87... | 782742001  | 0.28048714307638317  |  10  |

In [27]:
predictions = predictions.to_dataframe().groupby('customer_id')[['article_id']].agg(lambda x: list(x)).reset_index()
predictions

Unnamed: 0,customer_id,article_id
0,001a07f8ad065d7ed4d560bd8703e17ce4329dc186cfac...,"[653538001, 578997005, 879166005, 872013002, 6..."
1,00228762ecff5b8d1ea6a2e52b96dafa198febddbc3bf3...,"[758118001, 758060001, 758064001, 928845001, 7..."
2,0026906d6c593cd34993ee283861e050272cb193026eb8...,"[729674002, 706487009, 577425021, 783386002, 7..."
3,003654da7d243503534e25d5a320125b51ad57c5b49aef...,"[716262002, 692783002, 624007001, 319906002, 7..."
4,0055238c01fd709813972b3366e956f35b27197c7cb35f...,"[918944001, 903311001, 876147001, 881203003, 6..."
...,...,...
5208,ff4c09948c61a6362ad0d1b1d0f264f0a29b313ad11ded...,"[584633002, 853752001, 697315001, 685504001, 6..."
5209,ff50bf86e1509f06af685ba965eca09f77c3d67d0ae9d0...,"[708107003, 762331003, 755153002, 721059003, 7..."
5210,ff69970a2d44c40f4f37020576bf4e87b250e429220529...,"[510075010, 510075007, 514865019, 549896006, 6..."
5211,ffcf4762f67665a97b16cfa85a0997dcdaf3dff28ac1c2...,"[849711003, 871852002, 862937008, 568601031, 7..."


In [28]:
test = test.to_dataframe().groupby('customer_id')[['article_id']].agg(lambda x: list(x)).reset_index()
test

Unnamed: 0,customer_id,article_id
0,0026906d6c593cd34993ee283861e050272cb193026eb8...,[918522001]
1,00d6f16c1f7b4325c65ebfeb20db8cf66975945f4c078b...,"[909529001, 909529002]"
2,00fe8fb9b5a49ed8c38699c1860847c266f288da271b1c...,"[877769001, 922037003, 926433001, 936428003]"
3,011fc4c3387f8c6eba0e7062aa47750b65d4dc2d5d6148...,"[778064028, 778064038, 852174001, 868823008, 8..."
4,01a4717d38b651e46dda7f1ab8d1494af2682a847fa9a5...,"[614854005, 835704008, 841185005, 888570001]"
...,...,...
800,fd3f2f3aa5652dcb2b7b4510f6f6f01038015c55650cf2...,"[674606048, 674606068, 858856001, 908081004, 9..."
801,fdfd48d0122ffb4833863ff1d500854f0cbf37da8ff93b...,[832307003]
802,ff15f008bdf2c7c9ce8c47cf05ffc9ef5ae5aeebebc64f...,"[742561002, 742561003, 797988001]"
803,ff50bf86e1509f06af685ba965eca09f77c3d67d0ae9d0...,[925978001]


In [29]:
preds_intersection_test = test.merge(predictions.rename(columns={'article_id': 'predictions'}), how='inner', on='customer_id')
preds_intersection_test['correct'] = preds_intersection_test.apply(lambda x: sum(_x in x['predictions'] for _x in x['article_id']), axis=1)
preds_intersection_test['correct'].sum()

1

In [36]:
preds_intersection_test[preds_intersection_test['correct'] == 1]['predictions'].to_numpy()

array([list([662369012, 741419004, 293433046, 895582015, 291338032, 858800004, 644797005, 652179004, 706837003, 412664025, 720650003, 880481001])],
      dtype=object)

In [37]:
preds_intersection_test[preds_intersection_test['correct'] == 1]['article_id'].to_numpy()

array([list([766495009, 863980001, 880481001, 881942001])], dtype=object)

In [39]:
selected_articles[selected_articles['article_id'] == 880481001]

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
98702,880481001,880481,254,1010016,9,4,5,1643,3,2,51,1002


In [41]:
similar = model.get_similar_items([880481001]).to_dataframe()

In [43]:
selected_articles[selected_articles['article_id'].isin(similar['similar'].append(pd.Series([880481001])))]

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
58507,720125051,720125,273,1010005,73,4,2,8310,9,26,5,1005
69763,758064001,758064,298,1010001,19,2,20,4242,1,1,60,1018
73492,772031001,772031,298,1010016,19,2,20,4242,1,1,60,1018
73493,772032001,772032,59,1010016,19,2,20,4242,1,1,60,1018
78256,793185029,793185,74,1010001,13,2,11,4343,2,1,66,1019
87279,826955001,826955,286,1010016,9,4,5,1338,1,1,61,1017
87280,826955002,826955,286,1010016,10,3,9,1338,1,1,61,1017
87283,826955011,826955,286,1010016,13,2,11,1338,1,1,61,1017
87284,826955013,826955,286,1010016,52,7,4,1338,1,1,61,1017
87458,827957002,827957,306,1010016,9,4,5,1338,1,1,61,1017


In [46]:
inspection = selected_transactions[selected_transactions['article_id'].isin(similar['similar'].append(pd.Series([880481001])))].groupby(['customer_id', 'article_id'])['t_dat'].count()
inspection.head(25)

customer_id                                                       article_id
019272f4940035ca856257aeecead172d3faf7315e96d04513a85ab89d407530  826955001     1
                                                                  827957002     1
0beea9e4ad809faa615082f964a381255899c9e6b99806d7f3f2174639d53810  772031001     2
                                                                  772032001     2
0dd625f10fefb44ffe057cb38de625de7e7590b8d83461b58a8032a4736ceba0  758064001     1
0e15b92db005a0568904a89e5bcf0d6c593b86d7a000312b1a59053fe3847f3a  772031001     1
0e7b1cfecec2fbfddbd1ade2be016a3f8899f322e9199a47e58c4a33457da21a  720125051     1
13361333975fd47f79fe144a83dfb1baef14b27bb174c0a7d9488bee04402299  772031001     1
                                                                  772032001     1
140a98e1ced790b986d0bcfb4f60e2df67ba6d5f94ed3492e3a9eb5548936959  827957002     1
17ad2dff81d886b0e80289b5fde8464e14c12ef90709f1e18d0193d1004c93be  758064001     1
19a4d85b5d7cedae4d000

## Model 2

In [24]:
model2 = tc.recommender.item_similarity_recommender.create(train, user_id='customer_id', item_id='article_id', target='weights', target_memory_usage=4589934592)

In [26]:
predictions = model2.recommend(customers['customer_id'], 12)

In [27]:
predictions = predictions.to_dataframe().groupby('customer_id')[['article_id']].agg(lambda x: list(x)).reset_index()
test = test.to_dataframe().groupby('customer_id')[['article_id']].agg(lambda x: list(x)).reset_index()
preds_intersection_test = test.merge(predictions.rename(columns={'article_id': 'predictions'}), how='inner', on='customer_id')
preds_intersection_test['correct'] = preds_intersection_test.apply(lambda x: sum(_x in x['predictions'] for _x in x['article_id']), axis=1)
preds_intersection_test['correct'].sum()

0

In [28]:
predictions.head()

Unnamed: 0,customer_id,article_id
0,00389391b154464ebcf31692b05b45eb7a00f3eb0795e0...,"[875672001, 834021006, 671973006, 703046002, 7..."
1,00420ecbf31e3b8acc308220be04c9c1e9b3911bf15e10...,"[788222001, 743803005, 783925002, 734647002, 7..."
2,004b58a7522e489246095b8073993b8e4603cb0ec7fda7...,"[752484002, 762158002, 759020003, 672758002, 6..."
3,006fb99924ac083e0d98f8af637a934a75adaeee74f1c0...,"[805144001, 509893021, 443860005, 667985014, 6..."
4,00993551a798a0c197a5c1247ebd975079b7088b0a9ce4...,"[669916001, 624645009, 575214004, 595841002, 6..."


In [29]:
similar = model2.get_similar_items([875672001])

In [30]:
similar

article_id,similar,score,rank
875672001,826069002,0.5,1
875672001,546579006,0.5,2
875672001,506098011,0.5,3
875672001,772193001,0.5,4
875672001,805983001,0.5,5
875672001,819207001,0.5,6
875672001,843869002,0.5,7
875672001,855126001,0.5,8
875672001,846347003,0.5,9
875672001,557051001,0.5,10
