In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import turicreate as tc

### 1. Read transactions file

In [2]:
# Only the (customer, article) pairs are used downstream, so read just those
# two columns instead of loading the whole file and dropping t_dat, price and
# sales_channel_id afterwards.
transactions = pd.read_csv(
    "../../data/transactions_train.csv",
    usecols=['customer_id', 'article_id'],
)

In [3]:
# Pull the article ids out of the extracted main-articles file. Only the plain
# list of ids is kept; the frame it came from is never bound to a name.
articles_to_keep = pd.read_csv('../extracted/main_articles.csv')['article_id'].to_list()

In [4]:
# Restrict the transactions to the extracted articles (the ids in articles_to_keep).
transactions = transactions.loc[transactions['article_id'].isin(articles_to_keep)]


In [5]:
# Count how many times each customer bought each article.
# groupby(...).size() gives one row per (customer_id, article_id) pair directly,
# and reset_index(name='count') names the counts column explicitly, so the
# result does not depend on how value_counts() names its output.
transactions = (
    transactions
    .groupby(['customer_id', 'article_id'])
    .size()
    .reset_index(name='count')
)
# Constant marker: each row stands for one distinct (customer, article) pair.
transactions['count_unique'] = 1

In [6]:
# Preview the first rows of the per-customer, per-article purchase counts.
transactions.head()

Unnamed: 0,customer_id,article_id,count,count_unique
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006,2,1
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,795440001,1,1
2,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,351484002,3,1
3,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,599580024,2,1
4,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,599580055,2,1


### 2. Pivot and scale

In [7]:
# Spread the counts into a grid with one row per customer and one column per
# article; the long-format name is deleted once the grid exists.
matrix = transactions.pivot_table(
    index='customer_id',
    columns='article_id',
    values='count',
)
del transactions

In [8]:
# (number of customers, number of articles) in the pivoted grid.
matrix.shape

(749897, 408)

In [9]:
# Min-max scale every article column to [0, 1].
# An article whose observed counts are all identical has a zero range; dividing
# by it would turn the whole column into NaN, and the later dropna() would then
# silently discard every interaction with that article. Such columns are
# divided by 1 instead, so their entries scale to 0.0.
col_min = matrix.min()
col_range = matrix.max() - col_min
col_range[col_range == 0] = 1
matrix = (matrix - col_min) / col_range

In [10]:
# Move customer_id out of the index into an ordinary column (in place) and give
# the fresh row index a name.
# NOTE(review): because matrix is mutated in place, re-running this cell without
# re-running the cells that build matrix will not reproduce the same result.
matrix.reset_index(inplace=True)
matrix.index.name = 'scaled_freq'

# Unpivot to one row per (customer, article) pair, then drop the empty cells.
transactions_norm = matrix.melt(id_vars='customer_id', value_name='scaled_freq').dropna()

transactions_norm.shape

(2697966, 3)

In [11]:
# Preview the long-format (customer_id, article_id, scaled_freq) rows.
transactions_norm.head()

Unnamed: 0,customer_id,article_id,scaled_freq
181,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,108775015,0.142857
317,001ae5408a043f64bccd32beffe2730151414cbdf18a6e...,108775015,0.071429
325,001ba9e81e13ce12a2585d9ebde923fe74429e9e12ea59...,108775015,0.0
401,0022a721371d5949d174ecba60346d89a9d6c08c0fba4f...,108775015,0.071429
404,002323971cbd38fad4512d5114676e5e17eb262db02320...,108775015,0.0


In [12]:
# Hold out 20% of the interactions for evaluation and convert both parts to
# SFrames. The split is seeded so that restarting the kernel and running all
# cells yields the same partition, and therefore the same metrics; without
# random_state each run shuffles differently.
train, test = [
    tc.SFrame(part)
    for part in train_test_split(transactions_norm, test_size=.2, random_state=42)
]
# del transactions_norm


In [13]:
# variables
_CUST = 'customer_id'      # name of the user column
_ARTI = 'article_id'       # name of the item column
# Customers to recommend for. Only the id column is needed, so only that
# column is parsed from the file.
_CUSTOMERS_REC = list(pd.read_csv('../../data/customers.csv', usecols=['customer_id'])['customer_id'])
_NRECS = 10                # recommendations per customer
_NDISPLAY = 30             # rows of recommendations to display
_TARGET = 'scaled_freq'    # name of the target column


In [16]:
def train_recc(train_data, mode, user_id, item_id, target, users_to_recc, n_display, n_rec,
               show_recs=False):
    """Train a Turi Create recommender of the requested kind and return it.

    Parameters
    ----------
    train_data
        Interaction data passed straight to the recommender's ``create``.
    mode : str
        ``'popularity'`` builds a popularity recommender; ``'cosine'`` and
        ``'pearson'`` build an item-similarity recommender with that
        similarity type.
    user_id, item_id, target : str
        Column names forwarded to ``create``.
    users_to_recc
        Users to generate recommendations for (only used when ``show_recs``).
    n_display : int
        Number of recommendation rows to print (only used when ``show_recs``).
    n_rec : int
        Recommendations per user (only used when ``show_recs``).
    show_recs : bool, default False
        When True, compute recommendations for ``users_to_recc`` and print the
        first ``n_display`` rows. Left off by default because the
        recommendations are not part of the return value, so computing them
        for every user would be wasted work.

    Returns
    -------
    The trained model, or ``None`` (after printing a message) when ``mode`` is
    not one of the supported values.
    """
    if mode == 'popularity':
        model = tc.popularity_recommender.create(
            train_data, user_id=user_id, item_id=item_id, target=target)
    elif mode in ('cosine', 'pearson'):
        # Both item-similarity variants differ only in the similarity type,
        # which is exactly the mode string.
        model = tc.item_similarity_recommender.create(
            train_data, user_id=user_id, item_id=item_id, target=target,
            similarity_type=mode)
    else:
        # List only the modes that are actually implemented.
        print("Invalid mode, choose from: 'popularity', 'cosine', 'pearson'")
        return None

    if show_recs:
        recommend = model.recommend(users=users_to_recc, k=n_rec, verbose=False)
        recommend.print_rows(n_display)
    return model
    

In [17]:
# Fit the popularity-based recommender on the training split.
popRec = train_recc(
    train_data=train,
    mode='popularity',
    user_id=_CUST,
    item_id=_ARTI,
    target=_TARGET,
    users_to_recc=_CUSTOMERS_REC,
    n_display=_NDISPLAY,
    n_rec=_NRECS,
)

In [24]:
# Precision and recall of the popularity model on the held-out split,
# reported per customer and per cutoff.
popRec.evaluate_precision_recall(test, verbose=False)

{'precision_recall_by_user': Columns:
 	customer_id	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 6131232
 
 Data:
 +-------------------------------+--------+-----------+--------+-------+
 |          customer_id          | cutoff | precision | recall | count |
 +-------------------------------+--------+-----------+--------+-------+
 | b9cdfe2bd103ed80154c60fd2b... |   1    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   2    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   3    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   4    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   5    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   6    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   7    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   8    |    0.0    |  0.0   |   1   |
 | b9cdfe2bd103ed80154c60fd2b... |   9    |    0.0    |  

RMSE:
pop: 0.055406750752293024