In [78]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import turicreate as tc

import sys
sys.path.append("..")
customers = pd.read_csv('./assets/customer.csv') 
transactions = pd.read_csv('./assets/transaction.csv')

In [100]:
data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)
print(data.shape)
data.head

(133585, 3)


<bound method NDFrame.head of         customerId  productId  purchase_count
0                0          1               2
1                0         13               1
2                0         19               3
3                0         20               1
4                0         31               2
...            ...        ...             ...
133580       28596        211               3
133581       28596        255               1
133582       28598        212               1
133583       28604        282               1
133584       28605         92               1

[133585 rows x 3 columns]>

In [80]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

In [81]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')

In [82]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [83]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(133585, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


In [84]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()

In [85]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [86]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [87]:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 

In [88]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [89]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.10310015082359314  |  1   |
|    1553    |     35    | 0.09922564029693604  |  2   |
|    1553    |     1     | 0.07530674338340759  |  3   |
|    1553    |     5     | 0.06649352610111237  |  4   |
|    1553    |     21    |  0.0630517303943634  |  5   |
|    1553    |     17    | 0.061228662729263306 |  6   |
|    1553    |     8     | 0.05764678120613098  |  7   |
|    1553    |     13    | 0.05419263243675232  |  8   |
|    1553    |     19    |  0.0525418221950531  |  9   |
|    1553    |     61    | 0.052282869815826416 |  10  |
|   20400    |     26    |         0.0          |  1   |
|   20400    |    287    |         0.0          |  2   |
|   20400    |     20    |         0.0          |  3   |
|   20400    |     46    |         0.0          |  4   |
|   20400    |    115    |     

In [90]:
users_to_recommend = list(customers[user_id])

final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', 
                                            similarity_type='cosine')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.12324784994125366  |  1   |
|    1553    |     35    | 0.10447167158126831  |  2   |
|    1553    |     1     | 0.10348175764083863  |  3   |
|    1553    |     5     |  0.0906752586364746  |  4   |
|    1553    |     17    | 0.07659814357757569  |  5   |
|    1553    |     21    | 0.07491707801818848  |  6   |
|    1553    |     33    |  0.0668614387512207  |  7   |
|    1553    |     47    | 0.06058878898620605  |  8   |
|    1553    |     61    | 0.060317397117614746 |  9   |
|    1553    |     15    | 0.05949603319168091  |  10  |
|   20400    |     26    | 0.05812269449234009  |  1   |
|   20400    |     6     | 0.05361741781234741  |  2   |
|   20400    |    113    | 0.05312788486480713  |  3   |
|   20400    |     1     | 0.05210459232330322  |  4   |
|   20400    |     15    | 0.04

In [91]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(10000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1553,2,0.123248,1
1,1553,35,0.104472,2
2,1553,1,0.103482,3
3,1553,5,0.090675,4
4,1553,17,0.076598,5


In [92]:
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))

df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates().sort_values('customerId').set_index('customerId')


In [97]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('./assets/outputfinal.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [98]:
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(1000, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
4,226|247|230|248|125|204|155|294|72|276
11,226|247|230|248|125|204|155|294|276|129
12,226|247|230|248|125|204|155|294|72|276
16,226|247|230|248|125|204|155|294|72|276
21,226|247|230|248|125|204|155|294|72|276
