In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random
import datetime
from scipy import stats

In [2]:
df_train=pd.read_csv("train_5UKooLv.csv")
df_test=pd.read_csv('test_J1hm2KQ.csv',usecols =df_train.columns)

In [3]:
df_test.head()

Unnamed: 0,Country,CustomerID,InvoiceDate,InvoiceNo,Quantity,StockCode,UnitPrice
0,PX,127269,1/12/2010 8:28,127269,7,22633V,1.85
1,PX,227268,1/12/2010 8:34,227268,38,84879M,1.69
2,PX,227268,1/12/2010 8:34,227268,7,22748P,2.1
3,PX,227268,1/12/2010 8:34,227268,9,22749K,3.75
4,PX,227268,1/12/2010 8:34,227268,2,22622G,9.95


In [4]:
df_train.head()

Unnamed: 0,CustomerID,InvoiceNo,Quantity,InvoiceDate,UnitPrice,Country,StockCode
0,27270,27270,7,01/12/10 8:26,2.55,PX,85123AY
1,27270,27270,7,01/12/10 8:26,3.39,PX,71053R
2,27270,27270,9,01/12/10 8:26,2.75,PX,84406BH
3,27270,27270,7,01/12/10 8:26,3.39,PX,84029GV
4,27270,27270,7,01/12/10 8:26,3.39,PX,84029EX


In [5]:
print(df_test['StockCode'].nunique())
print(df_train['StockCode'].nunique())

3522
3810


In [6]:
print(len(df_train[df_train['Quantity']<0]))
print(len(df_test[df_test['Quantity']<0]))

5588
1762


In [7]:
df_train.loc[df_train[df_train['Quantity']<0].index, 'Quantity'] *=-1
df_test.loc[df_test[df_test['Quantity']<0].index, 'Quantity'] *=-1

print(len(df_train[df_train['Quantity']<0]))
print(len(df_test[df_test['Quantity']<0]))

0
0


In [8]:
df=pd.concat([df_train,df_test])

In [9]:
df.head()

Unnamed: 0,CustomerID,InvoiceNo,Quantity,InvoiceDate,UnitPrice,Country,StockCode
0,27270,27270,7,01/12/10 8:26,2.55,PX,85123AY
1,27270,27270,7,01/12/10 8:26,3.39,PX,71053R
2,27270,27270,9,01/12/10 8:26,2.75,PX,84406BH
3,27270,27270,7,01/12/10 8:26,3.39,PX,84029GV
4,27270,27270,7,01/12/10 8:26,3.39,PX,84029EX


In [10]:
#count the quantity of a given product purchased by every user
grp_data =pd.DataFrame( df.groupby(['CustomerID','StockCode'])['StockCode'].agg(total='count').reset_index()[['CustomerID','StockCode','total']])

In [11]:
grp_data.total.unique()

array([ 1,  2,  3,  4,  6,  5,  7,  8,  9, 14, 22, 10, 11])

In [12]:
df_code = pd.DataFrame(df['StockCode'].unique(),columns=['StockCode'])
codes = df_code['StockCode'].apply(lambda x:x[-2:])
df_code['alphabet_codes'] = codes.apply(lambda x:x[1] if x[0].isdigit() else x)
df_code.head()

Unnamed: 0,StockCode,alphabet_codes
0,85123AY,AY
1,71053R,R
2,84406BH,BH
3,84029GV,GV
4,84029EX,EX


In [13]:
num_code = df_code['StockCode'].apply(lambda x:x[0:3])
num_code = num_code.apply(lambda x:x[:-1] if x[-1].isdigit()==False else x)
df_code['num_code']=num_code
df_code.head()

Unnamed: 0,StockCode,alphabet_codes,num_code
0,85123AY,AY,851
1,71053R,R,710
2,84406BH,BH,844
3,84029GV,GV,840
4,84029EX,EX,840


##### Assumption: StockCode is of the form XXX-XX-AB

In [14]:
modal_pr =pd.DataFrame( df.groupby(['StockCode'])['UnitPrice'].agg(price=pd.Series.mode).reset_index()[['StockCode','price']])
modal_pr.head()

Unnamed: 0,StockCode,price
0,10002F,0.85
1,10080Q,0.39
2,10120M,0.21
3,10123CU,0.65
4,10124AW,0.42


In [15]:
df_code = df_code.merge(modal_pr,on=['StockCode'],how='inner')
df_code.head()

Unnamed: 0,StockCode,alphabet_codes,num_code,price
0,85123AY,AY,851,2.95
1,71053R,R,710,3.75
2,84406BH,BH,844,4.15
3,84029GV,GV,840,4.25
4,84029EX,EX,840,4.25


In [16]:
#Normalise purchase of each product for every user
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='total', index='CustomerID', columns='StockCode')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['CustomerID'], value_name='scaled_purchase_freq').dropna()



data_norm = normalize_data(grp_data)

In [17]:
data_norm.head()

Unnamed: 0,CustomerID,StockCode,scaled_purchase_freq
57,25470,10002F,0.0
76,31770,10002F,0.0
114,41760,10002F,0.0
151,50949,10002F,0.0
173,56340,10002F,0.0


In [18]:
from sklearn.model_selection import train_test_split
import time
import turicreate as tc

In [19]:
def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data
train_data, test_data = split_data(grp_data)

In [20]:
train_data.head()

CustomerID,StockCode,total
194508,22627E,1
222768,23166P,1
217278,84598D,1
166329,22925Q,1
302787,47580T,1
543555,48194E,1
33759,22704P,1
60039,22467G,4
99099,22745M,1
560835,23419X,1


In [21]:
#testing model
# constant variables to define field names include:
user_id = 'CustomerID'
item_id = 'StockCode'
users_to_recommend = list(train_data[user_id][0:3])
n_rec = 20 # number of items to recommend
n_display = 20 # to display the first few rows in an output dataset

In [22]:
df_code.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3860 entries, 0 to 3859
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   StockCode       3860 non-null   object
 1   alphabet_codes  3860 non-null   object
 2   num_code        3860 non-null   object
 3   price           3860 non-null   object
dtypes: object(4)
memory usage: 150.8+ KB


In [25]:
df_code.price=df_code.price.astype('str')
df_code.head()

Unnamed: 0,StockCode,alphabet_codes,num_code,price
0,85123AY,AY,851,2.95
1,71053R,R,710,3.75
2,84406BH,BH,844,4.15
3,84029GV,GV,840,4.25
4,84029EX,EX,840,4.25


In [34]:
customer = 25470
no_of_products = df[df['CustomerID']==customer]['StockCode'].nunique()
model.recommend(users=[25470], k=no_of_products)

CustomerID,StockCode,score,rank
25470,22423U,0.9394133687019348,1
25470,23301G,0.9129734635353088,2
25470,82583N,0.9022037386894226,3
25470,22722E,0.863328754901886,4
25470,22470E,0.8460075259208679,5
25470,22457C,0.8386316299438477,6
25470,84380W,0.827761709690094,7
25470,22993E,0.814311683177948,8
25470,20725N,0.7777132391929626,9
25470,22139Q,0.7560898661613464,10


In [26]:
user_data = None
item_data = tc.SFrame(df_code)
model = tc.ranking_factorization_recommender.create(tc.SFrame(train_data), 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target='total',user_data=user_data, item_data=item_data, 
                                                    solver='ials')

In [27]:
model.save('final_model_submission')

###### Implementing the model

In [28]:
test_user = pd.DataFrame(df_test['CustomerID'].unique(), columns=['CustomerID'])
test_user.head()

Unnamed: 0,CustomerID
0,127269
1,227268
2,327267
3,162738
4,252747


In [29]:
test_user['no_reco_to_made'] = test_user['CustomerID'].apply(lambda x:df[df['CustomerID']==x]['StockCode'].nunique())

In [30]:
test_user['Items'] = test_user['CustomerID'].apply(lambda x: list(model.recommend(users=[x], 
                                                                  k=int(test_user[test_user['CustomerID']==x]['no_reco_to_made']))
                                                                  ['StockCode']))

In [31]:
test_user.drop('no_reco_to_made',axis=1).to_csv('./new_submissions_data.csv',index=False,header=True)