In [28]:
import pandas as pd
import numpy as np
from scipy import sparse
import itertools
from datetime import datetime
import matplotlib.pyplot as plt
import random
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics import mean_squared_error

from math import sqrt
import warnings; warnings.simplefilter('ignore')

In [29]:
data = pd.read_excel('Latestpredictiondata.xlsx')

In [30]:
data.head(15)

Unnamed: 0,bilNumber,CustomerId,item,category,colour,size(inches),weight (g)
0,11755,90712762,gch-3470,gold chain,Gold,16,25
1,11756,51319198,lsp-0,loose pearls,Gold,16,25
2,11756,51319198,lsp-0,loose pearls,Gold,16,25
3,11756,51319198,pset-95885,pearl set,Gold,16,25
4,11756,51319198,pps-39735,pearl pendent set,Gold,16,25
5,11756,51319198,ptop-174643,pearl tops,Gold,16,25
6,11756,51319198,ptop-171494,pearl tops,Gold,16,25
7,11757,52135764,lsp-0,loose pearls,Gold,16,25
8,11757,52135764,pml-27323,pearl mala,Gold,16,25
9,11757,52135764,ptop-188696,pearl tops,Gold,16,25


In [5]:
# start data cleansing
# check variable type and missing value
print(data.head(15))

    bilNumber  CustomerId         item           category colour  \
0       11755    90712762     gch-3470         gold chain   Gold   
1       11756    51319198        lsp-0       loose pearls   Gold   
2       11756    51319198        lsp-0       loose pearls   Gold   
3       11756    51319198   pset-95885          pearl set   Gold   
4       11756    51319198    pps-39735  pearl pendent set   Gold   
5       11756    51319198  ptop-174643         pearl tops   Gold   
6       11756    51319198  ptop-171494         pearl tops   Gold   
7       11757    52135764        lsp-0       loose pearls   Gold   
8       11757    52135764    pml-27323         pearl mala   Gold   
9       11757    52135764  ptop-188696         pearl tops   Gold   
10      11757    52135764  ptop-185533         pearl tops   Gold   
11      11758       11758     gch-3238         gold chain   Gold   
12      11758       11758     gbrs-143      gold bracelet   Gold   
13      11758       11758    gtop-3690          

In [6]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3583 entries, 0 to 3582
Data columns (total 7 columns):
bilNumber       3583 non-null int64
CustomerId      3583 non-null int64
item            3583 non-null object
category        3583 non-null object
colour          3583 non-null object
size(inches)    3583 non-null int64
weight (g)      3583 non-null int64
dtypes: int64(4), object(3)
memory usage: 196.0+ KB
None


In [7]:
print(data.describe())

          bilNumber    CustomerId  size(inches)   weight (g)
count   3583.000000  3.583000e+03   3583.000000  3583.000000
mean   12288.436227  5.134430e+07     17.046888    43.555680
std      300.771793  3.020393e+07      0.705748    10.355903
min    11755.000000  3.013000e+03     16.000000    15.000000
25%    12032.000000  2.352930e+07     17.000000    45.000000
50%    12295.000000  4.951629e+07     17.000000    50.000000
75%    12563.000000  8.120266e+07     18.000000    50.000000
max    12779.000000  9.998586e+07     18.000000    50.000000


In [8]:
# check no. of user and item before data cleansing
print('Number of users: ', data.CustomerId.nunique())
print('Number of items: ', data.item.nunique())

Number of users:  853
Number of items:  3322


In [9]:
# aggregate the total quantity of each item over all time
group = data.groupby(['CustomerId', 'item','bilNumber'])

In [10]:
data[data['CustomerId'] == 11759]

Unnamed: 0,bilNumber,CustomerId,item,category,colour,size(inches),weight (g)
14,11759,11759,gbrs-143,gold bracelet,Gold,16,25
15,11759,11759,gch-3116,gold chain,Gold,16,25
16,11759,11759,gtop-3722,gold tops,Gold,16,25
17,11759,11759,drng-660,diamond ring...,Gold,16,25


In [11]:
data[data['CustomerId'] == 51319198]

Unnamed: 0,bilNumber,CustomerId,item,category,colour,size(inches),weight (g)
1,11756,51319198,lsp-0,loose pearls,Gold,16,25
2,11756,51319198,lsp-0,loose pearls,Gold,16,25
3,11756,51319198,pset-95885,pearl set,Gold,16,25
4,11756,51319198,pps-39735,pearl pendent set,Gold,16,25
5,11756,51319198,ptop-174643,pearl tops,Gold,16,25
6,11756,51319198,ptop-171494,pearl tops,Gold,16,25


In [75]:
data['category'] = (data['category'].fillna(''))
data['colour'] = (data['colour'].fillna(''))
data['size(inches)'] = str(data['size(inches)'].fillna(''))
data['weight (g)'] = str(data['weight (g)'].fillna(''))

In [76]:
items = data['item']

In [77]:
items['features'] = data['category'] + data['colour'] + data['size(inches)'] + data['weight (g)']

In [78]:
#items['features'] = data['item'] + data['category'] + data['colour'] + data['size(inches)'] + data['weight (g)']

In [None]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(items['features'])

In [16]:
tfidf_matrix.shape

(3583, 602)

In [17]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
cosine_sim[0]

array([1.        , 0.96117926, 0.96117926, ..., 0.96941984, 0.96941984,
       0.96811977])

In [19]:
data = data.reset_index()
item_name = data['item'] 
indices = pd.Series(data.index, index = data['item'])
indices.head(2)

item
gch-3470    0
lsp-0       1
dtype: int64

# Item Recommendation (Content Based Filtering)

In [20]:
# Function that get item recommendations based on the cosine similarity score of items
def item_recommendations(item): 
    idx = indices[item]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    user_indices = [i[0] for i in sim_scores]
    return item_name.iloc[user_indices] 

In [21]:
item_recommendations('pset-95885') 

20       pset-94886
23       pset-89182
24       pset-66401
27       pset-95928
28       pset-96843
30       pset-95816
33       pset-96579
4         pps-39735
25        pps-39725
29        pps-39854
2142    ptop-188285
2143    ptop-188475
2144    ptop-188261
2145    ptop-188416
2146    ptop-186898
2150    ptop-106725
2156    ptop-175625
2160    ptop-188282
2163    ptop-138280
2164    ptop-187329
Name: item, dtype: object

In [22]:
item_recommendations('gtop-3690')

16        gtop-3722
22        gtop-3677
5       ptop-174643
6       ptop-171494
9       ptop-188696
10      ptop-185533
21      ptop-188859
26      ptop-185265
31      ptop-185347
32      ptop-185373
36      ptop-187222
37      ptop-187200
38      ptop-187214
40      ptop-181447
43      ptop-188658
44      ptop-187112
2142    ptop-188285
2143    ptop-188475
2144    ptop-188261
2145    ptop-188416
Name: item, dtype: object

In [23]:
item_recommendations('gch-3238')

11         gch-3238
15         gch-3116
18         gch-3453
19         gch-3457
2142    ptop-188285
2143    ptop-188475
2144    ptop-188261
2145    ptop-188416
2146    ptop-186898
2150    ptop-106725
2156    ptop-175625
2160    ptop-188282
2163    ptop-138280
2164    ptop-187329
2177    ptop-186684
2178    ptop-186683
2179    ptop-189506
2180    ptop-189504
2181    ptop-189356
2183    ptop-189348
Name: item, dtype: object

In [59]:
def get_features(item_list):
    items_userwise = data['item'].isin(item_list)
    df1 = pd.DataFrame(data = data[items_userwise], columns=['item'])
    itemlist = df1['item'].tolist() 
    item_list = data['item'].isin(itemlist)     
    df_temp = pd.DataFrame(data = data[item_list], columns=['item','CustomerId','bilNumber','category'])
    return df_temp 

In [60]:
get_features(item_recommendations('pset-95885')) 

Unnamed: 0,item,CustomerId,bilNumber,category
4,pps-39735,51319198,11756,pearl pendent set
20,pset-94886,42694652,11761,pearl set
23,pset-89182,68687,11764,pearl set
24,pset-66401,68687,11764,pearl set
25,pps-39725,68687,11764,pearl pendent set
27,pset-95928,810937,11765,pearl set
28,pset-96843,37543539,11766,pearl set
29,pps-39854,37543539,11766,pearl pendent set
30,pset-95816,20830719,11767,pearl set
33,pset-96579,67678400,11768,pearl set


In [61]:
get_features(item_recommendations('gtop-3690'))

Unnamed: 0,item,CustomerId,bilNumber,category
5,ptop-174643,51319198,11756,pearl tops
6,ptop-171494,51319198,11756,pearl tops
9,ptop-188696,52135764,11757,pearl tops
10,ptop-185533,52135764,11757,pearl tops
16,gtop-3722,11759,11759,gold tops
21,ptop-188859,48320983,11762,pearl tops
22,gtop-3677,73556247,11763,gold tops
26,ptop-185265,68687,11764,pearl tops
31,ptop-185347,20830719,11767,pearl tops
32,ptop-185373,20830719,11767,pearl tops


In [33]:
get_features(item_recommendations('gtop-3690'))

Unnamed: 0,item,CustomerId,bilNumber,category,colour,size(inches),weight (g)
5,ptop-174643,51319198,11756,pearl tops,Gold,16,25
6,ptop-171494,51319198,11756,pearl tops,Gold,16,25
9,ptop-188696,52135764,11757,pearl tops,Gold,16,25
10,ptop-185533,52135764,11757,pearl tops,Gold,16,25
16,gtop-3722,11759,11759,gold tops,Gold,16,25
21,ptop-188859,48320983,11762,pearl tops,Gold,16,25
22,gtop-3677,73556247,11763,gold tops,Gold,16,25
26,ptop-185265,68687,11764,pearl tops,Gold,16,25
31,ptop-185347,20830719,11767,pearl tops,Gold,16,25
32,ptop-185373,20830719,11767,pearl tops,Gold,16,25


In [36]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(data, test_size=0.2)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [38]:
train_data.shape 

(2866, 7)

In [39]:
test_data.shape 

(717, 7)

In [40]:
train_data.head()

Unnamed: 0,bilNumber,CustomerId,item,category,colour,size(inches),weight (g)
531,11921,98861521,ptop-188912,pearl tops,Red,16,30
3151,12681,43121092,ptop-189838,pearl tops,Light Gold,18,50
273,11840,57828497,pps-40341,pearl pendent set,Yellow,16,30
2843,12596,15436043,pset-97404,pearl set,Light Gold,18,50
3559,12770,48654539,pml-28187,pearl mala,Light Silver,18,50


In [53]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.as_matrix(columns = ['size(inches)','weight (g)'])
test_data_matrix = test_data.as_matrix(columns = ['size(inches)','weight (g)'])

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix.shape)

(2866, 2)
(717, 2)


In [54]:
from sklearn.metrics.pairwise import pairwise_distances
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])  

[[1.         0.78467103]
 [0.78467103 1.        ]]


In [55]:
# Function to predict items 
def predict(item, similarity, type='user'):
    if type == 'user':
        mean_user_item = item.mean(axis=1)
        # Use np.newaxis so that mean_user_items has same format as items
        items_diff = (item - mean_user_item[:, np.newaxis])
        pred = mean_user_item[:, np.newaxis] + similarity.dot(items_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item': 
        pred = item.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred 

# Evaluation using RMSE

In [56]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual)) 

In [57]:
# Predict items on the training data with both similarity score
item_prediction = predict(train_data_matrix, item_correlation, type='item')

In [58]:
# RMSE on the train data
print('Item-based  RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

Item-based  RMSE: 12.407364036637277


The RMSE (Root Mean Square Error) 
- It is a frequently used measure of the differences between values by a model and the values actually observed.