In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse


In [12]:
df = pd.read_csv('orders.csv')

In [13]:
df.head()

Unnamed: 0,product_id,product_name,alias_id
0,p-00002,Strictly Roses,User-c49d6fd0-4282-4963-a271-b4dd922ac37d
1,p-00001,Mixed Bouquet,User-c49d6fd0-4282-4963-a271-b4dd922ac37d
2,p-00001,Mixed Bouquet,User-6c3f3257-870e-4b65-8e28-c7f17438d0be
3,p-00001,Mixed Bouquet,User-5614b542-7a3d-4d27-85e6-9004a99d40c3
4,p-00006,Red Wine,User-ff07107a-2b88-4f99-9531-023bd42b7029


In [16]:
df = df.drop('product_name', axis=1)

In [25]:
df = pd.DataFrame(df.groupby('alias_id')['product_id'].apply(list))

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('product_id')),
                          columns=mlb.classes_,
                          index=df.index))

In [35]:
df

Unnamed: 0_level_0,p-00001,p-00002,p-00006,p-00007,p-00009,p-00010,p-00013,p-00014,p-00015,p-00016,...,p-00032,p-00033,p-00034,p-00035,p-00036,p-00037,p-00038,p-00039,p-00050,p-lambo
alias_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User-000ffe87-2b39-4128-94b4-faf2e3a1d94f,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-009ea0b2-2a91-4ed6-9d4b-5b1b8b800f28,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-00bfecda-7872-411d-ba13-b7fe9a5a3dd4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-0207ac9d-6beb-49dc-af86-f65ca234308c,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-027150bc-ee7f-4b50-99e0-fbee89a07828,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
User-fe76a105-ab99-407b-984a-58ca3306f4ec,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-ff07107a-2b88-4f99-9531-023bd42b7029,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-ff280000-4bdf-499a-8a5e-fa9c1a940ce5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
User-ff7f9013-0243-4adb-91ee-decdda9fd88c,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step we normalize the user vectors to unit vectors.

# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(df).sum(axis=1))

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
data_items = df.divide(magnitude, axis='index')

def calculate_similarity(data_items):
    """Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sim = pd.DataFrame(data=similarities, index= data_items.columns, columns= data_items.columns)
    return sim

# Build the similarity matrix
data_matrix = calculate_similarity(df)

# Lets get the top 11 similar artists for Beyonce
print (data_matrix.loc['p-00001'].nlargest(11))


p-00001    1.000000
p-00002    0.260250
p-00009    0.160980
p-00010    0.156174
p-00030    0.119280
p-00006    0.116405
p-00032    0.110432
p-00017    0.098773
p-00007    0.093743
p-00013    0.091121
p-00019    0.082824
Name: p-00001, dtype: float64


In [37]:
#------------------------
# USER-ITEM CALCULATIONS
#------------------------

user = 'User-c49d6fd0-4282-4963-a271-b4dd922ac37d' # The id of the user for whom we want to generate recommendations
user_index = df[df.index == user].index.tolist()[0] # Get the frame index

# Get the artists the user has likd.
known_user_likes = df.ix[user_index]
known_user_likes = known_user_likes[known_user_likes >0].index.values

# Users likes for all items as a sparse vector.
user_rating_vector = df.ix[user_index]

# Calculate the score.
score = data_matrix.dot(user_rating_vector).div(data_matrix.sum(axis=1))

# Remove the known likes from the recommendation.
score = score.drop(known_user_likes)

# Print the known likes and the top 20 recommendations.
print(known_user_likes)
print(score.nlargest(20))

['p-00001' 'p-00002' 'p-00007' 'p-00009' 'p-00013']
p-00017    0.242488
p-00006    0.221159
p-00010    0.220343
p-00034    0.219822
p-00030    0.182531
p-00050    0.181458
p-00031    0.174555
p-00026    0.163401
p-00032    0.135123
p-00014    0.130081
p-00019    0.129550
p-00037    0.126169
p-00035    0.115718
p-00039    0.102831
p-00028    0.100969
p-00025    0.097843
p-lambo    0.081419
p-00016    0.077674
p-00027    0.058953
p-00024    0.053629
dtype: float64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]
