In [1]:
# A Simple Content-Based Recommendation System

In [2]:
# A content-based recommendation system for items that have a text description.
# Each description is turned into a feature vector using TF-IDF.

In [1]:
import numpy as np

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the item catalogue. Later cells read its 'id' and 'description' columns.
ds = pd.read_csv("sample-data-content.csv")

# Build word-level TF-IDF features over 1- to 3-word n-grams of each
# description, ignoring English stop words.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
)

# One row per item in `ds`, one column per n-gram in the fitted vocabulary.
tfidf_matrix = tf.fit_transform(ds['description'])


In [6]:
tfidf_matrix.shape

(500, 52262)

In [7]:
# Pairwise similarity of every item against every other item. TfidfVectorizer
# L2-normalises each row by default (no `norm` override above), so the plain
# dot product computed by linear_kernel is the cosine similarity.
cosine_similarities = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [8]:
cosine_similarities.shape

(500, 500)

In [19]:
cosine_similarities[0:3,0:3]

array([[ 1.        ,  0.10110642,  0.06487353],
       [ 0.10110642,  1.        ,  0.4181664 ],
       [ 0.06487353,  0.4181664 ,  1.        ]])

In [31]:
# Row positions of the 100 items most similar to the first item, best match first.
similar_indices = np.argsort(cosine_similarities[0])[::-1][:100]

In [32]:
similar_indices

array([  0,  18, 493,  17, 171, 441, 170,  20, 494,  24, 495, 486,  19,
       340, 175, 487, 364, 339,  59, 439, 496, 172, 440, 412, 442, 173,
       358,  21,  60, 311,  22,   1, 359, 174, 328,  23, 405, 265, 276,
       213, 235, 445, 449, 413,  99, 468, 258, 418, 330,  11, 357, 324,
       302, 259, 146, 327, 499, 264,  91,  88, 391, 109, 390,  45,  61,
       199, 200,  87, 329, 179, 498,  63, 334,   8, 301, 323, 415,   2,
       203, 163, 325, 201, 406, 465,  86, 275,  89, 482, 208, 481, 480,
        36,  33, 414,  56, 104, 137, 286, 167,  29])

### For each item, rank the other items by similarity and store the 100 most similar. We stop at 100 because, well... how many similar products do you really need to show?

The similar items and their scores are stored in a dictionary keyed by item id; each value is a list of `(score, item_id)` tuples.

In [33]:
# Number of neighbours to keep for each item.
NUM_SIMILAR = 100

# Row i of `cosine_similarities` corresponds to the i-th row of `ds` (the
# matrix was fitted on ds['description'] in row order). Ids are therefore
# resolved by position, not by index label, so this also works when `ds` does
# not have a default 0..n-1 index.
item_ids = ds['id'].tolist()

results = {}

for pos, item_id in enumerate(item_ids):
    scores = cosine_similarities[pos]

    # Rank all rows by descending similarity, then drop the item itself
    # explicitly. It is not guaranteed to be ranked first: duplicate
    # descriptions tie with it, and a description made only of stop words
    # scores 0 against everything, itself included.
    similar_indices = scores.argsort()[::-1]
    similar_indices = similar_indices[similar_indices != pos][:NUM_SIMILAR]

    # Each dictionary entry is like: [(1,2), (3,4)], with each tuple being (score, item_id)
    similar_items = [(scores[i], item_ids[i]) for i in similar_indices]
    results[item_id] = similar_items


In [34]:
results

{1: [(0.22037921472617453, 19),
  (0.16938950913002357, 494),
  (0.16769458065321555, 18),
  (0.16485527745622977, 172),
  (0.14812615460586401, 442),
  (0.14577863284367545, 171),
  (0.1413764236536125, 21),
  (0.13884463426216978, 495),
  (0.13879533331363048, 25),
  (0.13813550299091404, 496),
  (0.13481110970996832, 487),
  (0.13225329613833622, 20),
  (0.13028260329762048, 341),
  (0.12768743540103286, 176),
  (0.12671622868413698, 488),
  (0.12319623660641409, 365),
  (0.12155681060658907, 340),
  (0.11800704948227406, 60),
  (0.11786722607586674, 440),
  (0.11657908072337515, 497),
  (0.11184896270837259, 173),
  (0.11069752245804719, 441),
  (0.10857685392562949, 413),
  (0.10572078621963336, 443),
  (0.10553058093119776, 174),
  (0.10403103809186293, 359),
  (0.10338035552770783, 22),
  (0.10290746221687935, 61),
  (0.10286246471301803, 312),
  (0.10166673618893814, 23),
  (0.10110641701157386, 2),
  (0.10082418508282549, 360),
  (0.099140299683494942, 175),
  (0.0988297651993

In [18]:
def item(id):
    """Return a friendly display name for the item with the given ID.

    The name is the part of the item's ``description`` field that precedes the
    first ``' - '`` separator (the whole description if there is none).

    Args:
        id: Value to look up in the ``id`` column of ``ds``.

    Returns:
        The name portion of the first matching row's description.

    Raises:
        IndexError: If no row of ``ds`` has this ID.
    """
    # NOTE: the parameter shadows the builtin ``id``; the name is kept so that
    # existing keyword callers keep working.
    matches = ds.loc[ds['id'] == id, 'description']
    if matches.empty:
        # Same exception type an unknown ID raised before, but the message now
        # says which ID was missing instead of "list index out of range".
        raise IndexError("no item with id {!r} in ds".format(id))
    return matches.iloc[0].split(' - ')[0]

def recommend(item_id, num):
    """Print the first ``num`` stored recommendations for ``item_id``.

    This is only a lookup into the precomputed ``results`` dictionary: it
    prints a header line, a separator, and one line per recommendation with
    the item's friendly name and its similarity score. Nothing is returned.
    """
    header = "Recommending {!s} products similar to {}...".format(num, item(item_id))
    print(header)
    print("-------")
    for score, similar_id in results[item_id][:num]:
        print("Recommended: {} (score:{!s})".format(item(similar_id), score))


In [19]:
# Recommendation for a single Item

In [20]:
recommend(item_id=1, num=5)

Recommending 5 products similar to Active classic boxers...
-------
Recommended: Cap 1 boxer briefs (score:0.220379214726)
Recommended: Active boxer briefs (score:0.16938950913)
Recommended: Cap 1 bottoms (score:0.167694580653)
Recommended: Cap 1 t-shirt (score:0.164855277456)
Recommended: Cap 3 bottoms (score:0.148126154606)
