In [58]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [59]:
df = pd.read_csv("recomm-content.csv")

In [60]:
df.shape

(500, 2)

In [61]:
df.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [62]:
tf = TfidfVectorizer(analyzer='word',
                    ngram_range=(1,3),
                    min_df=0,
                    stop_words='english')

tfidf_matrix = tf.fit_transform(df['description'])

In [63]:
tfidf_matrix.toarray() #Count of words in the bag of words - now every row is converted into a vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [65]:
cosine_similarities

array([[1.        , 0.10110642, 0.06487353, ..., 0.06097409, 0.06546914,
        0.06955608],
       [0.10110642, 1.        , 0.4181664 , ..., 0.03550042, 0.06936414,
        0.06480538],
       [0.06487353, 0.4181664 , 1.        , ..., 0.03402428, 0.0455137 ,
        0.05038512],
       ...,
       [0.06097409, 0.03550042, 0.03402428, ..., 1.        , 0.04187121,
        0.04958298],
       [0.06546914, 0.06936414, 0.0455137 , ..., 0.04187121, 1.        ,
        0.36281626],
       [0.06955608, 0.06480538, 0.05038512, ..., 0.04958298, 0.36281626,
        1.        ]])

In [66]:
results = {}

In [67]:
for idx, row in df.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], df['id'][i]) for i in similar_indices]
    
    results[row['id']] = similar_items[1:]

In [68]:
def item(id):
    return df.loc[df['id'] == id]['description'].tolist()[0].split('-')[0]

In [70]:
#Just reads the results out of the dictionary
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")
        
        

In [71]:
recommend(item_id=11, num=5)

Recommending 5 products similar to Baby sunshade top ...
-------
Recommended: Sunshade hoody  (score:0.21330296021085024)
Recommended: Baby baggies apron dress  (score:0.10975311296284812)
Recommended: Runshade t (score:0.09988151262780731)
Recommended: Runshade t (score:0.09530698241688207)
Recommended: Runshade top  (score:0.08510550093018411)
