In [53]:
import pandas as pd

df = pd.read_csv('datasets/users_interactions.csv')

df.shape

(72312, 8)

In [54]:
df['rating'] = 0
df.head()

df.loc[df['eventType'] == 'VIEW','rating'] = 1
df.loc[df['eventType'] == 'LIKE','rating'] = 2
df.loc[df['eventType'] == 'FOLLOW','rating'] = 3
df.loc[df['eventType'] == 'BOOKMARK', 'rating'] = 4
df.loc[df['eventType'] == 'COMMENT CREATED', 'rating'] = 5

display(df['rating'].unique())
df.head()

array([1, 3, 4, 2, 5], dtype=int64)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,rating
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1


In [55]:
print(f"Number of movies: {df.contentId.nunique()}")
print(f"Number of users: {df.personId.nunique()}")
print(f"Number of ratings: {df.shape[0]}")
print(f"Possible ratings: {df.contentId.nunique() * df.personId.nunique()}")
print(f"Sparseness: {1 - (df.shape[0] / (df.contentId.nunique() * df.personId.nunique()))}")

Number of movies: 2987
Number of users: 1895
Number of ratings: 72312
Possible ratings: 5660365
Sparseness: 0.9872248521075938


In [56]:
print('\nNum ratings by rater')
print('Min:\t\t', df['personId'].value_counts().min())
print('Quartile 1:\t', df['personId'].value_counts().quantile(.25))
print('Median:\t\t', df['personId'].value_counts().quantile(.5))
print('Quartile 3:\t', df['personId'].value_counts().quantile(.75))
print('Max:\t\t', df['personId'].value_counts().max())


Num ratings by rater
Min:		 1
Quartile 1:	 3.0
Median:		 10.0
Quartile 3:	 32.0
Max:		 1885


In [57]:
value_counts = df['personId'].value_counts() 
print(value_counts.shape)
keep_list = value_counts[value_counts > 10]
print(keep_list.shape)

df = df.loc[df.personId.isin(keep_list.index)]
df.shape

(1895,)
(894,)


(67974, 9)

In [58]:
# Whittle down to df_triple to only work with necessary values for collaborative filtering
df_triple = df[['personId', 'contentId', 'rating']] 
display(df_triple.head())
df_triple.shape

Unnamed: 0,personId,contentId,rating
0,-8845298781299428018,-3499919498720038879,1
1,-1032019229384696495,8890720798209849691,1
2,-1130272294246983140,310515487419366995,1
3,344280948527967603,310515487419366995,3
4,-445337111692715325,-7820640624231356730,1


(67974, 3)

In [59]:
# Check for duplicate ratings:
print(f"Duplicate ratings: {df_triple.duplicated(subset=['personId', 'contentId']).sum()}")

Duplicate ratings: 30428


In [60]:
# Clean duplicates:
print(f'Total rows before dropping duplicates: {df_triple.shape[0]}')

# keep='last' if you want to keep the last record
# keep=False if you want to drop all duplicates
df_triple.drop_duplicates(subset=['personId', 'contentId'], keep='first', inplace=True)

# These print statements are just a 'sanity check' to make sure we truly didn't have duplicates
print(f'Total rows after dropping duplicates: {df_triple.shape[0]}')

Total rows before dropping duplicates: 67974
Total rows after dropping duplicates: 37546


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_triple.drop_duplicates(subset=['personId', 'contentId'], keep='first', inplace=True)


In [61]:
# Let's make all four objects in a function
def create_matrix(df, user, item, rating):
  import numpy as np
  from scipy.sparse import csr_matrix

  U = df[user].nunique()  # Number of users for the matrix
  I = df[item].nunique()  # Number of items for the matrix

  # Map user and item IDs to matrix indices
  user_mapper = dict(zip(np.unique(df[user]), list(range(U))))
  item_mapper = dict(zip(np.unique(df[item]), list(range(I))))

  # Map matrix indices back to IDs
  user_inv_mapper = dict(zip(list(range(U)), np.unique(df[user])))
  item_inv_mapper = dict(zip(list(range(I)), np.unique(df[item])))

  # Create a list of index values for the csr_matrix for users and movies
  user_index = [user_mapper[i] for i in df[user]]
  item_index = [item_mapper[i] for i in df[item]]

  # Build the final matrix which will look like: (itemId, userId) rating
  X = csr_matrix((df[rating], (item_index, user_index)), shape=(I, U))

  return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

# Call the function and store the objects needed to calculate similarity and make recommendations
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = create_matrix(df_triple, 'personId', 'contentId', 'rating')

print(X) # (movieId, userId)   rating
print(user_mapper)
print(user_inv_mapper)
print(item_mapper)
print(item_inv_mapper)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 37546 stored elements and shape (2982, 894)>
  Coords	Values
  (0, 116)	2
  (0, 165)	1
  (0, 388)	1
  (0, 521)	2
  (0, 878)	1
  (1, 1)	2
  (1, 110)	1
  (1, 432)	1
  (1, 525)	1
  (1, 569)	5
  (1, 705)	1
  (1, 723)	1
  (2, 23)	1
  (2, 80)	1
  (2, 104)	1
  (2, 166)	1
  (2, 303)	1
  (2, 311)	1
  (2, 385)	1
  (2, 417)	1
  (2, 430)	2
  (2, 462)	1
  (2, 539)	1
  (2, 578)	1
  (2, 613)	4
  :	:
  (2980, 373)	1
  (2980, 419)	1
  (2980, 449)	1
  (2980, 495)	1
  (2980, 519)	1
  (2980, 561)	1
  (2980, 579)	1
  (2980, 586)	1
  (2980, 613)	1
  (2980, 619)	1
  (2980, 624)	1
  (2980, 646)	2
  (2980, 659)	1
  (2980, 660)	1
  (2980, 720)	1
  (2980, 747)	1
  (2980, 811)	1
  (2980, 859)	1
  (2980, 880)	1
  (2980, 889)	1
  (2981, 86)	1
  (2981, 281)	1
  (2981, 462)	1
  (2981, 570)	1
  (2981, 747)	1
{-9223121837663643404: 0, -9207251133131336884: 1, -9199575329909162940: 2, -9188188261933657343: 3, -9172914609055320039: 4, -9156344805277471150: 5, -9

In [62]:
def recommend(itemId, X, item_mapper, item_inv_mapper, k, metric='cosine', messages=True):
  from sklearn.neighbors import NearestNeighbors

  rec_ids = []                # Make a list for the recommended item IDs we'll get later
  item = item_mapper[itemId]  # Get the index of the movie ID passed into the function
  item_vector = X[item]       # Get the vector of user ratings for the movie ID passed into the function

  # Fit the clustering algorithm based on the user-item matrix X
  knn = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric).fit(X)

  # Call the trained knn cluster model to return the nearest neighbors of the item_vector passed in
  rec = knn.kneighbors(item_vector.reshape(1,-1), return_distance=True)
  rec_indeces = rec[1][0]     # Parse out the list of indeces of the recommended items
  rec_distances = rec[0][0]   # Parse out the recommendation strength calculated as the distance from the cluster center
  rec_distances = np.delete(rec_distances, 0) # Drop the first number in the list because it is the distance of itemId from itself

  # We need to replace the recommended item indeces with their original item IDs
  for i in range(1, knn.n_neighbors): # n_neighbors is the number of neighbors to return
    rec_ids.append(item_inv_mapper[rec_indeces[i]])

  # It may help to see what this is. The distance list is first and the recommended item indeces are second
  if messages:
    print(f'List of recommended item indeces:\n{rec_indeces}\n')
    print(f'List of recommended item IDs:\n{rec_ids}\n')
    print(f'List of recommended item similarity to selected item:\n{rec_distances}\n')

  # Return two lists: the original item IDs of the recommendations and their similarity scores
  return rec_ids, rec_distances

      # Use any movie id here to get recommendations
contentId = 310515487419366995
similar_ids, rec_distances = recommend(contentId, X, item_mapper, item_inv_mapper, k=5, metric='cityblock')

display(similar_ids)
display(rec_distances)

List of recommended item indeces:
[1534  647  418   53  610 2111]

List of recommended item IDs:
[-5331421579623797776, -6727357771678896471, -9002191823887486987, -5573840044320724169, 3980405883168341377]

List of recommended item similarity to selected item:
[47. 48. 48. 48. 48.]



[-5331421579623797776,
 -6727357771678896471,
 -9002191823887486987,
 -5573840044320724169,
 3980405883168341377]

array([47., 48., 48., 48., 48.])

In [63]:
df_triple.head()

Unnamed: 0,personId,contentId,rating
0,-8845298781299428018,-3499919498720038879,1
1,-1032019229384696495,8890720798209849691,1
2,-1130272294246983140,310515487419366995,1
3,344280948527967603,310515487419366995,3
4,-445337111692715325,-7820640624231356730,1


In [73]:
# How many recommendations per article would you like?
k = 5

# Get a list of recommendations for all movies; you can store this list as a "trained model" of sorts
df_recommendations = pd.DataFrame(index=item_mapper)
for i in range(1, k):
  df_recommendations[f'Recommendation {i}'] = None

for row in df_recommendations.itertuples():
  # Get a ranked list of recommendati ons
  rec_ids, rec_distances = recommend(row[0], X, item_mapper, item_inv_mapper, k=k, messages=False)

  # Get the title of the 'liked' movie in this row
  # df_recommendations.at[row[0], 'If you liked'] = row[0]

  # Get the titles of the recommended movies
  for i, r in enumerate(rec_ids):    
    df_recommendations.at[row[0], f'Recommendation {i+1}'] = r

# Store df_recommendations in a relational DB.
#
# from sqlalchemy import create_engine
# engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
#                       .format(user="root",
#                               pw="12345",
#                               db="employee"))
# df_recommendations.to_sql('book_details', con = engine, if_exists = 'append', chunksize = 1000)

df_recommendations.tail()

Unnamed: 0,Recommendation 1,Recommendation 2,Recommendation 3,Recommendation 4,Recommendation 5
9213260650272029784,-1443593727366384026,3770082505811422124,-4608017393995766765,-9194572880052200111,8.961596e+17
9215261273565326920,-4295420427690447901,5965544132537436406,-7447329442374055391,-8158171393641147845,-8.819557e+18
9217155070834564627,-1706114177222872702,4303142677581479651,1362723651715193305,-720777800522871309,-7.454908e+18
9220445660318725468,-6893108632164641498,-3920124114454832425,-3980612444042641628,1333053624108726661,-9.128652e+18
9222265156747237864,5274322067107287523,-5954724412982705962,-555187669877280641,-7702672626132856079,-7.38055e+18


In [75]:
df_recommendations.to_csv('Article_Recommendations.csv')