# Content Based Recommender system : Prediction using similar users' ratings
## Books for mystery, thriller, and crime
https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

In [1]:
import sys
import warnings

# Suppress every warning for the rest of the session, unless warning options
# were passed to the interpreter (in which case sys.warnoptions is non-empty
# and the user's own filters are left in charge).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

import pandas as pd
import numpy as np


In [2]:
import os

# Switch the working directory so the relative 'data/...' paths used by the
# cells below resolve from here.
# NOTE(review): hard-coded, machine-specific absolute path — it must be edited
# before the notebook can run anywhere else.
os.chdir('/Users/adshah/Documents/Python/Jupyter Notebooks/')


'/Users/adshah/Documents/Python/Jupyter Notebooks'

### Load Data

#### 1) General information for all books

In [None]:
# Read data/book_id_map.csv, then show its (rows, columns) shape and first 3 rows.
# presumably an id-mapping table for books (going by the file name) — TODO confirm
# NOTE(review): book_id_df is not referenced again in the visible cells.
book_id_df = pd.read_csv('data/book_id_map.csv')
print(book_id_df.shape)
book_id_df.head(3)

In [None]:
# Read data/user_id_map.csv, then show its (rows, columns) shape and first 3 rows.
# presumably an id-mapping table for users (going by the file name) — TODO confirm
# NOTE(review): user_id_df is not referenced again in the visible cells.
user_id_df = pd.read_csv('data/user_id_map.csv')
print(user_id_df.shape)
user_id_df.head(3)

In [None]:
# Read data/goodreads_book_authors.csv, then show its shape and first 3 rows.
# NOTE(review): authors_df is not referenced again in the visible cells.
authors_df = pd.read_csv('data/goodreads_book_authors.csv')
print(authors_df.shape)
authors_df.head(3)

In [None]:
# Read data/goodreads_interactions.csv, then show its shape and first 10 rows.
# NOTE(review): all_interactions_df is not referenced again in the visible
# cells; only the genre-specific interactions file loaded further down is used.
all_interactions_df = pd.read_csv('data/goodreads_interactions.csv')
print(all_interactions_df.shape)
all_interactions_df.head(10)

#### 2) Specific information for mystery, thriller, and crime books

In [None]:
books_df = pd.read_csv('data/goodreads_books_mystery_thriller_crime.csv')
# Reduce the raw `authors` string to a single id: keep the text before the
# first comma, then the segment after the first colon, then whatever sits
# between the first pair of single quotes in that segment.
books_df['authors'] = (
    books_df['authors']
    .str.split(pat=",").str[0]
    .str.split(pat=":").str[1]
    .str.split(pat="'").str[1]
)
books_df = books_df.rename(columns={'authors': 'author_id'})
# Strip the enclosing square brackets from `similar_books`.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the brackets in place.
# The raw string avoids invalid-escape warnings for \[ and \].
books_df['similar_books'] = books_df['similar_books'].str.replace(
    r"^\[|\]$", "", regex=True
)
print(books_df.shape)
books_df.head(3)

In [None]:
# Read the genre-specific interactions file, then show its shape and first 3
# rows. Later cells filter this frame on `is_read` and join it to books_df on
# `book_id`.
interactions_df = pd.read_csv('data/goodreads_interactions_mystery_thriller_crime.csv')
print(interactions_df.shape)
interactions_df.head(3)

In [None]:
# Read the genre-specific reviews file, then show its shape and first 3 rows.
# NOTE(review): reviews_df is not referenced again in the visible cells.
reviews_df = pd.read_csv('data/goodreads_reviews_mystery_thriller_crime.csv')
print(reviews_df.shape)
reviews_df.head(3)

In [None]:
# Attach book metadata to every interaction flagged as read. The left join
# keeps interactions whose book_id has no match in books_df.
books_users = pd.merge(
    interactions_df.loc[interactions_df['is_read'].eq(True)],
    books_df,
    how='left',
    on='book_id',
)

#### We now use the same data cleaning steps as we performed in the "Test Train Split" notebook. Note that the training data that we create there only includes the user_id|book_id|rating columns, which are appropriate for collaborative filtering methods.

In [None]:
# Number of distinct books per user.
grouped_df = books_users.groupby('user_id')['book_id'].nunique()

In [None]:
# Turn the per-user counts into a two-column frame: user_id, book_id (the count).
grouped_df=grouped_df.reset_index()


In [None]:
# Users with fewer than 3 distinct books are treated as irrelevant.
users_irrelevant = grouped_df.loc[grouped_df['book_id'] < 3, 'user_id']

In [None]:
# Drop every interaction that belongs to one of the irrelevant users.
books_users = books_users.loc[~books_users['user_id'].isin(users_irrelevant)]

### NLP: map all users to an n-dimensional space based on their book reading habits. This can be captured by using text processing techniques on book titles and descriptions.
 

In [None]:
# Create a new column `book_info` that concatenates book title and description.
# This is the primary text column fed to the TF-IDF vectorizer.
# Missing values are replaced by an empty string first: str() of a NaN is the
# literal word "nan", which would otherwise be tokenised as a real term.
books_users['book_info'] = (
    books_users['title'].fillna('').map(str)
    + ' '
    + books_users['description'].fillna('').map(str)
)

A major problem in splitting text into words is that variants such as "reading" and "read" are treated as different words. To address this, we tried both lemmatization and stemming techniques. In the end, we decided to apply Snowball stemming before TF-IDF vectorization.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem

eng_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfIdfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer stems every token it emits."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent class's analyzer does the splitting; each token it yields
        is then passed through the module-level ``eng_stemmer``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [eng_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens

In [None]:
#Code to try count vectorizer
# vectorizer =  StemmedCountVectorizer(min_df=0.1,max_df=0.8,analyzer="word", stop_words='english') #tune
# X = vectorizer.fit_transform(books_users['book_info'])
# print(vectorizer.get_feature_names())

In [None]:
# Keep only terms found in at least 10% and at most 80% of the documents.
vectorizer = StemmedTfIdfVectorizer(min_df=0.1, max_df=0.8, analyzer="word", stop_words='english')  # tune
X = vectorizer.fit_transform(books_users['book_info'])
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
# Dense frame of TF-IDF weights: one row per row of books_users, one column
# per vocabulary term. get_feature_names() was removed in scikit-learn 1.2, so
# use get_feature_names_out() when the installed release provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_columns = list(vectorizer.get_feature_names_out())
else:
    tfidf_columns = vectorizer.get_feature_names()
df = pd.DataFrame(data=X.toarray(), columns=tfidf_columns)

In [None]:
# Label each TF-IDF row with a "<user_id>-<book_id>" key.
df.index = (
    books_users['user_id'].map(str)
    + '-'
    + books_users['book_id'].map(str)
)

In [None]:
# Print a summary of the TF-IDF frame before it is written out.
df.info()
#create pickle file for faster data retrieval
df.to_pickle('data/tfidf_vect_df.pkl')

Now, we have tf-idf vectors for every user-book combination. However, we want to combine them to get one vector per user for K Means. In the following code, we create the mean tf-idf vector.

In [None]:
# Collapse the per-(user, book) TF-IDF rows into one mean vector per user.
tfidf_df = pd.read_pickle('data/tfidf_vect_df.pkl')
tfidf_df = tfidf_df.reset_index()
# The row key is "<user_id>-<book_id>"; split on the first '-' only.
tfidf_df[['user_id', 'book-id']] = tfidf_df['index'].str.split('-', expand=True, n=1)
# Drop both string key columns before averaging. Leaving the 'index' column in
# makes groupby().mean() raise TypeError on pandas >= 2.0, where non-numeric
# columns are no longer discarded silently.
tfidf_df_temp = tfidf_df.drop(columns=['index', 'book-id'])
tfidf_avg_df = tfidf_df_temp.groupby('user_id').mean()
tfidf_avg_df.to_pickle('data/final_tfidf_avg_df.pkl')

In [3]:
# Reload the per-user mean TF-IDF frame from the pickle written by the previous cell.
tfidf_avg_df = pd.read_pickle('data/final_tfidf_avg_df.pkl')

The data is now ready for clustering. We use 2 clustering methods - KMeans and DBSCAN. 
DBSCAN is a better algorithm to run here (no initialization for the number of clusters required). However due to the size of the dataset, it was computationally heavy.
To find the best "k" for KMeans, we computed silhouette scores for K between 2 and 500 and chose the optimal K accordingly.

In [14]:
# Per-user mean TF-IDF vectors as a NumPy array (one row per user).
# Note: this rebinds X, which earlier held the vectorizer's fit_transform output.
X=tfidf_avg_df.to_numpy()

In [17]:
# MiniBatchKMeans is imported here because no other visible cell imports it;
# without this line a fresh run fails with NameError.
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

# Score a couple of small cluster counts first.
range_clusters = [2, 5]
for n in range_clusters:
    kmeans = MiniBatchKMeans(n_clusters=n, random_state=0, batch_size=6, max_iter=10, init='k-means++')
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(n)
    print("The average silhouette_score is :")
    print(silhouette_avg)

2
The average silhouette_score is :
0.09503862496960176
5
The average silhouette_score is :
0.10322853262673345


In [None]:
# MiniBatchKMeans is imported here because no other visible cell imports it;
# without this line a fresh run fails with NameError.
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

# Same silhouette sweep as for the small cluster counts, over larger values of K.
range_clusters = [20, 35, 50, 100, 200, 350, 500]
for n in range_clusters:
    kmeans = MiniBatchKMeans(n_clusters=n, random_state=0, batch_size=6, max_iter=10, init='k-means++')
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(n)
    print("The average silhouette_score is :")
    print(silhouette_avg)

The above hyperparameter tuning gave an optimal K of 5.

In [None]:
# Imported locally so this cell runs on its own; no other visible cell imports
# MiniBatchKMeans.
from sklearn.cluster import MiniBatchKMeans

# Final clustering with the chosen K=5; cluster_labels holds one label per row of X.
kmeans = MiniBatchKMeans(n_clusters=5, random_state=0, batch_size=6, max_iter=10, init='k-means++')
cluster_labels = kmeans.fit_predict(X)

In [None]:
# Pair each user id (the index of the averaged TF-IDF frame) with its cluster
# label, index the result by user_id and write it out as CSV.
user_cluster = pd.DataFrame(
    {'user_id': tfidf_avg_df.index, 'cluster': cluster_labels}
)
user_cluster = user_cluster.set_index('user_id')
user_cluster.to_csv('user_cluster_final.csv')

We also tried dimensionality reduction to speed up the DBSCAN run. However, it led to memory errors on the HPC cluster.

In [None]:
from sklearn.decomposition import PCA
# Reduce the user vectors in X to 15 principal components; X_pca is the
# input to the DBSCAN cell.
pca = PCA(n_components=15)
X_pca=pca.fit_transform(X)

In [None]:
from sklearn.cluster import DBSCAN
import time

# Fit DBSCAN on the PCA-reduced user vectors and report the wall-clock time taken.
db = DBSCAN(eps=0.8, min_samples=4)
start = time.time()
db.fit(X_pca)
end = time.time()
print("Total time required to go over full data: ", end - start)