#Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import json
import sys
import os
import gc
import pickle
from nltk.stem.porter import PorterStemmer
import ast
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from sklearn.feature_selection import SelectKBest, chi2
import re
import numpy as np
nltk.download('vader_lexicon')
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import gensim.downloader as api

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#Initial Process

##<h1> Data Pre Processing </h1>


In [3]:

class data_set_pre_processing:
  def __init__(self):
    # Mentioning File Paths
    self.book_json_path="/content/drive/MyDrive/Recommendation System Project 1/goodreads_books_children.json"
    self.userReview_json_path="/content/drive/MyDrive/Recommendation System Project 1/goodreads_reviews_children.json"
    self.folder_path="/content/test"
    self.goodreads_books_pkl_path="/content/test/goodreads_books_children.pkl"
    self.goodreads_books_pkl_raw_path="/content/test/goodreads_books_children_raw.pkl"
    self.goodreads_reviews_children_pkl_path="/content/test/goodreads_reviews_children.pkl"
    self.book_df=None
    # Object Initialize For Stemminization Process
    self.ps=PorterStemmer()

  # To Create Test Folder to store our files
  def create_folder(self):
      if not os.path.exists(self.folder_path):
        os.makedirs(self.folder_path)
  # it will convert the authors column format
  def convert(self,obj):
    obj=str(obj)
    l=[]
    for i in ast.literal_eval(obj):
      l.append(i["author_id"])
    return " ".join(l)
  # to do the stemmation Process
  def stem(self,text):
    pattern = r'\b\d+\b'
    text_without_numbers = re.sub(pattern, '', text)
    words = word_tokenize(text_without_numbers)
    stemmed_words = [self.ps.stem(word) for word in words]
    return ' '.join(stemmed_words)



  #Transfer processed Book  data to pkl file
  def convert_books_data(self):
    with open( self.goodreads_books_pkl_path, "wb") as f:
      temp=[]
      # Reading the csv file in chunks
      for chunk in pd.read_json(self.book_json_path,lines=True,chunksize=10000):

        chunk["authors"]=chunk["authors"].apply(self.convert)
        chunk=chunk[chunk["language_code"]=="eng"]
        temp.append(chunk["book_id"])
        chunk["description"]=chunk["description"].apply(self.stem)
        chunk["title"]=chunk["title"].apply(self.stem)
        chunk["publisher"]=chunk["publisher"].apply(self.stem)
        pickle.dump(chunk[['book_id','title','average_rating','authors','publisher', 'description']],f)

      self.book_df=pd.concat(temp,ignore_index=True)
      print("Books Data Processed")
  # Transfer raw Book Data data to pkl file
  def book_data(self):
    df=pd.read_json(self.book_json_path,lines=True)
    with open( self.goodreads_books_pkl_raw_path, "wb") as f:
      pickle.dump(df[df["language_code"]=="eng"],f)
    print("Raw Book Data Processed")

  # Transfer processed user review   data to pkl file
  def convert_user_review_data(self):
   with open( self.goodreads_reviews_children_pkl_path, "wb") as f:
    # Reading the csv file in chunks
      for chunk in pd.read_json(self.userReview_json_path,lines=True,chunksize=5000):
        chunk=chunk[chunk["book_id"].isin(self.book_df["book_id"])]
        pickle.dump(chunk[['user_id', 'book_id',"review_text"]],f)
      print("User Review  Data Processed")

  # All the Pre Processing Methods are called here
  def do_data_initialization(self):
    self.create_folder()
    print("Processing Book Dataset")
    self.convert_books_data()
    self.book_df=self.read_in_chunks()
    print("Processing User Review Dataset")
    self.convert_user_review_data()
    print("Processing Book Raw Data")
    self.book_data()
  # To Read the goodreads_books pickle file
  def read_in_chunks(self):
    chunks=[]
    with open( self.goodreads_books_pkl_path,'rb') as f:
      while True:
        try:
            chunks.append( pickle.load(f))
        except EOFError:
            break
    return pd.concat(chunks,ignore_index=True)
  # To Read the goodreads_reviews pickle file
  def get_user_data(self):
    chunks=[]
    with open( self.goodreads_reviews_children_pkl_path,'rb') as f:
      while True:
        try:
            chunks.append( pickle.load(f))
        except EOFError:
            break
    return pd.concat(chunks,ignore_index=True)





pre_processing=data_set_pre_processing()
pre_processing.do_data_initialization()

#pre_processing.read_in_chunks()

Processing Book Dataset
Books Data Processed
Processing User Review Dataset
User Review  Data Processed
Processing Book Raw Data
Raw Book Data Processed


BOOK Dataset

In [None]:
pre_processing.read_in_chunks()

User's Review Dataset

In [None]:
pre_processing.get_user_data()

##Exploratory Data Analysis

In [None]:
books_df = pre_processing.read_in_chunks()
reviews_df = pre_processing.get_user_data()

In [None]:
print("Exploratory Data Analysis for Books Dataset:")
print()
print()
print(books_df.info())
print()
print(books_df.describe())

In [None]:
#Distribution of average_ratings
sns.histplot(books_df['average_rating'], bins=20, kde=True, color='gold')
plt.title('Distribution of average_ratings ')
plt.xlabel('average_rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
#Top 10 Author
sns.countplot(x='authors', data=books_df, order=books_df['authors'].value_counts().index[:10], color='#FFD700')
plt.title('Top 10 Authors with Most Books')
plt.xlabel('Author')
plt.ylabel('Number of Books')
plt.xticks(rotation=45)
plt.show()


In [None]:
print("Exploratory Data Analysis for User Reviews Dataset:")
print()
print(reviews_df.info())
print()
print(reviews_df.describe())

In [None]:
#Top 10 Users with Most_Reviews
sns.countplot(x='user_id', data=reviews_df, order=reviews_df['user_id'].value_counts().index[:10], color='#FFD700')
plt.title('Top 10 Users with Most_Reviews')
plt.xlabel('User ID')
plt.ylabel('Number of Reviews')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#Distribution of Word Counts in Reviews
word_counts = reviews_df['review_text'].apply(lambda x: len(x.split()))
sns.histplot(word_counts, bins=50, kde=True, color='#FFD700')
plt.title('Distribution of Word Counts in Reviews')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Correlation between Book_id vs Average_Rating
correlation_books = books_df.corr()
sns.heatmap(correlation_books, annot=True,cmap='Pastel2')
plt.title('Correlation between Book_id vs Average_Rating')
plt.show()

In [None]:
#Outlier Detection for Average_Rating
sns.boxplot(x='average_rating', data=books_df, color='#FFD700')
plt.title('Outlier Detection for Average_Rating')
plt.show()

In [None]:
#Distribution of Average Ratings
sns.histplot(books_df['average_rating'], bins=20, kde=True, color='#FFD700')
plt.title('Distribution of Average Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
#Top 10 Authors with Most_Books
sns.countplot(x='authors', data=books_df, order=books_df['authors'].value_counts().index[:10], color='#FFD700')
plt.title('Top 10 Authors with Most_Books')
plt.xlabel('Author')
plt.ylabel('Number of Books')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Correlation Matrix for Reviews Dataset (if applicable)
correlation_reviews = reviews_df.corr()
sns.heatmap(correlation_reviews, annot=True, cmap='Pastel2')
plt.title('Correlation Matrix for Reviews Dataset')
plt.show()



In [None]:
#Detection for Word Counts in Reviews
sns.boxplot(x=word_counts, data=reviews_df, color='#FFD700')
plt.title('Outlier Detection for Word Counts in Reviews')
plt.show()



In [None]:
#Distribution of Text Length in Reviews
reviews_df['text_length'] = reviews_df['review_text'].apply(len)
sns.histplot(reviews_df['text_length'], bins=50, kde=True, color='#FFD700')
plt.title('Distribution of Text Length in Reviews')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.show()



In [None]:
#Top 10 Users with Most Reviews
sns.countplot(x='user_id', data=reviews_df, order=reviews_df['user_id'].value_counts().index[:10], color='#FFD700')
plt.title('Top 10 Users with Most Reviews')
plt.xlabel('User ID')
plt.ylabel('Number of Reviews')
plt.xticks(rotation='vertical')

plt.show()



In [None]:
#Distribution of Word Counts in Reviews
word_counts = reviews_df['review_text'].apply(lambda x: len(x.split()))
sns.histplot(word_counts, bins=50, kde=True, color='#FFD700')
plt.title('Distribution of Word Counts in Reviews')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()

##<h1>Item's Representation</h1>



### Using Term-Frequency Times Inverse Document-Frequency

In [4]:
class BookData_ItemRepresentation:
  def __init__(self):
    self.goodreads_books_pkl_path="/content/test/goodreads_books_children.pkl"
    self.tfidf_vectorizer= TfidfVectorizer(stop_words='english',max_features =5000)
    self.BookData_ItemRepresentation_path = '/content/test/BookData_ItemRepresentation.pkl'

  def get_book_data(self):
    chunk_size = 5000
    json_reader = pd.read_json(self.Book_data_file_path, lines=True, chunksize=chunk_size)
    return json_reader

  def get_TfIdf_Vectorizer(self):
    book_data_list = []
    with open(self.goodreads_books_pkl_path,'rb') as f:
      while True:
        try:
          book_data_list.append(pickle.load(f))
        except EOFError:
            break

    book_data=pd.concat(book_data_list,ignore_index=True)
    #print("book_data",book_data.shape)
    self.tfidf_vectorizer.fit(book_data['description'] )
    #print()
    del book_data_list
    del book_data
    gc.collect()
    #print("Tf-Idf Fit Done!")
    return self.tfidf_vectorizer.get_feature_names_out()
  def get_features(self):
    return  self.tfidf_vectorizer.get_feature_names_out()
  def get_item_representation(self):
    first_iteration=True

    print("Book Item Representation processing")
    with open(self.goodreads_books_pkl_path,'rb') as f:
      with open(self.BookData_ItemRepresentation_path, "wb") as file:
        i=0
        while True:
          try:
            i+=1

            chunk=pickle.load(f)
            #tfidf_matrix=self.tfidf_vectorizer.transform(chunk['description'] + ' ' + chunk['title']+' '+chunk['authors']+' '+chunk['publisher']).toarray()
            tfidf_matrix=pd.DataFrame(self.tfidf_vectorizer.transform(chunk['description'] + ' ' + chunk['title']+' '+chunk['publisher']).toarray())
            other_features = chunk[['book_id']]
            tfidf_matrix.reset_index(drop=True, inplace=True)
            other_features.reset_index(drop=True, inplace=True)
            feature_names = self.tfidf_vectorizer.get_feature_names_out()
            new_array = np.insert(feature_names, 0, "book_id")

            joined_df = pd.concat([other_features,tfidf_matrix], axis=1)
            joined_df.columns = new_array
            pickle.dump(joined_df, file)

            del chunk
            del tfidf_matrix
            del other_features
            del joined_df
            gc.collect()  # Perform garbage collection to free up memory

          except EOFError:
            print("Book Item Representation Task Completed")
            break

    print("Book Item Representation Processed and stored")

  def read_book_rep(self):
    l=[]
    with open(self.BookData_ItemRepresentation_path,'rb') as f:
      while True:
        try:
            l.append( pickle.load(f))
        except EOFError:
            break
    return pd.concat(l,ignore_index=True)



In [5]:

bookData_ItemRepresentation=BookData_ItemRepresentation()
i=bookData_ItemRepresentation.get_TfIdf_Vectorizer()
bookData_ItemRepresentation.get_item_representation()
i.shape


Book Item Representation processing
Book Item Representation Task Completed
Book Item Representation Processed and stored


(5000,)

In [None]:
bookData_ItemRepresentation.read_book_rep()

###Using Word2Vec

In [44]:
class BookData_ItemRepresentation_Word2vec:
    def __init__(self):
        self.goodreads_books_pkl_path = "/content/test/goodreads_books_children.pkl"
        self.word2vec_model_path = "word2vec_model.bin"  # Path to your Word2Vec model
        self.BookData_ItemRepresentation_path = '/content/test/BookData_ItemRepresentation_Word2Vec.pkl'
        self.word2vec_model = None

    def load_word2vec_model(self):
        # Load Word2Vec model
        self.word2vec_model = api.load("word2vec-google-news-300")
    # for doing Books Item Representation using Word2vec
    def get_item_representation(self):
        self.load_word2vec_model()
        print("Book Item Representation processing")
        with open(self.goodreads_books_pkl_path, 'rb') as f:
            with open(self.BookData_ItemRepresentation_path, "wb") as file:
                i = 0
                while True:
                    try:
                        i += 1
                        chunk = pickle.load(f)
                        embeddings = []
                        for idx, row in chunk.iterrows():
                            # Create an average embedding for each row (book)
                            description_embedding = self.get_average_embedding(row['description'])
                            embeddings.append(description_embedding)
                        embeddings_df = pd.DataFrame(embeddings)
                        other_features = chunk[['book_id']]
                        embeddings_df.reset_index(drop=True, inplace=True)
                        other_features.reset_index(drop=True, inplace=True)
                        joined_df = pd.concat([other_features, embeddings_df], axis=1)
                        pickle.dump(joined_df, file)
                        del chunk
                        del embeddings
                        del embeddings_df
                        del other_features
                        del joined_df
                        gc.collect()
                    except EOFError:
                        print("Book Item Representation Task Completed")
                        break

        print("Book Item Representation Processed and stored")

    def get_average_embedding(self, text):
        words = text.split()
        embeddings = []
        for word in words:
            if word in self.word2vec_model.key_to_index:
                embeddings.append(self.word2vec_model[word])
        if embeddings:
            avg_embedding = sum(embeddings) / len(embeddings)
        else:
            avg_embedding = [0] * 5000
        return avg_embedding

bookData_ItemRepresentation = BookData_ItemRepresentation_Word2vec()
bookData_ItemRepresentation.get_item_representation()


Book Item Representation processing
Book Item Representation Task Completed
Book Item Representation Processed and stored


##<h1>User Profiles</h1>

In [3]:
# This class is used to Build the user Profiles
class Build_user_profiles():
  def __init__(self,rep_path,user_profile_path):
    self.goodreads_books_pkl_path="/content/test/goodreads_books_children.pkl"
    self.BookData_ItemRepresentation_path = rep_path
    self.goodreads_reviews_children_pkl_path="/content/test/goodreads_reviews_children.pkl"
    self.user_profile_path=user_profile_path
  #method to build user profile
  def build_user_profiles(self):
    print("Building User Profiles")
    chunk_count=0
    # Fetching unique user id
    with open(self.goodreads_reviews_children_pkl_path,'rb') as f:
      while True:
        try:
          chunk_count+=1
          #print("chunk :",chunk_count)

          user_profiles=[]
          unique_user=pickle.load(f)["user_id"].unique()
          #print("users",unique_user.shape)


          # Fetching user Historical data
          user_data=[]
          with open(self.goodreads_reviews_children_pkl_path,'rb') as f2:
            while True:
              try:
                condition = lambda x: x['user_id'].isin(unique_user)
                user_data.append(pickle.load(f2)[condition][["user_id","book_id"]])

              except EOFError:
                break
            user_data=pd.concat(user_data,ignore_index=True)
          # Fetching Books Items Representation
          book_rep=[]
          with open (self.BookData_ItemRepresentation_path,'rb') as f3:
            while True:
              try:
                condition = lambda x: x['book_id'].isin(user_data["book_id"])
                book_rep.append(pickle.load(f3)[condition])

              except EOFError:
                break
            book_rep=pd.concat(book_rep,ignore_index=True)


          merged_data = pd.merge(user_data, book_rep, on='book_id')

          user_profiles = merged_data.groupby('user_id', as_index=False).mean()
          user_profiles.drop(columns=['book_id'],inplace=True)
          print("User_profile:",user_profiles.shape)
          with open(self.user_profile_path, "ab") as f4:
            pickle.dump(user_profiles, f4)

        except EOFError:
            break
  # To Read User review Data
  def read_user(self):
    user_profiles=[]
    with open(self.user_profile_path,'rb') as f:
      while True:
        try:

          user_profiles.append(pickle.load(f))

        except EOFError:
            break

    return pd.concat(user_profiles,ignore_index=True)




User-Profile For TF-IDF

In [None]:
user_profile=Build_user_profiles('/content/test/BookData_ItemRepresentation.pkl',"/content/test/users_profile.pkl")
user_profile.build_user_profiles()
user_profile.read_user()

Building User Profiles


User_Profile For Word2Vec

In [None]:
user_profile=Build_user_profiles('/content/test/BookData_ItemRepresentation_Word2Vec.pkl',"/content/test/users_profile_word2vec.pkl")
user_profile.build_user_profiles()
user_profile.read_user()

#<h1>Content Based Filtering</h1>

In [8]:
class ContentBasedFiltering:
  def __init__(self,rep_path,user_profile_path):
    self.goodreads_books_pkl_path="/content/test/goodreads_books_children.pkl"
    self.BookData_ItemRepresentation_path =rep_path
    self.goodreads_reviews_children_pkl_path="/content/test/goodreads_reviews_children.pkl"
    self.user_profile_path=user_profile_path
    self.goodreads_books_pkl_raw_path="/content/test/goodreads_books_children_raw.pkl"

  def fetch_user_profile(self,user_id):
    with open(self.user_profile_path,'rb') as f2:
      while True:
        try:
          condition = lambda x: x['user_id']==user_id
          chunk=pickle.load(f2)[condition]
          if chunk.empty==False:
            return chunk


        except EOFError:
          break

  def find_most_similar_books(self, user_id, top_n=5):
    user_profile = self.fetch_user_profile(user_id)
    similar_books=[]

    with open(self.BookData_ItemRepresentation_path, 'rb') as f3:
        count=5
        while True:
            try:

                item_representation = pickle.load(f3)
                # Assuming 'book_id' is the first column
                book_ids = item_representation['book_id']
                # Drop the 'book_id' column before computing similarity
                item_representation = item_representation.drop(columns=['book_id'])
                # Compute cosine similarity between user profile and item representations

                sim=  cosine_similarity(user_profile.drop(columns=['user_id']), item_representation)
                sim= pd.DataFrame(sim.T)
                sim_score=pd.concat([book_ids,sim], axis=1)

                sim_score=sim_score.sort_values(by=0,ascending=False)
                sim_score=sim_score[sim_score[0]>0]
                sim_score=sim_score.head(50)
                similar_books.append( sim_score)

            except EOFError:
                break
    similar_books=pd.concat(similar_books,ignore_index=True)
    similar_books=similar_books.sort_values(by=0,ascending=False)
    #print(similar_books)
    return similar_books.head(50)

  def recommend_books(self,user_id):
    similar_books=self.find_most_similar_books("00b185a14389ccfc559a1a80bda39ea5")

    user_data=[]
    with open(self.goodreads_reviews_children_pkl_path,'rb') as f2:
      while True:
        try:
          condition = lambda x: x['user_id']==user_id
          user_data.append(pickle.load(f2)[condition])
        except EOFError:
          break
    user_data=pd.concat(user_data,ignore_index=True)
    similar_books=similar_books[~similar_books["book_id"].isin(user_data["book_id"])]
    book_details=[]
    with open(self.goodreads_books_pkl_raw_path,'rb') as f2:
      condition = lambda x: x['book_id'].isin(similar_books["book_id"])
      return user_data,pickle.load(f2)[condition]

  def recommend_books(self,user_id):
      similar_books=self.find_most_similar_books("00b185a14389ccfc559a1a80bda39ea5")
      user_data=[]
      with open(self.goodreads_reviews_children_pkl_path,'rb') as f2:
        while True:
          try:
            condition = lambda x: x['user_id']==user_id
            user_data.append(pickle.load(f2)[condition])
          except EOFError:
            break
      user_data=pd.concat(user_data,ignore_index=True)
      similar_books=similar_books[~similar_books["book_id"].isin(user_data["book_id"])]
      book_details=[]
      with open(self.goodreads_books_pkl_raw_path,'rb') as f2:
        condition = lambda x: x['book_id'].isin(similar_books["book_id"])
        return user_data,pickle.load(f2)[condition]








## Recommending Using TF-IDF

In [41]:
contentBasedFiltering=ContentBasedFiltering('/content/test/BookData_ItemRepresentation.pkl',"/content/test/users_profile.pkl")
user_data,recomended_data=contentBasedFiltering.recommend_books("d1e368a7d2870eb6fbf6e0d350568a2d")

      book_id         0
50     197084  1.000000
400   2357078  0.991987
200  10773901  0.929980
401   8050363  0.928120
500  25387648  0.352332
..        ...       ...
645  22047662  0.081141
646  33215506  0.081125
647  26158605  0.079917
648  15870506  0.079684
649  21518819  0.078994

[650 rows x 2 columns]


In [45]:
user_data

Unnamed: 0,user_id,book_id,review_text
0,d1e368a7d2870eb6fbf6e0d350568a2d,28686885,The Ratso brothers want to be tough just like ...
1,d1e368a7d2870eb6fbf6e0d350568a2d,31491773,"You know the song that goes, ""The cat came bac..."
2,d1e368a7d2870eb6fbf6e0d350568a2d,34129054,Baabwaa & Wooliam are sheep who like to knit a...
3,d1e368a7d2870eb6fbf6e0d350568a2d,16702384,Have you ever flipped through a book and found...
4,d1e368a7d2870eb6fbf6e0d350568a2d,31522122,"This one is not as good as the first one, but ..."
...,...,...,...
299,d1e368a7d2870eb6fbf6e0d350568a2d,6949680,Super cute for storytimes
300,d1e368a7d2870eb6fbf6e0d350568a2d,10146304,Great for storytimes!
301,d1e368a7d2870eb6fbf6e0d350568a2d,1081530,"Short, sweet story about a polar bear and a du..."
302,d1e368a7d2870eb6fbf6e0d350568a2d,911579,Frances likes bread and jam. She knows this to...


In [46]:
recomended_data

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
1636,0375812350,30,[],US,eng,"[{'count': '147', 'name': 'to-read'}, {'count'...",,False,4.07,B0075WPFBU,...,5.0,,2001.0,https://www.goodreads.com/book/show/972527.Summer,https://images.gr-assets.com/books/1320417714m...,972527,350,957424,Summer,Summer
4906,0544472705,98,[],US,eng,"[{'count': '122', 'name': 'to-read'}, {'count'...",,False,3.38,B01912OMZ6,...,3.0,,2016.0,https://www.goodreads.com/book/show/25897689-t...,https://images.gr-assets.com/books/1436842448m...,25897689,397,45780658,Treat,Treat
7447,0553510975,3,[],US,eng,"[{'count': '79', 'name': 'to-read'}, {'count':...",,False,3.92,,...,9.0,,2017.0,https://www.goodreads.com/book/show/30008955-i...,https://images.gr-assets.com/books/1498832985m...,30008955,17,50417083,Imagine That!: How Dr. Seuss Wrote the Cat in ...,Imagine That!: How Dr. Seuss Wrote the Cat in ...
9765,0448455854,210,[404950],US,eng,"[{'count': '700', 'name': 'to-read'}, {'count'...",,False,4.24,B004XFYIF6,...,7.0,,2011.0,https://www.goodreads.com/book/show/10641892-w...,https://s.gr-assets.com/assets/nophoto/book/11...,10641892,1134,15550688,Who Was Dr. Seuss?,Who Was Dr. Seuss?
12622,0001713221,1690,[],US,eng,"[{'count': '60393', 'name': 'to-read'}, {'coun...",,False,4.16,B00480OHWK,...,,,1962.0,https://www.goodreads.com/book/show/197084.Are...,https://s.gr-assets.com/assets/nophoto/book/11...,197084,177134,1837885,Are You My Mother?,Are You My Mother?
14301,0375822984,107,[],US,eng,"[{'count': '329', 'name': 'to-read'}, {'count'...",,False,4.19,B004KABEY2,...,1.0,,2004.0,https://www.goodreads.com/book/show/86337.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,86337,425,1154467,The Boy on Fairfield Street,The Boy on Fairfield Street
17124,0001716123,17,[],US,eng,"[{'count': '237', 'name': 'to-read'}, {'count'...",,False,3.85,B00ESF28OE,...,,Classic Collection,,https://www.goodreads.com/book/show/316575.The...,https://s.gr-assets.com/assets/nophoto/book/11...,316575,274,266965,The Cat's Quizzer,The Cat's Quizzer
18833,0394834607,5,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,4.06,,...,7.0,A Random House Pictureback,1982.0,https://www.goodreads.com/book/show/2610926-ol...,https://images.gr-assets.com/books/1282541174m...,2610926,17,2635574,Old Mother Hubbard,Old Mother Hubbard
19298,0007158521,67,[],US,eng,"[{'count': '2262', 'name': 'to-read'}, {'count...",,False,4.34,B00ESF285S,...,5.0,,2003.0,https://www.goodreads.com/book/show/24682.Oh_T...,https://images.gr-assets.com/books/1286845488m...,24682,830,2125304,"Oh, The Places You’ll Go!","Oh, The Places You’ll Go!"
22142,0881034193,1,[],US,eng,"[{'count': '4099', 'name': 'to-read'}, {'count...",,False,3.96,B00ESF2790,...,2.0,I Can Read It All by Myself Beginner Books,1963.0,https://www.goodreads.com/book/show/2711332-ho...,https://s.gr-assets.com/assets/nophoto/book/11...,2711332,9,894153,Hop on Pop,Hop on Pop


##Recommending Using Word2Vec

In [None]:
contentBasedFiltering=ContentBasedFiltering(( '/content/test/BookData_ItemRepresentation_Word2Vec.pkl',"/content/test/users_profile_word2vec.pkl"))
user_data_W2V,recomended_data_W2V=contentBasedFiltering.recommend_books("00b185a14389ccfc559a1a80bda39ea5")

In [None]:
user_data_W2V

In [None]:
recomended_data_W2V

#Sentimental Analysis

In [47]:
import pickle
import pandas as pd
from textblob import TextBlob

class ReviewsSentiment:
    def __init__(self):
        self.goodreads_reviews_children_path = "/content/test/goodreads_reviews_children.pkl"

    def get_reviews(self, rec_books):
        books_review = []
        with open(self.goodreads_reviews_children_path, 'rb') as f3:
            while True:
                try:
                    condition = lambda x: x['book_id'].isin(recomended_data["book_id"])
                    user_data = (pickle.load(f3)[condition][["book_id", "review_text"]])
                    books_review.append(user_data.groupby('book_id', as_index=False)['review_text'].agg(
                        lambda x: ' '.join(x)))
                except EOFError:
                    break

        books_review = pd.concat(books_review, ignore_index=True)
        books_review = books_review.groupby('book_id', as_index=False)['review_text'].agg(lambda x: ' '.join(x))
        return books_review

    def get_sentiment_score(self, review):
        blob = TextBlob(review)
        return blob.sentiment.polarity

    def get_sentiment(self, rec_books):
        books_review = self.get_reviews(rec_books)
        books_review['sentiment_score'] = books_review['review_text'].apply(self.get_sentiment_score)
        books_review['sentiment'] = books_review['sentiment_score'].apply(lambda score: 'positive' if score >= 0 else 'negative')

        positive_books = books_review[books_review['sentiment'] == 'positive']
        negative_books = books_review[books_review['sentiment'] == 'negative']
        return positive_books, negative_books

##Sentiment For TF-IDF

In [48]:
reviewsSentiment=ReviewsSentiment()
positive_books,negative_books=reviewsSentiment.get_sentiment(recomended_data)

In [49]:
positive_books

Unnamed: 0,book_id,review_text,sentiment_score,sentiment
0,7769,This is a fun way to teach kids their ABC's. T...,0.223611,positive
1,11301,I have the biggest smile on my face right now!...,0.298739,positive
2,23772,The lesson here is that you cannot know whethe...,0.249558,positive
3,24682,"I don't remember a lot about this one, beyond ...",0.238906,positive
4,86337,Love this story of how Dr. Seuss became Dr. Se...,0.27147,positive
5,197084,Five stars because it has been a favorite of m...,0.257996,positive
6,233093,It was no time to play. It was no time for fun...,0.208832,positive
7,275331,That pesky Cat in The Hat come back and causes...,0.25,positive
8,316575,My son read it to me. Proper view soon. A book...,0.212259,positive
9,420404,"Not a favorite, the three stories were forgett...",0.240173,positive


In [50]:
negative_books

Unnamed: 0,book_id,review_text,sentiment_score,sentiment
20,2610926,caroline really likes this nursery rhyme for s...,-0.4,negative
27,8050363,"Much as I like other Dr Seuss books as a kid, ...",-0.058333,negative
36,21096725,5 Stars \n Read this as part of the Getting Gr...,-0.125,negative


##Sentiment For Word2Vec

In [None]:
reviewsSentiment=ReviewsSentiment()
positive_books,negative_books=reviewsSentiment.get_sentiment(recomended_data_W2V)

In [None]:
positive_books

In [None]:
negative_books

#Evaluation

In [39]:
class Evaluation:
  def __init__(self,user_id):
    self.user_id=user_id
    self.goodreads_reviews_children_pkl_path="/content/test/goodreads_reviews_children.pkl"
    self.BookData_ItemRepresentation_path = '/content/test/BookData_ItemRepresentation.pkl'
    self.train_df=None
    self.test_df=None
  def build_profile(self):

    user_data=[]
    with open(self.goodreads_reviews_children_pkl_path,'rb') as f2:
      while True:
        try:
          condition = lambda x: x['user_id']==self.user_id
          user_data.append(pickle.load(f2)[condition][["user_id","book_id"]])

        except EOFError:
          break
    user_data=pd.concat(user_data,ignore_index=True)

    self.train_df,self.test_df= train_test_split(user_data, test_size=0.2, random_state=42)
    #return self.train_df"
    #print("train/n",self.train_df)
    #print("test/n",self.test_df)
    book_rep=[]
    with open (self.BookData_ItemRepresentation_path,'rb') as f3:
      while True:
        try:
          condition = lambda x: x['book_id'].isin(self.train_df["book_id"])
          book_rep.append(pickle.load(f3)[condition])

        except EOFError:
          break
    book_rep=pd.concat(book_rep,ignore_index=True)
    merged_data = pd.merge(self.train_df, book_rep, on='book_id')
    user_profiles = merged_data.groupby('user_id', as_index=False).mean()
    user_profiles.drop(columns=['book_id'],inplace=True)



    return user_profiles
  def find_most_similar_books(self, top_n=5):
    user_profile = self.build_profile()
    similar_books=[]

    with open(self.BookData_ItemRepresentation_path, 'rb') as f3:

        while True:
            try:

                item_representation = pickle.load(f3)
                book_ids = item_representation['book_id']
                # Drop the 'book_id' column before computing similarity
                item_representation = item_representation.drop(columns=['book_id'])
                # Compute cosine similarity between user profile and item representations

                sim=  cosine_similarity(user_profile.drop(columns=['user_id']), item_representation)
                sim= pd.DataFrame(sim.T)
                sim_score=pd.concat([book_ids,sim], axis=1)

                sim_score=sim_score.sort_values(by=0,ascending=False)
                sim_score=sim_score[sim_score[0]>0]
                sim_score=sim_score.head(50)
                similar_books.append( sim_score)

            except EOFError:
                break
    similar_books=pd.concat(similar_books,ignore_index=True)
    similar_books=similar_books.sort_values(by=0,ascending=False)
    #print(similar_books)
    return similar_books.head(self.test_df.count()["user_id"])
  def evaluate_model(self):
    similar_books=self.find_most_similar_books(self.user_id)
    actual_interactions = self.test_df['book_id'].tolist()
    recommended_books = similar_books["book_id"].tolist()
    precision = precision_score(actual_interactions, recommended_books, average='micro')
    recall = recall_score(actual_interactions, recommended_books, average='micro')
    print(actual_interactions)
    print

    return precision, recall

In [40]:
evaluation=Evaluation("d1e368a7d2870eb6fbf6e0d350568a2d")
evaluation.evaluate_model()

      book_id         0
350  20624747  0.425118
351  20422389  0.413213
550  24235881  0.410534
250  23732456  0.410534
500  25898721  0.403943
..        ...       ...
645  20663014  0.225104
646   7081605  0.224233
647   5029487  0.222665
648  23666341  0.222543
649  21518819  0.222146

[650 rows x 2 columns]
[20708745, 17347630, 21494049, 13544433, 28250952, 1786469, 10585139, 16718342, 14999726, 28965129, 32671333, 25734206, 9630712, 17243953, 18964778, 232575, 26631256, 17269361, 18528345, 21470943, 887572, 6531754, 7777, 31934478, 22551791, 839077, 17784209, 413259, 820274, 25810044, 13237162, 13607373, 923858, 2000859, 32204328, 12616156, 28478909, 25785628, 30095473, 27134686, 26244910, 25689038, 18118592, 24885887, 20561869, 105551, 6101845, 12065943, 10980153, 409371, 17262305, 16250904, 22484277, 28963877, 13650167, 41890, 237665, 25663825, 1081530, 17471114, 20518873]


(0.0, 0.0)