BASELINE from ONO

In [1]:
! pip install WordCloud



In [2]:
! pip install missingno



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import missingno as msno
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [6]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the CSV files. Set the BOOKS_DATA_DIR environment variable to
# point somewhere else; the fallback is the original author's local path.
DATA_DIR = Path(os.environ.get(
    'BOOKS_DATA_DIR',
    '/Users/liliyachvileva/Desktop/neuefische/ds-capstone/groupwork/ds-capstone/notebooks/data',
))

# Load the book metadata table
book_features = pd.read_csv(DATA_DIR / 'books_data.csv', low_memory=False)






In [8]:


import os
from pathlib import Path

# Load the reviews table. The directory can be overridden with the BOOKS_DATA_DIR
# environment variable; the fallback is the original author's local path.
ratings = pd.read_csv(
    Path(os.environ.get(
        'BOOKS_DATA_DIR',
        '/Users/liliyachvileva/Desktop/neuefische/ds-capstone/groupwork/ds-capstone/notebooks/data',
    )) / 'Books_rating.csv',
    low_memory=False,
)


In [None]:
# Preview the first rows of the book metadata table
book_features.head()

In [None]:
# Preview the first rows of the reviews table
ratings.head()

In [None]:
# Summary statistics for the book metadata table
book_features.describe()

In [None]:
# Summary statistics for the reviews table
ratings.describe()


## Checking missing values

In [None]:
# Missingness matrix for each table, one figure per frame
for frame in (book_features, ratings):
    msno.matrix(frame)
    plt.show()

# Per-column count of missing entries: books first, then ratings
for frame in (book_features, ratings):
    print(frame.isna().sum())

## Data Cleaning

*Handling missing data and duplicates*

In [None]:
# Keep only books that have the text fields the recommender is built from
book_features = book_features.dropna(subset=['Title', 'description', 'authors'])

# A rating is only usable if it names a book and carries a score
ratings = ratings.dropna(subset=['Title', 'review/score'])

# One row per book title in the metadata table
book_features = book_features.drop_duplicates(subset=['Title'])

# Drop only exact duplicate review rows.
# NOTE(review): the previous drop_duplicates(subset=['Id']) kept a single row per `Id`.
# `Id` appears to identify the book rather than the individual review (confirm against
# the dataset description). If so, it discarded all but one review per book, which
# flattens the per-title rating counts and averages computed later in this notebook.
ratings = ratings.drop_duplicates()

Checking data types and converting where necessary

In [None]:
# Show the column dtypes of both tables before converting anything
for frame in (book_features, ratings):
    print(frame.dtypes)

# Publication dates: values that cannot be parsed become NaT
published_dates = pd.to_datetime(book_features['publishedDate'], errors='coerce')
book_features['publishedDate'] = published_dates

# Review timestamps are interpreted as seconds since the Unix epoch
review_times = pd.to_datetime(ratings['review/time'], unit='s')
ratings['review/time'] = review_times


Exploratory Data Analysis

In [None]:
# Number of books per publication year, in chronological order
books_per_year = book_features['publishedDate'].dt.year.value_counts().sort_index()

plt.figure(figsize=(10, 6))
books_per_year.plot.bar()
plt.title('Distribution of Published Dates')
plt.xlabel('Year')
plt.ylabel('Number of Books')
plt.show()

In [None]:
# The ten most frequent category labels
category_counts = book_features['categories'].value_counts()
top_categories = category_counts.iloc[:10]

plt.figure(figsize=(10, 6))
top_categories.plot.bar()
plt.title('Top 10 Book Categories')
plt.xlabel('Category')
plt.ylabel('Number of Books')
plt.show()

In [None]:

# Histogram of the ratingsCount column of the book metadata
ratings_count_values = book_features['ratingsCount']

plt.figure(figsize=(10, 6))
plt.hist(ratings_count_values, bins=10)
plt.title('Distribution of Ratings Count')
plt.xlabel('Ratings Count')
plt.ylabel('Frequency')
plt.show()

In [None]:

# The ten most frequent values of the authors column
author_counts = book_features['authors'].value_counts()
top_authors = author_counts.iloc[:10]

plt.figure(figsize=(10, 6))
top_authors.plot.bar()
plt.title('Top 10 Authors')
plt.xlabel('Author')
plt.ylabel('Number of Books')
plt.show()

In [None]:

# Word cloud built from all non-missing book descriptions joined into one string
descriptions = book_features['description'].dropna()
text = ' '.join(descriptions)
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Book Descriptions')
plt.show()

In [None]:

# Histogram of the review scores
review_scores = ratings['review/score']

plt.figure(figsize=(10, 6))
plt.hist(review_scores, bins=10)
plt.title('Distribution of Review Scores')
plt.xlabel('Review Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Distribution of review/helpfulness
# The column holds strings of the form "numerator/denominator". Split it once with the
# vectorised string accessor; missing or malformed entries become NaN instead of raising.
helpfulness_parts = (
    ratings['review/helpfulness']
    .str.split('/', expand=True)
    .reindex(columns=[0, 1])  # guarantees both columns exist even if no row contains '/'
)
ratings['helpfulness_numerator'] = pd.to_numeric(helpfulness_parts[0], errors='coerce')
ratings['helpfulness_denominator'] = pd.to_numeric(helpfulness_parts[1], errors='coerce')

# A zero denominator has no defined ratio: mask it so the result is NaN rather than inf,
# which would make the histogram range non-finite.
nonzero_denominator = ratings['helpfulness_denominator'].where(ratings['helpfulness_denominator'] > 0)
ratings['helpfulness_ratio'] = ratings['helpfulness_numerator'] / nonzero_denominator

plt.figure(figsize=(10, 6))
plt.hist(ratings['helpfulness_ratio'].dropna(), bins=10)
plt.title('Distribution of Review Helpfulness Ratio')
plt.xlabel('Helpfulness Ratio')
plt.ylabel('Frequency')
plt.show()

In [None]:

# The ten profile names with the most reviews
reviewer_counts = ratings['profileName'].value_counts()
top_reviewers = reviewer_counts.iloc[:10]

plt.figure(figsize=(10, 6))
top_reviewers.plot.bar()
plt.title('Top 10 Reviewers')
plt.xlabel('Reviewer')
plt.ylabel('Number of Reviews')
plt.show()

In [None]:

# Scatter plot of review score against price, one point per review row
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Price', y='review/score', data=ratings)
plt.title('Price vs Review Score')
plt.xlabel('Price')
plt.ylabel('Review Score')
plt.show()

## Feature Engineering

Calculating Rating Count and Average Rating

In [None]:
# Number of review rows per title
rating_counts = ratings.groupby('Title').size().reset_index(name='rating_count')

# Mean review score per title
average_ratings = ratings.groupby('Title')['review/score'].mean().reset_index(name='average_rating')

# Remove columns left over from an earlier run of this cell; otherwise the merges
# below would produce rating_count_x / rating_count_y style duplicates.
book_features = book_features.drop(columns=['rating_count', 'average_rating'], errors='ignore')

# Attach both features to the books table (left join keeps books without reviews)
book_features = book_features.merge(rating_counts, on='Title', how='left')
book_features = book_features.merge(average_ratings, on='Title', how='left')

# Books without any review get a count of 0. The new rating_count column is filled here;
# previously only the pre-existing ratingsCount column was, leaving rating_count as NaN.
book_features['rating_count'] = book_features['rating_count'].fillna(0).astype(int)
book_features['ratingsCount'] = book_features['ratingsCount'].fillna(0)

# Books without any review get the mean of the known per-title averages
book_features['average_rating'] = book_features['average_rating'].fillna(book_features['average_rating'].mean())

In [None]:
# Preview the two engineered columns next to the title
book_features[['Title', 'rating_count', 'average_rating']].head()


Summary statistics for the new features

In [None]:
# Summary statistics of the two engineered columns
book_features[['rating_count', 'average_rating']].describe()

Checking the distribution of the new features

In [None]:

# Histogram of the per-title rating_count feature
rating_count_values = book_features['rating_count']

plt.figure(figsize=(10, 6))
plt.hist(rating_count_values, bins=10)
plt.title('Distribution of Rating Counts')
plt.xlabel('Rating Count')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Histogram of the per-title average_rating feature
average_rating_values = book_features['average_rating']

plt.figure(figsize=(10, 6))
plt.hist(average_rating_values, bins=10)
plt.title('Distribution of Average Ratings')
plt.xlabel('Average Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:

# Normalise the description: missing -> empty string, trimmed, lower-cased
book_features['description'] = book_features['description'].fillna('').str.strip().str.lower()

# The remaining text columns only need their missing values replaced
for text_column in ('Title', 'authors', 'categories'):
    book_features[text_column] = book_features[text_column].fillna('')

# Concatenate title, description, authors and categories, separated by single spaces
combined_text = book_features['Title']
for text_column in ('description', 'authors', 'categories'):
    combined_text = combined_text + ' ' + book_features[text_column]
book_features['combined_text'] = combined_text

# Verify the combined text column
print(book_features[['Title', 'combined_text']].head())

In [None]:

# Fraction of rows kept when subsampling (0.98 keeps 98% of each table) and the
# seed that makes the draw reproducible.
SAMPLE_FRAC = 0.98
SAMPLE_SEED = 42

# Subsample the book metadata
book_data_reduced = book_features.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

# Subsample the reviews
book_rating_reduced = ratings.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

# Check the size of the reduced data
print(f'Reduced book_data size: {book_data_reduced.shape}')
print(f'Reduced book_rating size: {book_rating_reduced.shape}')

In [None]:

# TF-IDF representation of the combined text, with English stop words removed
tfidf = TfidfVectorizer(stop_words='english')
corpus = book_data_reduced['combined_text']
tfidf_matrix = tfidf.fit_transform(corpus)

# Brute-force cosine nearest-neighbour index over the TF-IDF rows
nn = NearestNeighbors(algorithm='brute', metric='cosine')
nn.fit(tfidf_matrix)


NearestNeighbors(algorithm='brute', metric='cosine')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [None]:
# Map each title to its row POSITION in book_data_reduced. tfidf_matrix is built from
# book_data_reduced['combined_text'], so its rows follow the same order, and both
# tfidf_matrix[...] and .iloc[...] are positional. The frame's index labels cannot be
# used for this: sample() keeps the original labels, which no longer equal positions.
indices_reduced = pd.Series(range(len(book_data_reduced)), index=book_data_reduced['Title'])

# Keep the first position for any repeated title so a lookup always yields one scalar
indices_reduced = indices_reduced[~indices_reduced.index.duplicated(keep='first')]

In [None]:
def get_recommendations_nn(title, n_recommendations=10):
    """Return the books whose TF-IDF vectors are closest to the given title.

    Parameters
    ----------
    title : str
        Title to look up in ``indices_reduced``.
    n_recommendations : int, default 10
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``book_data_reduced`` (columns Title, authors, categories,
        average_rating, ratingsCount) for the nearest neighbours, closest first,
        with the queried row itself excluded.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices_reduced``.
    """
    if title not in indices_reduced.index:
        raise KeyError(f'Title not found in the reduced dataset: {title!r}')

    idx = indices_reduced[title]
    # A duplicated title makes the lookup return a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the TF-IDF vector for the book
    book_vec = tfidf_matrix[idx]

    # Ask for one extra neighbour to leave room for the query row itself, but never
    # for more neighbours than there are fitted rows (kneighbors rejects that).
    n_neighbors = min(n_recommendations + 1, tfidf_matrix.shape[0])
    distances, indices_nn = nn.kneighbors(book_vec, n_neighbors=n_neighbors)

    # Remove the query row explicitly instead of assuming it is the first hit:
    # with tied distances it is not guaranteed to come first.
    book_indices = [int(pos) for pos in indices_nn[0] if int(pos) != idx][:n_recommendations]

    # Return the top most similar books
    return book_data_reduced[['Title', 'authors', 'categories', 'average_rating', 'ratingsCount']].iloc[book_indices]


In [None]:
# Example: recommendations for the title stored in the second row of the reduced frame
example_title = book_data_reduced['Title'].iat[1]
recommendations = get_recommendations_nn(example_title)
recommendations

In [None]:
# Pick the title stored in the sixth row of the reduced frame and display it
example_title = book_data_reduced['Title'].iat[5]
example_title

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def evaluate_recommendations_nn(title, true_similar_titles):
    """Score the recommendations for ``title`` against known-relevant titles.

    Parameters
    ----------
    title : str
        Title passed to ``get_recommendations_nn``.
    true_similar_titles : iterable of str
        Ground-truth titles considered relevant for ``title``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. Precision is hits / number of recommendations,
        recall is hits / number of distinct ground-truth titles, and f1 is their
        harmonic mean. A value is 0.0 whenever its denominator would be zero.

    Notes
    -----
    Recall is measured against the whole ground-truth list. Scoring only the
    recommended items with every prediction marked positive gives a recall of 1.0
    as soon as a single recommendation is relevant.
    """
    # Titles of the recommended books
    recommended_titles = get_recommendations_nn(title)['Title'].tolist()
    relevant_titles = set(true_similar_titles)

    # Number of recommendations that appear in the ground truth
    hits = sum(1 for recommended in recommended_titles if recommended in relevant_titles)

    precision = hits / len(recommended_titles) if recommended_titles else 0.0
    recall = hits / len(relevant_titles) if relevant_titles else 0.0
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0

    return precision, recall, f1

In [None]:
# Placeholder ground truth: 'Book1' ... 'Book10' (replace with real similar titles)
true_similar_titles = [f'Book{number}' for number in range(1, 11)]

# Evaluate the recommendations for the current example title and report the scores
scores = evaluate_recommendations_nn(example_title, true_similar_titles)
precision, recall, f1 = scores
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

LILI BASELINE 