# 1. Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, normalize, MinMaxScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os

# Setting up NLTK with a specific directory
nltk_data_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)

file_path = "../data/Engineering_books_data.csv"
data = pd.read_csv(file_path)

# These columns do not provide valuable information for our system
columns_to_drop = ['image', 'download_link', 'file']
data = data.drop(columns=columns_to_drop)

data.fillna({
    "desc": "Unknown",
    "author": "Unknown",
    "language": "Unknown",
    "publisher": "Unknown",
    "pages": data["pages"].median(),
    "year": data["year"].median()
}, inplace=True)

data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aberkipek23\Desktop\4th Year of
[nltk_data]     SE\DM\projects\project2\engineering-books-recommender-
[nltk_data]     system\data\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aberkipek23\Desktop\4th Year of
[nltk_data]     SE\DM\projects\project2\engineering-books-recommender-
[nltk_data]     system\data\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aberkipek23\Desktop\4th Year of
[nltk_data]     SE\DM\projects\project2\engineering-books-recommender-
[nltk_data]     system\data\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,title,author,desc,pages,publisher,year,language
0,"Soil Mechanics of Earthworks, Foundations and ...",Unknown,\n This is the third volume...,354.0,Elsevier Science,1988.0,english
1,Foundation Engineering: Design and Constructio...,B.B.K. Huat,\n Residual soils are found...,256.0,Taylor & Francis,2006.0,english
2,Practical Problems in Soil Mechanics and Found...,Sheng Y. Peng,Unknown,346.0,Elsevier Science Ltd,1985.0,english
3,Underwater Embankments on Soft Soil A Case His...,William F. van Impe,Unknown,154.0,Unknown,2007.0,english
4,Eco- and Ground Bio-Engineering: The Use of Ve...,Alexia Stokes,\n This volume brings toget...,420.0,Unknown,2007.0,english


# Preprocessing

In [2]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[\d\W_]+', ' ', text)
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemmatized_words)

data['clean_desc'] = data['desc'].apply(preprocess_text)

data.head()

Unnamed: 0,title,author,desc,pages,publisher,year,language,clean_desc
0,"Soil Mechanics of Earthworks, Foundations and ...",Unknown,\n This is the third volume...,354.0,Elsevier Science,1988.0,english,third volume handbook cover whole field soil m...
1,Foundation Engineering: Design and Constructio...,B.B.K. Huat,\n Residual soils are found...,256.0,Taylor & Francis,2006.0,english,residual soil found many part world like soil ...
2,Practical Problems in Soil Mechanics and Found...,Sheng Y. Peng,Unknown,346.0,Elsevier Science Ltd,1985.0,english,unknown
3,Underwater Embankments on Soft Soil A Case His...,William F. van Impe,Unknown,154.0,Unknown,2007.0,english,unknown
4,Eco- and Ground Bio-Engineering: The Use of Ve...,Alexia Stokes,\n This volume brings toget...,420.0,Unknown,2007.0,english,volume brings together paper geotechnical civi...


# 2. Feature Engineering

In [8]:
# TF-IDF for book descriptions
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_desc'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# One-hot encoding for categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
author_matrix = encoder.fit_transform(data[['author']])
language_matrix = encoder.fit_transform(data[['language']])
publisher_matrix = encoder.fit_transform(data[['publisher']])
print(f"Author matrix shape: {author_matrix.shape}")
print(f"Language matrix shape: {language_matrix.shape}")
print(f"Publisher matrix shape: {publisher_matrix.shape}")

# Normalize the matrices
tfidf_matrix_normalized = normalize(tfidf_matrix, norm='l2')
author_matrix_normalized = normalize(author_matrix, norm='l2')
language_matrix_normalized = normalize(language_matrix, norm='l2')
publisher_matrix_normalized = normalize(publisher_matrix, norm='l2')

# Normalize numeric features (pages and year)
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(data[['pages', 'year']])
numeric_features_sparse = csr_matrix(numeric_features)
print(f"Numeric features shape: {numeric_features_sparse.shape}")

# Combine all features with tunable weights
weights = {'tfidf': 0.8, 'author': 0.1, 'language': 0.05, 'publisher': 0.05, 'numeric': 0.05}
combined_features = hstack([
    weights['tfidf'] * tfidf_matrix_normalized,
    weights['author'] * author_matrix_normalized,
    weights['language'] * language_matrix_normalized,
    weights['publisher'] * publisher_matrix_normalized,
    weights['numeric'] * numeric_features_sparse
])

print(f"Combined feature matrix shape: {combined_features.shape}")

# Percentage of zero elements in a matrix
# A sparse matrix has most of its elements as zeros which makes it memory-efficient for storing and processing.
sparsity = 1.0 - (combined_features.nnz / float(combined_features.shape[0] * combined_features.shape[1]))
print(f"Sparsity: {sparsity:.2%}")
# Note: High sparsity is a good indication!

TF-IDF matrix shape: (2375, 3000)
Author matrix shape: (2375, 1799)
Language matrix shape: (2375, 10)
Publisher matrix shape: (2375, 563)
Numeric features shape: (2375, 2)
Combined feature matrix shape: (2375, 5374)
Sparsity: 99.06%


# 3. Recommendation Function

In [10]:
# Computing similarity matrix
similarity_matrix = cosine_similarity(combined_features)

# Global recommended set
global_recommended_set = set()

def recommend_books_with_diversity_and_coverage(title, combined_features, data, similarity_matrix, top_n=5):
    indices = data.index[data['title'].str.contains(title, case=False, na=False)].tolist()
    if not indices:
        print(f"Title '{title}' not found in the dataset.")
        return []

    idx = indices[0]
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Precomputing similarity for top candidates only
    top_candidates = similarity_scores[:50]
    submatrix_indices = [candidate[0] for candidate in top_candidates]
    similarity_submatrix = cosine_similarity(combined_features[submatrix_indices])

    recommendations = []
    recommended_titles = set()

    # Frequency of recommendation for novelty
    frequency = data['title'].value_counts(normalize=True)

    for i, score in top_candidates:
        book_title = data.iloc[i]["title"]
        original_author = data.iloc[i]["author"]
        if book_title in recommended_titles or i == idx:
            continue

        # Extracting indices of already recommended books
        recommended_indices = [rec['index'] for rec in recommendations]

        # Diversity penalty
        diversity_penalty = sum(similarity_submatrix[submatrix_indices.index(i), submatrix_indices.index(j)]
                                for j in recommended_indices) * 0.4
        adjusted_score = max(0, score - diversity_penalty)

        # Frequency penalty
        adjusted_score *= (1 - frequency.get(book_title, 0))

        # Random exploration
        random_factor = np.random.uniform(0, 0.3)
        adjusted_score += random_factor

        if adjusted_score > 0:
            recommendations.append({
                "index": i,
                "title": book_title,
                "author": original_author,
                "similarity": adjusted_score
            })
            recommended_titles.add(book_title)

        if len(recommendations) >= top_n:
            break

    global_recommended_set.update(recommended_titles)
    return recommendations

# 4. Metric Calculations

In [11]:
# Coverage metric
def calculate_coverage(catalog_size):
    return len(global_recommended_set) / catalog_size

# Diversity metric
def calculate_diversity(recommended_features):
    similarity_matrix = cosine_similarity(recommended_features)
    diversity = 1 - np.mean(similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)])
    return diversity

# Novelty metric
def calculate_novelty(recommended_titles, catalog):
    frequency = catalog['title'].value_counts(normalize=True)
    novelty_scores = [1 - frequency.get(item, 0) for item in recommended_titles]
    return np.mean(novelty_scores) if novelty_scores else 0

# Freshness metric
def calculate_freshness(recommended_titles, recent_titles):
    fresh_items = set(recommended_titles) - set(recent_titles)
    return len(fresh_items) / len(recommended_titles) if recommended_titles else 0

# 5. Interactive Recommender System

In [14]:
def interactive_recommendation_system():
    print("Welcome to the Engineering Book Recommendation System!")
    recent_recommendations = []

    while True:
        print("\nWould you like to:")
        print("1. Select a book from a random sample")
        print("2. Enter a book title directly")
        print("0. Exit")

        choice = input("\nEnter your choice (1, 2, or 0): ").strip()
        if choice == "0":
            print("\nExiting the system. Goodbye!")
            break

        if choice == "1":
            print("\nHere are a few sample book titles from the dataset:")
            sample_titles = data['title'].sample(5).tolist()
            for idx, title in enumerate(sample_titles, start=1):
                print(f"{idx}. {title}")

            book_choice = input("\nEnter the number of the book you'd like to choose: ").strip()
            if book_choice.isdigit() and 1 <= int(book_choice) <= len(sample_titles):
                title = sample_titles[int(book_choice) - 1]
            else:
                print("\nInvalid choice. Please try again.")
                continue
        elif choice == "2":
            title = input("\nEnter the book title: ").strip()
        else:
            print("\nInvalid choice. Please try again.")
            continue

        recommendations = recommend_books_with_diversity_and_coverage(title, combined_features, data, similarity_matrix, top_n=5)

        if not recommendations:
            print(f"\nNo matches found for '{title}'. Please try again.")
            continue

        print("\nRecommendations:")
        for rec in recommendations:
            print(f"- {rec['title']} by {rec['author']} (Similarity: {rec['similarity']:.4f})")

        recommended_titles = [rec["title"] for rec in recommendations]
        recommended_features = combined_features[[rec['index'] for rec in recommendations]]

        coverage = len(global_recommended_set) / len(data)
        diversity = calculate_diversity(recommended_features.toarray())
        novelty = calculate_novelty(recommended_titles, data)
        freshness = calculate_freshness(recommended_titles, recent_recommendations)

        print("\nMetrics:")
        print(f"Coverage: {coverage:.4f}")
        print(f"Diversity: {diversity:.4f}")
        print(f"Novelty: {novelty:.4f}")
        print(f"Freshness: {freshness:.4f}")

        recent_recommendations = recommended_titles

In [15]:
# Running the system
interactive_recommendation_system()

Welcome to the Engineering Book Recommendation System!

Would you like to:
1. Select a book from a random sample
2. Enter a book title directly
0. Exit



Enter your choice (1, 2, or 0):  2

Enter the book title:  sdgs


Title 'sdgs' not found in the dataset.

No matches found for 'sdgs'. Please try again.

Would you like to:
1. Select a book from a random sample
2. Enter a book title directly
0. Exit



Enter your choice (1, 2, or 0):  1



Here are a few sample book titles from the dataset:
1. Electromagnetic Field Theory Fundamentals
2. Principles of Compiler Design : Express Learning
3. Manufacturing automation : metal cutting mechanics, machine tool vibrations, and CNC design
4. Introduction to thermodynamics and kinetic theory of matter
5. Data Analysis From Scratch With Python: Beginner Guide using Python, Pandas, NumPy, Scikit-Learn, IPython, TensorFlow and Matplotlib



Enter the number of the book you'd like to choose:  1



Recommendations:
- Electromagnetic field theory fundamentals by Bhag Singh Guru (Similarity: 1.1969)
- Electromagnetic Field Theory Fundamentals by Bhag Singh Guru (Similarity: 0.6261)
- Introduction to Finite Element Analysis and Design by Nam H. Kim (Similarity: 0.2512)
- Numerical Solution of Differential Equations: Introduction to Finite Difference and Finite Element Methods by Zhilin Li (Similarity: 0.2804)
- An Introduction to Nonlinear Finite Element Analysis: with applications to heat transfer, fluid mechanics, and solid mechanics by J. N. Reddy (Similarity: 0.1933)

Metrics:
Coverage: 0.0021
Diversity: 0.6078
Novelty: 0.9994
Freshness: 1.0000

Would you like to:
1. Select a book from a random sample
2. Enter a book title directly
0. Exit



Enter your choice (1, 2, or 0):  2

Enter the book title:  An Introduction to Nonlinear Finite Element Analysis: with applications to heat transfer, fluid mechanics, and solid mechanics



Recommendations:
- An Introduction to Nonlinear Finite Element Analysis by Junuthula Narasimha Reddy (Similarity: 0.8463)
- Introduction to Finite Element Analysis and Design by Nam H. Kim (Similarity: 0.3741)
- Finite Element Analysis and Design of Metal Structures by Ehab Ellobody (Similarity: 0.1205)
- Fundamentals of Finite Element Analysis by David V. Hutton (Similarity: 0.0834)
- Fundamentals of Finite Element Analysis: Linear Finite Element Analysis by Ioannis Koutromanos (Similarity: 0.2078)

Metrics:
Coverage: 0.0038
Diversity: 0.5510
Novelty: 0.9996
Freshness: 0.8000

Would you like to:
1. Select a book from a random sample
2. Enter a book title directly
0. Exit



Enter your choice (1, 2, or 0):  1



Here are a few sample book titles from the dataset:
1. How to Prove It: A Structured Approach
2. Programming and Customizing the Avr Microcontroller
3. Signals and Systems
4. Highway engineering : planning, design, and operations
5. Power Quality in Power Systems and Electrical Machines



Enter the number of the book you'd like to choose:  3



Recommendations:
- Signals, Systems, and Transforms, 4th Edition   by Charles L. Phillips (Similarity: 0.5689)
- Signals and Systems Laboratory with MATLAB by Palamides (Similarity: 0.4758)
- Discrete-Time Signal Processing by Alan V Oppenheim (Similarity: 0.3530)
- Discrete-Time Signal Processing (3rd Edition) by Alan V. Oppenheim (Similarity: 0.0771)
- Signal Analysis: Wavelets, Filter Banks, Time-Frequency Transforms and Applications by Alfred Mertins (Similarity: 0.1853)

Metrics:
Coverage: 0.0059
Diversity: 0.6157
Novelty: 0.9992
Freshness: 1.0000

Would you like to:
1. Select a book from a random sample
2. Enter a book title directly
0. Exit



Enter your choice (1, 2, or 0):  0



Exiting the system. Goodbye!
