In [23]:
import requests
import csv

def fetch_books(query, api_key, total_results=1000):
    # Define the base URL for the Google Books API
    base_url = "https://www.googleapis.com/books/v1/volumes"

    # Set the maximum number of results per request (Google Books API limit is 40)
    max_results_per_request = 40

    # Calculate the number of requests needed based on the total number of desired results
    num_requests = total_results // max_results_per_request

    # Open a CSV file for writing book information
    with open('008106.csv', 'w', newline='', encoding='utf-8') as csv_file:
        # Define the fieldnames for the CSV file
        fieldnames = ['BookID', 'Title', 'Authors', 'Description', 'AverageRating', 'Price', 'PublishedDate', 'Categories', 'Language']
        
        # Create a CSV writer with the specified fieldnames
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        # Write the header row to the CSV file
        writer.writeheader()

        # Set to store processed book IDs and avoid duplicate entries
        processed_book_ids = set()

        # Loop through the required number of requests
        for i in range(num_requests):
            # Calculate the start index for each request
            start_index = i * max_results_per_request
            
            # Define parameters for the API request
            params = {
                'q': query,  # Search query
                'key': api_key,  # Google Books API key
                'maxResults': min(max_results_per_request, total_results - start_index),
                'startIndex': start_index
            }

            try:
                # Make a GET request to the Google Books API
                response = requests.get(base_url, params=params)
                response.raise_for_status()  # Check if the request was successful
                data = response.json()

                # Check if the response contains 'items' (books)
                if 'items' in data:
                    # Loop through each book in the response
                    for book in data['items']:
                        book_id = book['id']

                        # Check if the book has already been processed
                        if book_id in processed_book_ids:
                            continue

                        # Extract relevant information about the book
                        title = book['volumeInfo']['title']
                        authors = ', '.join(book['volumeInfo'].get('authors', ['Unknown Author']))
                        description = book['volumeInfo'].get('description', 'No description available')
                        average_rating = book['volumeInfo'].get('averageRating', 'Not rated')
                        price = book['saleInfo']['retailPrice']['amount'] if 'retailPrice' in book['saleInfo'] else 'Not available'
                        published_date = book['volumeInfo'].get('publishedDate', 'Not available')
                        categories = ', '.join(book['volumeInfo'].get('categories', []))
                        language = book['volumeInfo'].get('language', 'Unknown')

                        # Write book data to the CSV file
                        writer.writerow({
                            'BookID': book_id,
                            'Title': title,
                            'Authors': authors,
                            'Description': description,
                            'AverageRating': average_rating,
                            'Price': price,
                            'PublishedDate': published_date,
                            'Categories': categories,
                            'Language': language
                        })

                        # Add book ID to the set of processed book IDs
                        processed_book_ids.add(book_id)

            except requests.exceptions.HTTPError as err:
                print(f"Error: {err}")

if __name__ == "__main__":
    # Replace 'YOUR_API_KEY' with your actual Google Books API key
    api_key = 'AIzaSyBjNhuKCXrtS8ouxWERJqHhqY2tMUWi8lk'
    
    # Get user input for the book title or author to search
    search_query = input("Enter the book title or author: ")
    
    # Call the fetch_books function with the provided query and API key
    fetch_books(search_query, api_key, total_results=1000)


Enter the book title or author: data
Error: 429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v1/volumes?q=data&key=AIzaSyBjNhuKCXrtS8ouxWERJqHhqY2tMUWi8lk&maxResults=40&startIndex=0
Error: 429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v1/volumes?q=data&key=AIzaSyBjNhuKCXrtS8ouxWERJqHhqY2tMUWi8lk&maxResults=40&startIndex=40
Error: 429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v1/volumes?q=data&key=AIzaSyBjNhuKCXrtS8ouxWERJqHhqY2tMUWi8lk&maxResults=40&startIndex=120
Error: 429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v1/volumes?q=data&key=AIzaSyBjNhuKCXrtS8ouxWERJqHhqY2tMUWi8lk&maxResults=40&startIndex=240
Error: 429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v1/volumes?q=data&key=AIzaSyBjNhuKCXrtS8ouxWERJqHhqY2tMUWi8lk&maxResults=40&startIndex=840
Error: 429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v

## So far we have extracted the data here now we have to 

In [5]:
import pandas as pd

In [6]:
combined_df = pd.read_csv('combined_books_final_supercleaned.csv', encoding='latin-1')

In [7]:
df=combined_df
df

Unnamed: 0,BookID,Title,Authors,Description,AverageRating,Price,PublishedDate,Categories,Language
0,gO3s93lRZM8C,The Art of Flying,Judy Hoffman,Fortuna Dalliance is practical. Rational. Clev...,,6.99,10/29/2013,Juvenile Fiction,en
1,1fPoAgAAQBAJ,The Gondola Maker,Laura Morelli,Award-winning historical fiction set in 16th-c...,,12.99,03-03-2014,Fiction,en
2,mdV1zgEACAAJ,Fandom and the Law,Marc H. Greenberg,"""An analysis based on the two major iterations...",,,05-02-2022,Law,en
3,m-vCCgAAQBAJ,Art and Idea in the Novels of China MiAville,Carl Freedman,This book offers (in the first six chapters) c...,,,09-07-2015,Literary Criticism,en
4,lOqfEAAAQBAJ,The Art of Desire,"Stacey Abrams, Selena Montgomery",Trouble comes in threes... One doomed love aff...,,14.99,09-05-2023,Fiction,en
...,...,...,...,...,...,...,...,...,...
53966,5L7sAAAAMAAJ,Zoology 2,DK,"See the animal kingdom in all its glory, from ...",,19.99,08-10-2019,Science,en
53967,4AzfCgAAQBAJ,Zuleika Dobson,Max Beerbohm,This satirical novel of life and love at Oxfor...,,6.99,11/24/2015,Fiction,en
53968,Zx9_AQAACAAJ,Zumba,Beto Perez,TIRED OF LOGGING HOURS AT THE GYM AND NOT GETT...,,,2014,Physical fitness,en
53969,ZGUux-s69UoC,"Zurich International Chess Tournament, 1953",David Bronstein,Perceptive coverage of all 210 games from the ...,,9.99,15-04-2013,Games & Activities,en


In [8]:
import pandas as pd

# Assuming 'combined_df' is your DataFrame and 'Categories' is the column you want to stratify on
# Replace these with your actual DataFrame and column names

# Sample data loading (you might have already loaded the data using pd.read_csv)
# combined_df = pd.read_csv('combined_books_final_supercleaned.csv', encoding='latin-1')

# Specify the column for stratification
stratify_column = 'Categories'

# Calculate the number of samples for each category to achieve a 70% stratified sample
stratified_sample = combined_df.groupby(stratify_column, group_keys=False).apply(lambda x: x.sample(frac=0.80, random_state=42))

# Store the stratified sample in a new DataFrame
df = pd.DataFrame(stratified_sample)

# Reset the index of the final DataFrame
df.reset_index(drop=True, inplace=True)
df


Unnamed: 0,BookID,Title,Authors,Description,AverageRating,Price,PublishedDate,Categories,Language
0,k7izAAAAIAAJ,Our Natural History,Daniel B. Botkin,Botkin uses the experiences of Lewis and Clark...,,,1995,1804-1806 Lewis and Clark Expedition,en
1,fsN2AAAACAAJ,The American Civil War,Hans Halberstadt,What was it really like to be a soldier in the...,,,2001,1860-1869,en
2,LX8oOQAACAAJ,Causes of World War II,Jim Corrigan,"""Discusses and explains the events of the 1920...",,,2005,1941,en
3,2MkUAQAAIAAJ,Contemporary Popular Writers,Dave Mote,"""Included are authors, both living and dead, w...",,,1997,20th century,en
4,752857932,The Pusher,Ed McBain,Two a.m. in the bitter cold of winter: the you...,3.89,,2003,87th Precinct (Imaginary place),
...,...,...,...,...,...,...,...,...,...
37264,hBEk5FLHeb0C,Zen,Peter Oldmeadow,Zen is a form of Buddhism with origins in the ...,,,2001,Zen Buddhism,en
37265,EPXEoMtND24C,The Other Side of Zen,Duncan RyA«ken Williams,"""Popular understanding of Zen Buddhism typical...",,,2005,Zen Buddhism,en
37266,xSAOAQAAMAAJ,African Laughter,Doris Lessing,A rich and penetrating portrait of Lessing's h...,,,1992,Zimbabwe,en
37267,0T0_AAAAYAAJ,Introductory Zoology,Lincoln Coles Pettit,A textbook introducing the student to zoology ...,,,1962,Zoologia,en


In [9]:
from cleantext import clean

def clean_text(text):
    return clean(str(text), fix_unicode=True, to_ascii=True, lower=True, no_punct=False)

# Apply the clean_text function to each element in the DataFrame
df_cleaned = df.applymap(clean_text)

# Display the cleaned DataFrame
print(df_cleaned)

             BookID                         Title                     Authors  \
0      k7izaaaaiaaj           our natural history            daniel b. botkin   
1      fsn2aaaacaaj        the american civil war            hans halberstadt   
2      lx8ooqaacaaj        causes of world war ii                jim corrigan   
3      2mkuaqaaiaaj  contemporary popular writers                   dave mote   
4         752857932                    the pusher                   ed mcbain   
...             ...                           ...                         ...   
37264  hbek5flheb0c                           zen             peter oldmeadow   
37265  epxeomtnd24c         the other side of zen  duncan rya..."ken williams   
37266  xsaoaqaamaaj              african laughter               doris lessing   
37267  0t0_aaaayaaj          introductory zoology        lincoln coles pettit   
37268     756616344                        animal  david burnie;don e. wilson   

                           

In [10]:
df_cleaned.to_csv('cleaned_data.csv', index=False)

In [11]:
df=pd.read_csv('cleaned_data.csv')
df

Unnamed: 0,BookID,Title,Authors,Description,AverageRating,Price,PublishedDate,Categories,Language
0,k7izaaaaiaaj,our natural history,daniel b. botkin,botkin uses the experiences of lewis and clark...,,,1995,1804-1806 lewis and clark expedition,en
1,fsn2aaaacaaj,the american civil war,hans halberstadt,what was it really like to be a soldier in the...,,,2001,1860-1869,en
2,lx8ooqaacaaj,causes of world war ii,jim corrigan,"""discusses and explains the events of the 1920...",,,2005,1941,en
3,2mkuaqaaiaaj,contemporary popular writers,dave mote,"""included are authors, both living and dead, w...",,,1997,20th century,en
4,752857932,the pusher,ed mcbain,two a.m. in the bitter cold of winter: the you...,3.89,,2003,87th precinct (imaginary place),
...,...,...,...,...,...,...,...,...,...
37264,hbek5flheb0c,zen,peter oldmeadow,zen is a form of buddhism with origins in the ...,,,2001,zen buddhism,en
37265,epxeomtnd24c,the other side of zen,"duncan rya...""ken williams","""popular understanding of zen buddhism typical...",,,2005,zen buddhism,en
37266,xsaoaqaamaaj,african laughter,doris lessing,a rich and penetrating portrait of lessing's h...,,,1992,zimbabwe,en
37267,0t0_aaaayaaj,introductory zoology,lincoln coles pettit,a textbook introducing the student to zoology ...,,,1962,zoologia,en


### Text Preprocessing:
1. Tokenization: Split the text into individual words or phrases (tokens).
2. Lowercasing: Convert all text to lowercase to ensure consistency.
3. Removing Stop Words: Eliminate common words that don't contribute much to the meaning.
4. Removing Punctuation and Special Characters: Clean the text from non-alphabetic characters.
5. Lemmatization or Stemming: Reduce words to their base or root form.

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import warnings

warnings.filterwarnings("ignore")

import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

# Here we have created a function for performing all the necessary processing on the textual data before modeling 
def preprocess_text(text):
    if pd.isnull(text):
        return ''  # Return empty string for NaN values
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Combine tokens back to text
    processed_text = ' '.join(tokens)

    return processed_text


df['Processed_Authors'] = df['Authors'].apply(preprocess_text)


df['Categories'] = df['Categories'].fillna('')


df['Processed_Categories'] = df['Categories'].apply(preprocess_text)

# Combine features into a single column
df['Combined_Features'] = (
    df['Description'] +
    ' ' +
    df['Processed_Authors'] +
    ' ' +
    df['Processed_Categories']
)


df['Combined_Features'] = df['Combined_Features'].fillna('')

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Combined_Features'])


cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# This function is made to input the query and get recommendations
def recommend_books(search_query, cosine_sim=cosine_sim):
    # This line is for processing the search query
    processed_query = preprocess_text(search_query)

    # Creating a temporary DataFrame with the search query
    query_df = pd.DataFrame({'Combined_Features': [processed_query]})

    # applying TF-IDF Vectorization on the query
    query_vector = tfidf_vectorizer.transform(query_df['Combined_Features'])

    # Calculating the cosine similarity between the query and all books
    sim_scores = linear_kernel(query_vector, tfidf_matrix).flatten()

    # Get indices of the most relavant 10 books based on similarity
    book_indices = sim_scores.argsort()[::-1][:10]

    return df['Title'].iloc[book_indices]



In [14]:
# Example usage:
search_query = "python"
recommended_books = recommend_books(search_query)
print(recommended_books)

5826           non-programmers tutorial for python 2 and 3
5635                          python data science handbook
24687                               python for informatics
5741                                        python workout
5724                      python crash course, 2nd edition
5901                              python for data analysis
5885                                       python for kids
5872               think like a programmer, python edition
24585            the complete monty python's flying circus
5722     pcap - certified associate in python programmi...
Name: Title, dtype: object



### TF-IDF Vectorization:
Use TF-IDF Vectorization to convert the preprocessed text into numerical vectors.
   
### Handling NaN Values:
Replace any NaN values in the 'combined_features' column with an empty string.

### Cosine Similarity Calculation:
Calculate cosine similarity between each pair of books based on their TF-IDF vectors using the linear kernel.

### Building the Recommendation System:
Create a function, `get_recommendations`, to recommend books based on cosine similarity.
Retrieve the index of the input book title and calculate similarity scores with all other books.
Sort the scores in descending order to identify the most similar books.
Return the top 10 recommended book titles (excluding the input book) based on their similarity scores.

### Example Usage:
Demonstrate how to use the recommendation function for a specific book title.

In [15]:
import pickle

# Save the model
model_data = {
    'df': df,
    'tfidf_vectorizer': tfidf_vectorizer,
    'tfidf_matrix': tfidf_matrix,
    'cosine_sim': cosine_sim
}

with open('book_recommendation_model.pkl', 'wb') as model_file:
    pickle.dump(model_data, model_file)

# Load the model
with open('book_recommendation_model.pkl', 'rb') as model_file:
    loaded_model_data = pickle.load(model_file)

In [16]:
# Extract necessary components from the loaded model data
loaded_df = loaded_model_data['df']
loaded_tfidf_vectorizer = loaded_model_data['tfidf_vectorizer']
loaded_tfidf_matrix = loaded_model_data['tfidf_matrix']
loaded_cosine_sim = loaded_model_data['cosine_sim']

# Example query after loading the model
search_query = "java"
recommended_books = recommend_books(search_query, cosine_sim=loaded_cosine_sim)
print(recommended_books)

24826                                           visual j++
1887                                          future tense
5912                                     the java tutorial
3387     programming open service gateways with java em...
887                  jrun web application construction kit
8258     portuguese eurasian communities in southeast asia
5549            data structures and abstractions with java
5675                                      kotlin in action
5563                                                  java
5716                                       netty in action
Name: Title, dtype: object
