# Importing Packages: 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# NOTE(review): this silences every warning category for the whole session, so
# deprecation notices and numeric RuntimeWarnings are hidden as well — consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Reading the cleaned dataset:

In [2]:
# Load the cleaned book dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [3]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV — confirm).
# Reassigning with errors="ignore" instead of inplace=True keeps this cell
# idempotent: re-running it once the column is gone no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [47]:
# Preview the first 30 rows to sanity-check the loaded columns.
df.head(n=30)

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,num_pages,ratings_count
0,Gilead,A Novel,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,The One Tree,A Novel,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,Rage of angels,A Novel,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,The Four Loves,A Novel,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0
5,The Problem of Pain,A Novel,Clive Staples Lewis,Christian life,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0
6,An Autobiography,A Novel,Agatha Christie,"Authors, English",Donation.,1977.0,4.27,560.0,3975.0
7,Empires of the Monsoon,A History of the Indian Ocean and Its Invaders,Richard Hall,"Africa, East",Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0
8,The Gap Into Madness,Chaos and Order,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0
9,Master of the Game,A Novel,Sidney Sheldon,Adventure stories,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0


# Creating the TF-IDF title model:

In [34]:
# Fit a TF-IDF model on the book titles; row i of X_new corresponds to row i of df.
# TfidfVectorizer lowercases its input by default, so no manual .lower() is needed.
# Missing titles are mapped to '' so a NaN cannot crash the fit; such rows get an
# all-zero vector and therefore never match a query.
vectorizer = TfidfVectorizer()
X_new = vectorizer.fit_transform(df['title'].fillna('').astype(str))

In [85]:
# Project the query into the fitted TF-IDF space and score it against every title.
query = 'space'
query_vec = vectorizer.transform([query])
similarity = cosine_similarity(X=query_vec, Y=X_new).ravel()  # 1-D: one score per row of df

In [86]:
# Rank every title by similarity, best match first
# (argsort is ascending, so the scores are negated to get descending order).
test = (-similarity).argsort()
result = df.iloc[test]
result['title'].head(5)

4535                           Space
5086            The Poetics of Space
5679              The Crack in Space
6494               Ministry of Space
4217    The Nature of Space and Time
Name: title, dtype: object

In [87]:
# Number of rows returned: the naive ranking keeps every book, including
# those whose similarity to the query is zero.
result.shape[0]

6810

# Better model: keep only titles with non-zero similarity

In [88]:
# Discard titles with zero similarity to the query, then rank what is left.
match_idx = np.flatnonzero(similarity)         # row positions with non-zero similarity
indices = (-similarity[match_idx]).argsort()   # descending order within the matches
correct_indices = match_idx[indices]           # map back to row positions in df
result = df.iloc[correct_indices]

result['title'].head(5)

4535                           Space
5086            The Poetics of Space
5679              The Crack in Space
6494               Ministry of Space
4217    The Nature of Space and Time
Name: title, dtype: object

In [84]:
# Popularity score: natural log of (average rating x number of ratings).
# The product is clipped to >= 1 before taking the log so that books with zero
# ratings get a score of 0 instead of -inf (log(0)); this keeps
# `score * similarity` finite and non-negative. Products >= 1 are unaffected,
# and missing ratings still propagate as NaN.
df['score'] = np.log((df['average_rating'] * df['ratings_count']).clip(lower=1))

In [90]:
# Re-rank the matching titles by popularity-weighted similarity (score x similarity).
result = df.iloc[correct_indices]
overall = result['score'].mul(similarity[correct_indices])
result.loc[overall.sort_values(ascending=False).index].head(5)

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,num_pages,ratings_count,score
5086,The Poetics of Space,A Novel,Gaston Bachelard,Philosophy,Shows how our perceptions of home shape our th...,1994.0,4.19,282.0,6625.0,10.231306
5679,The Crack in Space,A Novel,Philip K. Dick,Fiction,All kinds of problems erupt when a repairman d...,1966.0,3.46,188.0,1791.0,8.731798
6494,Ministry of Space,A Novel,Warren Ellis;Chris Weston;Laura DePuy,Imperialism,"In this alternative history tale, the British ...",2005.0,3.71,100.0,1298.0,8.479612
4535,Space,A Novel,James Albert Michener,Fiction,No Description For that book,1992.0,3.97,815.0,35.0,4.934114
5878,"Have Space Suit, Will Travel",A Novel,Robert A. Heinlein,Fiction,"Teenager Clifford ""Kip"" Russell wins second pr...",2005.0,3.89,276.0,22399.0,11.375181


# Search function wrapping all the steps above:

In [93]:
def search_engine(word, limit=5):
    """Return the best-matching books for a free-text title query.

    A book matches when the TF-IDF cosine similarity between its title and the
    query is non-zero. Matches are ranked by ``score * similarity``, highest
    first.

    Parameters
    ----------
    word : str
        Query text. It is only lowercased here; tokenisation is left to the
        fitted ``vectorizer`` so the query is split exactly like the titles.
    limit : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``limit`` rows of ``df``, or the string ``'result not found'``
        when no title matches the query.

    Notes
    -----
    Reads the notebook-level ``vectorizer``, ``X_new`` and ``df`` (which must
    already contain the ``score`` column).
    """
    # Do not strip characters by hand. Deleting everything outside [a-zA-Z0-9 ]
    # glued words together ("spider's" -> "spiders") and dropped accented
    # letters, so the query tokens no longer lined up with the title tokens
    # produced by the vectorizer's own analyzer.
    query_vec = vectorizer.transform([word.lower()])
    similarity = cosine_similarity(query_vec, X_new).flatten()

    # Row positions of titles with non-zero similarity to the query.
    filtered = np.where(similarity != 0)[0]
    if not len(filtered):
        return 'result not found'

    # Order the matches by similarity first, then re-rank by weighted score.
    indices = np.argsort(-similarity[filtered])
    correct_indices = filtered[indices]
    result = df.iloc[correct_indices]

    overall = result['score'] * similarity[correct_indices]

    return result.loc[overall.sort_values(ascending=False).index].head(limit)

In [105]:
# Example query using the default result limit.
search_engine(word='fiction')

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,num_pages,ratings_count,score
4005,The Art of Fiction,Notes on Craft for Young Writers,John Gardner,Language Arts & Disciplines,A guide to creative writing examines diverse f...,1991.0,4.0,224.0,4877.0,9.87858
1196,Complete Shorter Fiction,A Novel,Oscar Wilde,Fiction,"For the first time in one volume, this complet...",1998.0,4.16,288.0,1167.0,8.487707
1275,Non-Fiction,A Novel,Chuck Palahniuk,Curiosities and wonders,"Chuck Palahniuk's world has been, well, differ...",2004.0,3.57,233.0,212.0,6.629152
6261,Write Great Fiction - Plot & Structure,A Novel,James Scott Bell,Language Arts & Disciplines,Craft an Engaging Plot How does plot influence...,2004.0,4.1,234.0,4304.0,9.778287
2435,The Norton Anthology of Short Fiction,A Novel,Ronald Verlin Cassill;Richard Bausch,Fiction,The classroom standard for readers and aspirin...,2006.0,4.17,1776.0,645.0,7.897166
