In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the books CSV; index_col=0 makes the file's first column the row index.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# only run where this exact file location exists.
df = pd.read_csv(
    filepath_or_buffer="/home/aca/Documents/S6_Lab/Projects/libraryDataAnalysis/pattern_mining/unq_bks.csv",
    index_col=0,
)


In [3]:
# Searching books in dataset to find exact matches. 
def search_exact(data = None, auth="", pub="", title=""):
    """Return the rows of ``data`` that exactly match every supplied field.

    Fields left as the empty string are ignored; the remaining ones are
    combined with AND logic using plain equality on the matching column.

    Args:
        data: DataFrame to search. It must contain the column for each
            supplied field ("title", "publisher", "author").
        auth: Exact value to match in the "author" column.
        pub: Exact value to match in the "publisher" column.
        title: Exact value to match in the "title" column.

    Returns:
        A DataFrame with the matching rows (possibly empty). ``None`` is
        returned, after printing a message, when ``data`` is None or when
        no field was supplied.
    """
    if data is None:
        print("No dataframe selected")
        return

    # Column -> wanted value, keeping only the fields the caller filled in.
    candidates = (("title", title), ("publisher", pub), ("author", auth))
    criteria = {column: wanted for column, wanted in candidates if wanted != ""}

    if not criteria:
        print("You need to specify atleast one field")
        return

    # Narrow the frame one criterion at a time; successive filters act as AND.
    matches = data
    for column, wanted in criteria.items():
        matches = matches.loc[matches[column] == wanted]

    return matches


In [4]:
# Verification of search_exact. Status: Finished 
# Look up one book using all three fields at once.
res_exact = search_exact(
    data=df,
    auth="SILBERCHATZ & GALVIN",
    pub="WILEY INDIA",
    title="OPERATING SYSTEM CONCEPTS",
)

# A result with zero rows means no book matched every field.
if res_exact.shape[0] > 0:
    print(res_exact)
else:
    print("No exact matches found. Check your spelling")

                          title                author    publisher
3130  OPERATING SYSTEM CONCEPTS  SILBERCHATZ & GALVIN  WILEY INDIA


In [5]:
# function to combine features
def combine_features(data=None, valid_args=None):
    """Append a query row to ``data`` and build one text feature per row.

    ``valid_args`` (column name -> scalar value) is appended as a new last
    row, the rows are renumbered from 0, and a "combined" column is added
    that holds the values of the ``valid_args`` columns in key order, each
    followed by a single space.

    Missing values contribute an empty string and non-string values are
    converted with ``str``, so a stray NaN or number in the dataset no
    longer aborts the whole computation.

    Args:
        data: DataFrame containing (at least) the columns named by the keys
            of ``valid_args``.
        valid_args: dict mapping column name to the scalar value of the
            query row for that column.

    Returns:
        A new DataFrame (the input is not modified) with the extra row and
        the "combined" column. ``None`` is returned, after printing a
        message, when either argument is None.
    """
    if data is None or valid_args is None:
        print("No dataframe selected/No arguments passed to combine")
        return

    _keys = list(valid_args.keys())

    # If you're passing scalar values, you have to pass an index
    # https://stackoverflow.com/a/17840195/12616968
    _row = pd.DataFrame(valid_args, index=[0])

    # ignore_index renumbers rows 0..n, so the query row is always the last one.
    data = pd.concat([data, _row], axis=0, ignore_index=True)

    # Build the feature text column by column instead of cell by cell.
    combined = pd.Series("", index=data.index)
    for key in _keys:
        column = data[key]
        # Blank out missing cells first so they do not become the text "nan".
        text = column.astype(object).where(column.notna(), "").astype(str)
        combined = combined + text + " "

    data["combined"] = combined
    return data

  


# Search similar books based on cosine similarity.
def search_similar(data=None, auth="", pub="", title=""):
    """Rank the rows of ``data`` by cosine similarity to a described book.

    The supplied fields are appended to the dataset as an extra "query" row
    (via ``combine_features``), every row's combined text is turned into a
    word-count vector, and each dataset row is scored by its cosine
    similarity to the query row.

    Args:
        data: DataFrame containing the column for each supplied field
            ("title", "publisher", "author").
        auth: Author text of the book to look for ("" to ignore).
        pub: Publisher text of the book to look for ("" to ignore).
        title: Title text of the book to look for ("" to ignore).

    Returns:
        A list of ``(row_position, score)`` tuples, one per row of ``data``,
        sorted by descending score. ``row_position`` is the 0-based position
        of the row in ``data`` (not its index label). ``None`` is returned,
        after printing a message, when ``data`` is None or no field was
        supplied.
    """
    if data is None:
        print("No data frame selected")
        return

    valid_args = {}

    if title != "":
        valid_args["title"] = title
    if pub != "":
        valid_args["publisher"] = pub
    if auth != "":
        valid_args["author"] = auth

    # With no fields every document would be empty and the vectorizer
    # cannot build a vocabulary, so stop early with a clear message.
    if not valid_args:
        print("You need to specify at least one field")
        return

    # Use the frame that was passed in rather than a notebook-level global.
    _temp = combine_features(data=data[list(valid_args.keys())], valid_args=valid_args)

    cm = CountVectorizer().fit_transform(_temp["combined"])

    index = cm.shape[0] - 1  # position of the appended query row

    # Only the query row's similarities are needed, so compare that single
    # row against all rows instead of building the full n x n matrix.
    scores = cosine_similarity(cm[index:], cm)[0]

    # Drop the query row by position. Slicing off the first sorted entry is
    # not safe: an identical dataset row ties with the query row's own score
    # and, because sorting is stable, would be the entry that gets removed.
    candidates = [(pos, score) for pos, score in enumerate(scores) if pos != index]

    # Sort scores in descending order. More score means higher similarity
    return sorted(candidates, key=lambda x: x[1], reverse=True)
    
    


In [None]:
# Score every book in df against the described book.
res_similar = search_similar(
    data=df, 
    title="10 MINUTE GUIDE TO MICROSOFT EXCEL 2000",
    auth="FULTON,JENNIFER",
    pub="PHI"
    )

# Preview only the best few (index, score) pairs; the full list holds one
# entry per scored row and would flood the cell output.
res_similar[:15]

In [7]:
# Printing similar results
j = 0
count = 15  # maximum number of books to print

for i in res_similar:

    try:
        # i[0] is a row position: the scored frame is renumbered from 0
        # before scoring, so look the book up positionally, and only once.
        book = df.iloc[i[0]]
    except IndexError:
        # Index Error happens because we have added an additional row that wasn't 
        # originally in dataframe. In that case we skip and continue with next index.
        continue

    bk_title = book['title']
    bk_author = book['author']
    bk_pub = book['publisher']

    print(f"""
    Number: {j+1}
    Title:  {bk_title} 
    Author: {bk_author} 
    Publisher: {bk_pub} 
    Score: {i[1]}
    --------------------------------
    """)
    j += 1
    if (j >= count):
        break



    Number: 1
    Title:  10 MINUTE GUIDE TO LINUX 
    Author: ROY J 
    Publisher: PHI 
    Score: 0.5976143046671968
    --------------------------------
    

    Number: 2
    Title:  10 MINUTE GUIDE TO YEAR 2000 CRISIS SURVIVAL 
    Author: PAULSON,ED 
    Publisher: PHI 
    Score: 0.5720775535473555
    --------------------------------
    

    Number: 3
    Title:  USING MICROSOFT EXCEL 2000 
    Author: KELLY,JULIA 
    Publisher: PHI 
    Score: 0.47809144373375745
    --------------------------------
    

    Number: 4
    Title:  WINDOWS 2000 SERVER 
    Author: MICROSOFT 
    Publisher: PHI 
    Score: 0.4242640687119285
    --------------------------------
    

    Number: 5
    Title:  COMPLETE GUIDE TO MICROSOFT WINDOWS 2000 SERVER 
    Author: NORTON,PETER 
    Publisher: TECHMEDIA 
    Score: 0.4
    --------------------------------
    

    Number: 6
    Title:  MICROSOFT OFFICE 2000 
    Author: HABRAKEN,JOE 
    Publisher: PHI 
    Score: 0.3872983346207417
