In [1]:
import numpy as np 
import pandas as pd 
import os

'''Originally on kaggle.com Dataset loaded directly from their interface'''
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# !pip install sentence-transformers
if os.path.exists('../input/book-depository-dataset'):
    path_prefix = '../input/book-depository-dataset/{}.csv'
else:
    path_prefix = '../export/kaggle/{}.csv'

/kaggle/input/book-depository-dataset/formats.csv
/kaggle/input/book-depository-dataset/places.csv
/kaggle/input/book-depository-dataset/categories.csv
/kaggle/input/book-depository-dataset/authors.csv
/kaggle/input/book-depository-dataset/dataset.csv
/kaggle/input/descr-title/Descr_Title.csv
/kaggle/input/finaddataset-nodups-givenv/FinalDataset_npDups (1).csv


In [2]:
# Read CSV, get column names and % of null values in each features
df = pd.read_csv(r"/kaggle/input/book-depository-dataset/dataset.csv")
df.columns
# df.columns,df.isnull().sum()/len(df)

Index(['authors', 'bestsellers-rank', 'categories', 'description',
       'dimension-x', 'dimension-y', 'dimension-z', 'edition',
       'edition-statement', 'for-ages', 'format', 'id', 'illustrations-note',
       'image-checksum', 'image-path', 'image-url', 'imprint', 'index-date',
       'isbn10', 'isbn13', 'lang', 'publication-date', 'publication-place',
       'rating-avg', 'rating-count', 'title', 'url', 'weight'],
      dtype='object')

In [3]:
# Remove irrelevant features
df = df[['authors', 'categories', 'description', 'illustrations-note',
       'image-path', 'image-url','isbn10', 'isbn13', 'lang', 'publication-date','title', 'url']]
df.columns,df.isnull().sum()/len(df)

(Index(['authors', 'categories', 'description', 'illustrations-note',
        'image-path', 'image-url', 'isbn10', 'isbn13', 'lang',
        'publication-date', 'title', 'url'],
       dtype='object'),
 authors               0.000000
 categories            0.000000
 description           0.072191
 illustrations-note    0.678672
 image-path            0.000024
 image-url             0.000024
 isbn10                0.000000
 isbn13                0.000000
 lang                  0.054451
 publication-date      0.002346
 title                 0.000000
 url                   0.000000
 dtype: float64)

In [4]:
#Drop all Null-valued rows
print(len(df))
df = df.dropna()
print(len(df))
print(df.description.isnull().values.any())

1109383
336197
False


In [5]:
# Filter entries with larger than 40 and only english novels
print(len(df))
df = df[df['description'].apply(lambda x: len(x)>40)]
print(len(df))
df = df[df['lang'].apply(lambda x: x=="en")]
print(len(df))

336197
329890
316233


In [6]:
#Drop duplicates wrt description feature
df = df.drop_duplicates(subset=['description'])
print(len(df))
df = df.drop_duplicates(subset=['title'])
print(len(df))#,len(df2))

279483
270128


# **Get category and Author info from other CSV**

In [22]:
#Get the datas to expand auth and category
df3 = df.sample(n=50000)
len(df3)
auth = pd.read_csv(r"/kaggle/input/book-depository-dataset/authors.csv")
cats = pd.read_csv(r"/kaggle/input/book-depository-dataset/categories.csv")

In [23]:
#Need to convert the list of IDs to their respective joined strings 
df3.head(5),len(df3)

(                                        authors  \
 496398                                 [327297]   
 930134                 [563289, 563290, 563291]   
 939056  [571891, 425631, 571892, 571893, 52449]   
 487642                                  [53690]   
 588007                                 [342299]   
 
                                         categories  \
 496398            [411, 2718, 661, 865, 947, 2761]   
 930134              [1520, 1694, 1722, 1833, 1835]   
 939056  [1294, 1593, 1603, 1622, 1722, 1738, 1887]   
 487642        [1402, 1456, 2772, 2791, 2831, 2836]   
 588007                          [2802, 2841, 2452]   
 
                                               description  \
 496398  Emanating from an international conference on ...   
 930134  This is the first book aimed at development of...   
 939056  Pearl millet is mainly used for animal and pou...   
 487642  Young or old, sedentary or athletic; it is lik...   
 588007  Proud Peacock Journal - A 6x9" Size

In [24]:
# Dict {author_id : author_name}
print(auth.head(10))
auths = dict(auth.values)

   author_id                  author_name
0       9561                          NaN
1     451324                # House Press
2     454250                # Petal Press
3     249724              #GARCIA MIGUELE
4     287710            #Worldlcass Media
5     156535        #shakeback Publishing
6      74042  &  Rueckert  Elkins McCarty
7     172310                & Bacon Allyn
8      95712       & Berry Roskin & Berry
9     104707  & Bonchek Shepsle & Bonchek


In [25]:
# Dict of {category_id : category_name}
print(cats.head(5))
cats = dict(cats.values)

   category_id                                category_name
0         1998                             .Net Programming
1          176  20th Century & Contemporary Classical Music
2         3291  20th Century & Contemporary Classical Music
3         2659      20th Century History: C 1900  To C 2000
4         2661          21st Century History: From C 2000 -


In [26]:
# Convert the string to parsable list format
import ast
df3.authors = df3.authors.apply(lambda x: ast.literal_eval(x))

In [27]:
df3.categories = df3.categories.apply(lambda x: ast.literal_eval(x))

In [28]:
# Convert IDs to strings
def expand_auth(ls,dictn):
#     print(ls)
    if len(ls)==1:
        return dictn[ls[0]]
    s=""
    for x in ls:
        if type(dictn[x])==str:
            s+= dictn[x] + " ; "
    return s
df3.authors = df3.authors.apply(lambda x: expand_auth(x,auths))
df3.head(5)                           


Unnamed: 0,authors,categories,description,illustrations-note,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url
496398,Takyiwaa Manuh,"[411, 2718, 661, 865, 947, 2761]",Emanating from an international conference on ...,"1, black & white illustrations",full/d/6/c/d6c66c203d3a18ab6a03e643872814b9fcf...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,9988550790,9789988550790,en,2005-12-31 00:00:00,At Home in the World : International Migration...,/At-Home-World-Takyiwaa-Manuh/9789988550790
930134,Nadezhda Goncharova ; Jean-Louis Morel ; Guill...,"[1520, 1694, 1722, 1833, 1835]",This is the first book aimed at development of...,"XI, 346 p.",full/5/f/9/5f91944ea9009920769d0eb8a10d69d719a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1402046871,9781402046872,en,2006-06-15 00:00:00,Phytoremediation of Metal-Contaminated Soils,/Phytoremediation-Metal-Contaminated-Soils-Jea...
939056,Kawaljit Singh Sandhu ; Sneh Punia ; Maninder ...,"[1294, 1593, 1603, 1622, 1722, 1738, 1887]",Pearl millet is mainly used for animal and pou...,"11 Tables, black and white; 19 Illustrations, ...",full/6/4/0/6404c37d6d2d1eabedc9d240f7d39983254...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,367354861,9780367354862,en,2020-03-20 00:00:00,"Pearl Millet : Properties, Functionality and i...",/Pearl-Millet-Sneh-Punia/9780367354862
487642,Dr. Jwing-Ming Yang,"[1402, 1456, 2772, 2791, 2831, 2836]","Young or old, sedentary or athletic; it is lik...","202 Halftones, black and white",full/9/2/6/92661e2ee3699c7c3dcbeab4e0e7afa571d...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1594390258,9781594390258,en,2004-09-01 00:00:00,Back Pain Relief : Chinese Qigong for Healing ...,/Back-Pain-Relief-Dr-Jwing-Ming-Yang/978159439...
588007,Quipoppe Publications,"[2802, 2841, 2452]","Proud Peacock Journal - A 6x9"" Size Journaling...","Illustrations, black and white",full/a/f/8/af8d1a39061669137488ab6f0de4590a31a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1985202581,9781985202580,en,2018-02-08 00:00:00,Proud Peacock Lined Journal : Medium Lined Jou...,/Proud-Peacock-Lined-Journal-Quipoppe-Publicat...


In [29]:
df3.categories = df3.categories.apply(lambda x: expand_auth(x,auths))
df3.head(5) 

Unnamed: 0,authors,categories,description,illustrations-note,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url
496398,Takyiwaa Manuh,John Sydes ; Gerunda B. Hughes ; Alice Hansen ...,Emanating from an international conference on ...,"1, black & white illustrations",full/d/6/c/d6c66c203d3a18ab6a03e643872814b9fcf...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,9988550790,9789988550790,en,2005-12-31 00:00:00,At Home in the World : International Migration...,/At-Home-World-Takyiwaa-Manuh/9789988550790
930134,Nadezhda Goncharova ; Jean-Louis Morel ; Guill...,Marcy Pavord ; Dennis Carter ; Moosewood Colle...,This is the first book aimed at development of...,"XI, 346 p.",full/5/f/9/5f91944ea9009920769d0eb8a10d69d719a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1402046871,9781402046872,en,2006-06-15 00:00:00,Phytoremediation of Metal-Contaminated Soils,/Phytoremediation-Metal-Contaminated-Soils-Jea...
939056,Kawaljit Singh Sandhu ; Sneh Punia ; Maninder ...,Tami Bertagna ; Shu F. Ho ; Fatima Pirbhai-Ill...,Pearl millet is mainly used for animal and pou...,"11 Tables, black and white; 19 Illustrations, ...",full/6/4/0/6404c37d6d2d1eabedc9d240f7d39983254...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,367354861,9780367354862,en,2020-03-20 00:00:00,"Pearl Millet : Properties, Functionality and i...",/Pearl-Millet-Sneh-Punia/9780367354862
487642,Dr. Jwing-Ming Yang,Canon Franco William Alberto ; Gary Spruce ; E...,"Young or old, sedentary or athletic; it is lik...","202 Halftones, black and white",full/9/2/6/92661e2ee3699c7c3dcbeab4e0e7afa571d...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1594390258,9781594390258,en,2004-09-01 00:00:00,Back Pain Relief : Chinese Qigong for Healing ...,/Back-Pain-Relief-Dr-Jwing-Ming-Yang/978159439...
588007,Quipoppe Publications,Tim Hawcroft ; Nik Hynek ; Garry Cooper ;,"Proud Peacock Journal - A 6x9"" Size Journaling...","Illustrations, black and white",full/a/f/8/af8d1a39061669137488ab6f0de4590a31a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1985202581,9781985202580,en,2018-02-08 00:00:00,Proud Peacock Lined Journal : Medium Lined Jou...,/Proud-Peacock-Lined-Journal-Quipoppe-Publicat...


In [30]:
df3.to_csv("All_info.csv")

In [31]:
df3.head(5)

Unnamed: 0,authors,categories,description,illustrations-note,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url
496398,Takyiwaa Manuh,John Sydes ; Gerunda B. Hughes ; Alice Hansen ...,Emanating from an international conference on ...,"1, black & white illustrations",full/d/6/c/d6c66c203d3a18ab6a03e643872814b9fcf...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,9988550790,9789988550790,en,2005-12-31 00:00:00,At Home in the World : International Migration...,/At-Home-World-Takyiwaa-Manuh/9789988550790
930134,Nadezhda Goncharova ; Jean-Louis Morel ; Guill...,Marcy Pavord ; Dennis Carter ; Moosewood Colle...,This is the first book aimed at development of...,"XI, 346 p.",full/5/f/9/5f91944ea9009920769d0eb8a10d69d719a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1402046871,9781402046872,en,2006-06-15 00:00:00,Phytoremediation of Metal-Contaminated Soils,/Phytoremediation-Metal-Contaminated-Soils-Jea...
939056,Kawaljit Singh Sandhu ; Sneh Punia ; Maninder ...,Tami Bertagna ; Shu F. Ho ; Fatima Pirbhai-Ill...,Pearl millet is mainly used for animal and pou...,"11 Tables, black and white; 19 Illustrations, ...",full/6/4/0/6404c37d6d2d1eabedc9d240f7d39983254...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,367354861,9780367354862,en,2020-03-20 00:00:00,"Pearl Millet : Properties, Functionality and i...",/Pearl-Millet-Sneh-Punia/9780367354862
487642,Dr. Jwing-Ming Yang,Canon Franco William Alberto ; Gary Spruce ; E...,"Young or old, sedentary or athletic; it is lik...","202 Halftones, black and white",full/9/2/6/92661e2ee3699c7c3dcbeab4e0e7afa571d...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1594390258,9781594390258,en,2004-09-01 00:00:00,Back Pain Relief : Chinese Qigong for Healing ...,/Back-Pain-Relief-Dr-Jwing-Ming-Yang/978159439...
588007,Quipoppe Publications,Tim Hawcroft ; Nik Hynek ; Garry Cooper ;,"Proud Peacock Journal - A 6x9"" Size Journaling...","Illustrations, black and white",full/a/f/8/af8d1a39061669137488ab6f0de4590a31a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1985202581,9781985202580,en,2018-02-08 00:00:00,Proud Peacock Lined Journal : Medium Lined Jou...,/Proud-Peacock-Lined-Journal-Quipoppe-Publicat...


In [32]:
%cd /kaggle/working
from IPython.display import FileLink
FileLink(r'All_info.csv')
# Click the below blue-d .csv link to dowload

/kaggle/working


# Experimenting and sentence Transformers 

In [33]:
df = pd.read_csv(r"./All_info.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,authors,categories,description,illustrations-note,image-path,image-url,isbn10,isbn13,lang,publication-date,title,url
0,496398,Takyiwaa Manuh,John Sydes ; Gerunda B. Hughes ; Alice Hansen ...,Emanating from an international conference on ...,"1, black & white illustrations",full/d/6/c/d6c66c203d3a18ab6a03e643872814b9fcf...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,9988550790,9789988550790,en,2005-12-31 00:00:00,At Home in the World : International Migration...,/At-Home-World-Takyiwaa-Manuh/9789988550790
1,930134,Nadezhda Goncharova ; Jean-Louis Morel ; Guill...,Marcy Pavord ; Dennis Carter ; Moosewood Colle...,This is the first book aimed at development of...,"XI, 346 p.",full/5/f/9/5f91944ea9009920769d0eb8a10d69d719a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1402046871,9781402046872,en,2006-06-15 00:00:00,Phytoremediation of Metal-Contaminated Soils,/Phytoremediation-Metal-Contaminated-Soils-Jea...
2,939056,Kawaljit Singh Sandhu ; Sneh Punia ; Maninder ...,Tami Bertagna ; Shu F. Ho ; Fatima Pirbhai-Ill...,Pearl millet is mainly used for animal and pou...,"11 Tables, black and white; 19 Illustrations, ...",full/6/4/0/6404c37d6d2d1eabedc9d240f7d39983254...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,367354861,9780367354862,en,2020-03-20 00:00:00,"Pearl Millet : Properties, Functionality and i...",/Pearl-Millet-Sneh-Punia/9780367354862
3,487642,Dr. Jwing-Ming Yang,Canon Franco William Alberto ; Gary Spruce ; E...,"Young or old, sedentary or athletic; it is lik...","202 Halftones, black and white",full/9/2/6/92661e2ee3699c7c3dcbeab4e0e7afa571d...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1594390258,9781594390258,en,2004-09-01 00:00:00,Back Pain Relief : Chinese Qigong for Healing ...,/Back-Pain-Relief-Dr-Jwing-Ming-Yang/978159439...
4,588007,Quipoppe Publications,Tim Hawcroft ; Nik Hynek ; Garry Cooper ;,"Proud Peacock Journal - A 6x9"" Size Journaling...","Illustrations, black and white",full/a/f/8/af8d1a39061669137488ab6f0de4590a31a...,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,1985202581,9781985202580,en,2018-02-08 00:00:00,Proud Peacock Lined Journal : Medium Lined Jou...,/Proud-Peacock-Lined-Journal-Quipoppe-Publicat...


In [34]:
descr = df.description.tolist()
title = df.title.tolist()
print(len(descr),len(title),descr[0])

50000 50000 Emanating from an international conference on migration and development convened by the Institute of African Studies at the University of Ghana, Legon, the UNDP and the Royal Netherlands Embassy, this collection of papers considers topics such as: patterns of migration in West Africa; the Dutch perspective on contemporary migration; the macroeconomic impact of remittances; the impact of the brain drain on the health and higher education sectors in Ghana; the religious dimension of migration; and the role of diaspora-based organisations in socio-economic development.


# Semantic vectorisation using pre-trained model

In [35]:
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-1.2.0.tar.gz (81 kB)
[K     |████████████████████████████████| 81 kB 942 kB/s eta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-1.2.0-py3-none-any.whl size=123337 sha256=935f43e14d498b0ff65fced7a559a5f1d0e891b269c199cb050389bccbc9e6f5
  Stored in directory: /root/.cache/pip/wheels/5a/34/6c/17406cadd88634de11a062015d04d1de556b45c9921752805a
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-1.2.0


In [36]:
import torch
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
embeddings = model.encode(descr, convert_to_tensor=False,convert_to_numpy=True )

  0%|          | 0.00/306M [00:00<?, ?B/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

In [37]:
embeddings.shape

(50000, 768)

In [39]:
#Save model and use it for inference
np.savez_compressed("Embeddings.npz",embeddings)

In [40]:
%cd /kaggle/working
from IPython.display import FileLink
FileLink(r"Embeddings.npz")

/kaggle/working


In [41]:
cosine_scores = util.pytorch_cos_sim(embeddings, torch.tensor(embeddings[0]))
cosine_scores[:10]

tensor([[1.0000],
        [0.2538],
        [0.1824],
        [0.0438],
        [0.0022],
        [0.1884],
        [0.0654],
        [0.0937],
        [0.1780],
        [0.0455]])

In [44]:
#Get the most similar storylines for the 1st 20
cosine_scores = util.pytorch_cos_sim(embeddings, embeddings[:20])
cos = cosine_scores.T
print(cos)
# Zero out the diagonal 1s to prevent self-match
ind = np.diag_indices(cos.shape[0])
cos[ind[0], ind[1]] = torch.zeros(cos.shape[0]).to(device='cpu')
print(cos)
inds = torch.argmax(cos,dim=1)
inds

tensor([[ 1.0000,  0.2538,  0.1824,  ...,  0.0858,  0.1929,  0.2478],
        [ 0.2538,  1.0000,  0.2913,  ...,  0.1557,  0.3298,  0.3814],
        [ 0.1824,  0.2913,  1.0000,  ...,  0.0972,  0.1357,  0.1555],
        ...,
        [ 0.1784,  0.1453,  0.2574,  ..., -0.0151,  0.2514,  0.2216],
        [ 0.1326,  0.0275, -0.0326,  ...,  0.0591,  0.2050,  0.0987],
        [ 0.2356,  0.2278,  0.2725,  ...,  0.1210,  0.1348,  0.2526]])
tensor([[ 0.0000,  0.2538,  0.1824,  ...,  0.0858,  0.1929,  0.2478],
        [ 0.2538,  0.0000,  0.2913,  ...,  0.1557,  0.3298,  0.3814],
        [ 0.1824,  0.2913,  0.0000,  ...,  0.0972,  0.1357,  0.1555],
        ...,
        [ 0.1784,  0.1453,  0.2574,  ..., -0.0151,  0.2514,  0.2216],
        [ 0.1326,  0.0275, -0.0326,  ...,  0.0591,  0.2050,  0.0987],
        [ 0.2356,  0.2278,  0.2725,  ...,  0.1210,  0.1348,  0.2526]])


tensor([15176, 15029, 31362, 25352, 32079, 36670, 33887, 34261, 38733, 45555,
        37733, 14987, 44222, 30976, 35789, 14555,  7849, 22298, 27696, 37381])

In [45]:
# print results
a=0
for x in inds.tolist():
    print(title[a],"--",descr[a],"\n == SIMILAR TO == \n",title[x],"--",descr[x],"\n\n\n\n")
    a+=1
    

At Home in the World : International Migration and Development in Contemporary Ghana and West Africa -- Emanating from an international conference on migration and development convened by the Institute of African Studies at the University of Ghana, Legon, the UNDP and the Royal Netherlands Embassy, this collection of papers considers topics such as: patterns of migration in West Africa; the Dutch perspective on contemporary migration; the macroeconomic impact of remittances; the impact of the brain drain on the health and higher education sectors in Ghana; the religious dimension of migration; and the role of diaspora-based organisations in socio-economic development. 
 == SIMILAR TO == 
 The African Christian and Islam -- During the summer of 2010 Ghana played host to the first ever conference held within Africa to focus solely on the relationship of the African Christian and Islam. The event was led by John Azumah in partnership with the Center of Early African Theology. The conferen

# Small EDA

In [None]:
#Reload data
df = pd.read_csv(r"/kaggle/input/book-depository-dataset/dataset.csv")

In [None]:
df.drop(['dimension-x', 'dimension-y','dimension-z', 'edition', 'edition-statement','isbn10',
         'isbn13','publication-place','url','weight','format','illustrations-note','imprint',
        'index-date','image-checksum','image-path','image-url','for-ages','id'], axis = 1,inplace = True)
#for-ages very sparse 93% nan values
df.head(3)

In [None]:
#Extract Year of Pub alone
df['publication-date'] = df['publication-date'].apply(lambda x:str(x).split("-")[0])
print(df['publication-date'][:5])

In [None]:
df.isnull().sum()/len(df)

In [None]:
print(len(df))
df = df.dropna()
print(len(df))

In [None]:
df.head(5)

In [None]:
df = df.sort_values("rating-avg",ascending=False)
df.head(5)

In [None]:
print(len(df))
df.dropna(subset=["description","lang","publication-date"],inplace=True)
print(len(df))
print(df.isnull().sum()/len(df))

In [None]:
# Final features: authors, categories, descr, lang, publication-date, title
df.head(3)

In [None]:
df['lang'].value_counts()

In [None]:
#Convert langs to unique IDs
langs = df.lang.unique()
print(len(langs))
lang_dict = { langs[x] : x+1 for x in range(len(langs))}
print(lang_dict)
df.lang = df.lang.apply(lambda x:lang_dict[x])


In [None]:
#Convert year of publication to unique IDs
dates = df["publication-date"].unique()
print(len(dates))
dates_dict = { dates[x] : x+1 for x in range(len(dates))}
print(dates_dict)
df["publication-date"] = df["publication-date"].apply(lambda x:dates_dict[x])

In [None]:
print(df["rating-avg"].describe())
print(df["rating-count"].describe())