In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

import warnings
warnings.filterwarnings('ignore')


In [3]:
book_data=pd.read_csv("book_data1.csv",encoding='latin1')
book_data

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [4]:
book_data.shape

(10000, 4)

In [5]:

book_data.isna().sum()

Unnamed: 0     0
User.ID        0
Book.Title     0
Book.Rating    0
dtype: int64

In [6]:
book_data.dtypes


Unnamed: 0      int64
User.ID         int64
Book.Title     object
Book.Rating     int64
dtype: object

In [7]:
book_data['User.ID'].nunique()

2182

In [8]:
book_data['User.ID'].unique()

array([276726, 276729, 276736, ..., 162113, 162121, 162129], dtype=int64)

In [9]:
book_data['Book.Title'].nunique()

9659

In [10]:
book_data['Book.Title'].unique()

array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
       'How to Flirt: A Practical Guide', 'Twilight',
       'Kids Say the Darndest Things'], dtype=object)

In [11]:
book_data['Book.Title'].value_counts()

Fahrenheit 451                                              5
Charlie and the Chocolate Factory                           4
The Subtle Knife (His Dark Materials, Book 2)               4
Vanished                                                    4
Ender's Game (Ender Wiggins Saga (Paperback))               4
                                                           ..
Murder on St. Mark's Place (Gaslight Mysteries)             1
State of Grace                                              1
Valsalva's Maneuver: Mots Justes and Indispensable Terms    1
I love you, I hate you                                      1
Kids Say the Darndest Things                                1
Name: Book.Title, Length: 9659, dtype: int64

UBCF::::::

#metric 1: Correlation Metrics

In [12]:
ubcf_data=pd.pivot_table(data=book_data,values='Book.Rating',index='Book.Title',columns='User.ID').fillna(0)
ubcf_data.columns=book_data['User.ID'].unique()
ubcf_data

Unnamed: 0_level_0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
Book.Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Jason, Madison &amp",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Other Stories;Merril;1985;McClelland &amp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Repairing PC Drives &amp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\Well, there's your problem\: Cartoons",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iI Paradiso Degli Orchi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
stardust,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
ubcf_data.corr()

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,1.000000,-0.000272,-0.000272,-0.000272,-0.000469,-0.000272,-0.000523,-0.000272,-0.000272,-0.000384,...,-0.000272,-0.000471,-0.000272,-0.001203,-0.000469,-0.000272,-0.000272,-0.000991,-0.000272,-0.000663
276729,-0.000272,1.000000,-0.000104,-0.000104,-0.000179,-0.000104,-0.000199,-0.000104,-0.000104,-0.000146,...,-0.000104,-0.000179,-0.000104,-0.000458,-0.000179,-0.000104,-0.000104,-0.000378,-0.000104,-0.000253
276736,-0.000272,-0.000104,1.000000,-0.000104,-0.000179,-0.000104,-0.000199,-0.000104,-0.000104,-0.000146,...,-0.000104,-0.000179,-0.000104,-0.000458,-0.000179,-0.000104,-0.000104,-0.000378,-0.000104,-0.000253
276737,-0.000272,-0.000104,-0.000104,1.000000,-0.000179,-0.000104,-0.000199,-0.000104,-0.000104,-0.000146,...,-0.000104,-0.000179,-0.000104,-0.000458,-0.000179,-0.000104,-0.000104,-0.000378,-0.000104,-0.000253
276744,-0.000469,-0.000179,-0.000179,-0.000179,1.000000,-0.000179,-0.000344,-0.000179,-0.000179,-0.000252,...,-0.000179,-0.000309,-0.000179,-0.000791,-0.000309,-0.000179,-0.000179,-0.000652,-0.000179,-0.000436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,-0.000272,-0.000104,-0.000104,-0.000104,-0.000179,-0.000104,-0.000199,-0.000104,-0.000104,-0.000146,...,-0.000104,-0.000179,-0.000104,-0.000458,-0.000179,1.000000,-0.000104,-0.000378,-0.000104,-0.000253
162109,-0.000272,-0.000104,-0.000104,-0.000104,-0.000179,-0.000104,-0.000199,-0.000104,-0.000104,-0.000146,...,-0.000104,-0.000179,-0.000104,-0.000458,-0.000179,-0.000104,1.000000,-0.000378,-0.000104,-0.000253
162113,-0.000991,-0.000378,-0.000378,-0.000378,-0.000652,-0.000378,-0.000727,-0.000378,-0.000378,-0.000533,...,-0.000378,-0.000654,-0.000378,-0.001673,-0.000653,-0.000378,-0.000378,1.000000,-0.000378,-0.000922
162121,-0.000272,-0.000104,-0.000104,-0.000104,-0.000179,-0.000104,-0.000199,-0.000104,-0.000104,-0.000146,...,-0.000104,-0.000179,-0.000104,-0.000458,-0.000179,-0.000104,-0.000104,-0.000378,1.000000,-0.000253


In [14]:
book_data[book_data['User.ID']==276736]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8


In [15]:
book_data[book_data['User.ID']==162121]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
9990,9991,162121,The Cloister Walk,7
9991,9992,162121,Open Water,5
9992,9993,162121,The Evolution of Jane,8
9993,9994,162121,AT PARADISE GATE,8
9994,9995,162121,I Should Have Stayed Home: The Worst Trips of ...,8
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [16]:
book_data[book_data['User.ID']==276744]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
5,6,276744,The Kitchen God's Wife,7


In [17]:
book_data[book_data['User.ID']==162129]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
9999,10000,162129,Kids Say the Darndest Things,6


In [18]:
book_data[(book_data['User.ID']==162129) | (book_data['User.ID']==276744) | (book_data['User.ID']==162121)|(book_data['User.ID']==276736)]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
5,6,276744,The Kitchen God's Wife,7
9990,9991,162121,The Cloister Walk,7
9991,9992,162121,Open Water,5
9992,9993,162121,The Evolution of Jane,8
9993,9994,162121,AT PARADISE GATE,8
9994,9995,162121,I Should Have Stayed Home: The Worst Trips of ...,8
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7


METRIC 2: EUCLIDEON DISTANCE...

In [19]:
    from sklearn.metrics import pairwise_distances

In [20]:
ubcf_eucld=pd.pivot_table(data=book_data,values='Book.Rating',index='User.ID',columns='Book.Title').fillna(0)
ubcf_eucld.index=book_data['User.ID'].unique()
ubcf_eucld

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
276726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
162121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
pairwise_distances( X=ubcf_eucld, Y=None,metric='euclidean',)

array([[ 0.        , 16.03121954, 16.03121954, ..., 29.05167809,
        16.88194302, 22.737634  ],
       [16.03121954,  0.        ,  8.48528137, ..., 25.67099531,
        10.        , 18.22086716],
       [16.03121954,  8.48528137,  0.        , ..., 25.67099531,
        10.        , 18.22086716],
       ...,
       [29.05167809, 25.67099531, 25.67099531, ...,  0.        ,
        26.21068484, 30.31501278],
       [16.88194302, 10.        , 10.        , ..., 26.21068484,
         0.        , 18.97366596],
       [22.737634  , 18.22086716, 18.22086716, ..., 30.31501278,
        18.97366596,  0.        ]])

In [22]:
pd.DataFrame(data=pairwise_distances( X=ubcf_eucld, Y=None,metric='euclidean',))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,2179,2180,2181
0,0.000000,16.031220,16.031220,17.916473,17.521415,17.378147,18.439089,16.431677,16.431677,20.049938,...,17.916473,22.825424,16.881943,37.854986,18.493242,16.881943,17.378147,29.051678,16.881943,22.737634
1,16.031220,0.000000,8.485281,11.661904,11.045361,10.816654,12.449900,9.219544,9.219544,14.730920,...,11.661904,18.330303,10.000000,35.327043,12.529964,10.000000,10.816654,25.670995,10.000000,18.220867
2,16.031220,8.485281,0.000000,11.661904,11.045361,10.816654,12.449900,9.219544,9.219544,14.730920,...,11.661904,18.330303,10.000000,35.327043,12.529964,10.000000,10.816654,25.670995,10.000000,18.220867
3,17.916473,11.661904,11.661904,0.000000,13.638182,13.453624,14.798649,12.206556,12.206556,16.763055,...,14.142136,20.000000,12.806248,36.221541,14.866069,12.806248,13.453624,26.888659,12.806248,19.899749
4,17.521415,11.045361,11.045361,13.638182,0.000000,12.922848,14.317821,11.618950,11.618950,16.340135,...,13.638182,19.646883,12.247449,36.027767,14.387495,12.247449,12.922848,26.627054,12.247449,19.544820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,16.881943,10.000000,10.000000,12.806248,12.247449,12.041595,13.527749,10.630146,10.630146,15.652476,...,12.806248,19.078784,11.313708,35.721142,13.601471,0.000000,12.041595,26.210685,11.313708,18.973666
2178,17.378147,10.816654,10.816654,13.453624,12.922848,12.727922,14.142136,11.401754,11.401754,16.186414,...,13.453624,19.519221,12.041595,35.958309,14.212670,12.041595,0.000000,26.532998,12.041595,19.416488
2179,29.051678,25.670995,25.670995,26.888659,26.627054,26.532998,27.239677,25.922963,25.922963,28.354894,...,26.888659,30.380915,26.210685,42.836900,27.276363,26.210685,26.532998,0.000000,26.210685,30.315013
2180,16.881943,10.000000,10.000000,12.806248,12.247449,12.041595,13.527749,10.630146,10.630146,15.652476,...,12.806248,19.078784,11.313708,35.721142,13.601471,11.313708,12.041595,26.210685,0.000000,18.973666


In [23]:
ubcf_eucld_distance=pd.DataFrame(data=pairwise_distances( X=ubcf_eucld, Y=None,metric='euclidean',))
ubcf_eucld_distance.index=book_data['User.ID'].unique()
ubcf_eucld_distance.columns=book_data['User.ID'].unique()
ubcf_eucld_distance.round(2)

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,0.00,16.03,16.03,17.92,17.52,17.38,18.44,16.43,16.43,20.05,...,17.92,22.83,16.88,37.85,18.49,16.88,17.38,29.05,16.88,22.74
276729,16.03,0.00,8.49,11.66,11.05,10.82,12.45,9.22,9.22,14.73,...,11.66,18.33,10.00,35.33,12.53,10.00,10.82,25.67,10.00,18.22
276736,16.03,8.49,0.00,11.66,11.05,10.82,12.45,9.22,9.22,14.73,...,11.66,18.33,10.00,35.33,12.53,10.00,10.82,25.67,10.00,18.22
276737,17.92,11.66,11.66,0.00,13.64,13.45,14.80,12.21,12.21,16.76,...,14.14,20.00,12.81,36.22,14.87,12.81,13.45,26.89,12.81,19.90
276744,17.52,11.05,11.05,13.64,0.00,12.92,14.32,11.62,11.62,16.34,...,13.64,19.65,12.25,36.03,14.39,12.25,12.92,26.63,12.25,19.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,16.88,10.00,10.00,12.81,12.25,12.04,13.53,10.63,10.63,15.65,...,12.81,19.08,11.31,35.72,13.60,0.00,12.04,26.21,11.31,18.97
162109,17.38,10.82,10.82,13.45,12.92,12.73,14.14,11.40,11.40,16.19,...,13.45,19.52,12.04,35.96,14.21,12.04,0.00,26.53,12.04,19.42
162113,29.05,25.67,25.67,26.89,26.63,26.53,27.24,25.92,25.92,28.35,...,26.89,30.38,26.21,42.84,27.28,26.21,26.53,0.00,26.21,30.32
162121,16.88,10.00,10.00,12.81,12.25,12.04,13.53,10.63,10.63,15.65,...,12.81,19.08,11.31,35.72,13.60,11.31,12.04,26.21,0.00,18.97


    METRICS:3 ...Cosine Distance....

In [28]:
 1-pairwise_distances( X=ubcf_eucld_distance, Y=None,metric='cosine')

array([[1.        , 0.97612228, 0.97612228, ..., 0.98618013, 0.98586671,
        0.9984766 ],
       [0.97612228, 1.        , 0.9998947 , ..., 0.92821916, 0.99854212,
        0.96502312],
       [0.97612228, 0.9998947 , 1.        , ..., 0.92821916, 0.99854212,
        0.96502312],
       ...,
       [0.98618013, 0.92821916, 0.92821916, ..., 1.        , 0.9458571 ,
        0.99263614],
       [0.98586671, 0.99854212, 0.99854212, ..., 0.9458571 , 1.        ,
        0.97706997],
       [0.9984766 , 0.96502312, 0.96502312, ..., 0.99263614, 0.97706997,
        1.        ]])

In [29]:
cosine_distance=pd.DataFrame(data=1-pairwise_distances( X=ubcf_eucld_distance, Y=None,metric='cosine',))
cosine_distance.index=book_data['User.ID'].unique()
cosine_distance.columns=book_data['User.ID'].unique()
cosine_distance.round(2)

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,1.00,0.98,0.98,0.99,0.99,0.99,1.00,0.98,0.98,1.00,...,0.99,1.00,0.99,0.97,1.00,0.99,0.99,0.99,0.99,1.00
276729,0.98,1.00,1.00,0.99,1.00,1.00,0.99,1.00,1.00,0.98,...,0.99,0.96,1.00,0.89,0.99,1.00,1.00,0.93,1.00,0.97
276736,0.98,1.00,1.00,0.99,1.00,1.00,0.99,1.00,1.00,0.98,...,0.99,0.96,1.00,0.89,0.99,1.00,1.00,0.93,1.00,0.97
276737,0.99,0.99,0.99,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,0.99,1.00,0.93,1.00,1.00,1.00,0.96,1.00,0.99
276744,0.99,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,...,1.00,0.98,1.00,0.93,1.00,1.00,1.00,0.96,1.00,0.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,0.99,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,...,1.00,0.98,1.00,0.91,1.00,1.00,1.00,0.95,1.00,0.98
162109,0.99,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,...,1.00,0.98,1.00,0.92,1.00,1.00,1.00,0.95,1.00,0.98
162113,0.99,0.93,0.93,0.96,0.96,0.95,0.97,0.94,0.94,0.98,...,0.96,0.99,0.95,1.00,0.97,0.95,0.95,1.00,0.95,0.99
162121,0.99,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,...,1.00,0.98,1.00,0.91,1.00,1.00,1.00,0.95,1.00,0.98


# The End.........