![SolidQ](https://www.solidq.com/wp-content/uploads/2015/06/Logo-SolidQ-Web.gif)

# Collaborative-Filtering utilizando K-NN

In [25]:

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD

book = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")

book.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL','Nousada1','Nousada2','Nousada3']
user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")

user.columns = ['userID', 'Location', 'Age']
rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")

rating.columns = ['userID', 'ISBN', 'bookRating']

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
rating.head()

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [0]:
user.head()

In [0]:
book.head()

In [27]:
combine_book_rating = pd.merge(rating, book, on='ISBN')
columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']
combine_book_rating = combine_book_rating.drop(columns, axis=1)
combine_book_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,Nousada1,Nousada2,Nousada3
0,276725,034545104X,0,Flesh Tones: A Novel,,,
1,2313,034545104X,5,Flesh Tones: A Novel,,,
2,6543,034545104X,0,Flesh Tones: A Novel,,,
3,8680,034545104X,5,Flesh Tones: A Novel,,,
4,10314,034545104X,9,Flesh Tones: A Novel,,,


### Filtrar solo los libros populares

Quitar filas donde no hay título

In [0]:
combine_book_rating = combine_book_rating.dropna(axis = 0, subset = ['bookTitle'])

In [0]:
book_ratingCount = (combine_book_rating.
     groupby(by = ['bookTitle'])['bookRating'].
     count().
     reset_index().
     rename(columns = {'bookRating': 'totalRatingCount'})
     [['bookTitle', 'totalRatingCount']]
    )
book_ratingCount.head()

Unnamed: 0,bookTitle,totalRatingCount
0,Earth Prayers From around the World: 365 Pray...,10
1,Final Fantasy Anthology: Official Strategy Gu...,4
2,Flight of Fancy: American Heiresses (Zebra Ba...,2
3,Little Comic Shop of Horrors (Give Yourself G...,4
4,Mystery Mile,2


#### Ahora podemos hacer un join de la cuenta de total rating en los datos de rating, lo que nos permitirá filtrar fácilmente los menos conocidos

In [0]:
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,Nousada1,Nousada2,Nousada3,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,,,,60
1,2313,034545104X,5,Flesh Tones: A Novel,,,,60
2,6543,034545104X,0,Flesh Tones: A Novel,,,,60
3,8680,034545104X,5,Flesh Tones: A Novel,,,,60
4,10314,034545104X,9,Flesh Tones: A Novel,,,,60


In [0]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingCount'].describe())

count   68765.000
mean        2.863
std         9.681
min         1.000
25%         1.000
50%         1.000
75%         2.000
max       833.000
Name: totalRatingCount, dtype: float64


#### El libro medio ha sido valorado solo una vez. Veamos la distribución

In [0]:
print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

0.900    5.000
0.910    5.000
0.920    6.000
0.930    6.000
0.940    7.000
0.950    8.000
0.960   10.000
0.970   12.000
0.980   16.000
0.990   26.000
Name: totalRatingCount, dtype: float64


#### Parece que casi un 1% de los libros tienen 28 ratings, 2% tiene 17 ratings. Como tenemos un montón de libros en nuestros datos, nos vamos a limitar a ese 1%  

In [0]:
popularity_threshold = 25
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,Nousada1,Nousada2,Nousada3,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,,,,60
1,2313,034545104X,5,Flesh Tones: A Novel,,,,60
2,6543,034545104X,0,Flesh Tones: A Novel,,,,60
3,8680,034545104X,5,Flesh Tones: A Novel,,,,60
4,10314,034545104X,9,Flesh Tones: A Novel,,,,60


#### Vamos a filtrar usuarios solo de USA y Canada

In [0]:
combined = rating_popular_book.merge(user, left_on = 'userID', right_on = 'userID', how = 'left')

us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
us_canada_user_rating=us_canada_user_rating.drop('Age', axis=1)
us_canada_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,Nousada1,Nousada2,Nousada3,totalRatingCount,Location
0,276725,034545104X,0,Flesh Tones: A Novel,,,,60,"tyler, texas, usa"
1,2313,034545104X,5,Flesh Tones: A Novel,,,,60,"cincinnati, ohio, usa"
2,6543,034545104X,0,Flesh Tones: A Novel,,,,60,"strafford, missouri, usa"
3,8680,034545104X,5,Flesh Tones: A Novel,,,,60,"st. charles county, missouri, usa"
4,10314,034545104X,9,Flesh Tones: A Novel,,,,60,"beaverton, oregon, usa"


In [0]:
if not us_canada_user_rating[us_canada_user_rating.duplicated(['userID', 'bookTitle'])].empty:
    initial_rows = us_canada_user_rating.shape[0]

    print('Initial dataframe shape {0}'.format(us_canada_user_rating.shape))
    us_canada_user_rating = us_canada_user_rating.drop_duplicates(['userID', 'bookTitle'])
    current_rows = us_canada_user_rating.shape[0]
    print('New dataframe shape {0}'.format(us_canada_user_rating.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (37247, 9)
New dataframe shape (37162, 9)
Removed 85 rows


In [0]:
us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

In [0]:
us_canada_user_rating_pivot.head()

userID,8,44,67,69,75,99,207,228,243,244,254,256,383,387,388,408,424,446,487,496,503,505,507,619,638,657,660,709,728,735,741,744,757,765,774,778,805,843,882,902,...,278002,278007,278014,278026,278075,278111,278119,278137,278144,278160,278162,278188,278194,278202,278216,278238,278251,278255,278257,278274,278314,278346,278356,278390,278411,278418,278503,278506,278534,278535,278541,278582,278633,278641,278663,278694,278832,278843,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
09-nov,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204 Rosewood Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
311 Pelican Court,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Entrenando el Algoritmo

In [0]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [0]:
#obtengo los vecinos
model_knn.kneighbors()

(array([[0.90094639, 0.901988  , 0.91350922, 0.91699029, 0.91963305],
        [0.64891821, 0.69722747, 0.72788091, 0.75358856, 0.76788035],
        [0.42653766, 0.44184369, 0.75086356, 0.81091644, 0.88873616],
        ...,
        [0.88380377, 0.88798013, 0.90284758, 0.91281313, 0.9128517 ],
        [0.83473218, 0.85326896, 0.87730241, 0.89663126, 0.89738687],
        [0.89123168, 0.91484634, 0.91737242, 0.92092217, 0.92320683]]),
 array([[ 76, 410, 278, 333, 396],
        [132,   4, 277,   3, 249],
        [313, 307, 120, 119, 596],
        ...,
        [240, 662, 567, 458,  65],
        [ 37, 398, 568,  46, 632],
        [406, 527, 198, 299, 687]]))

In [0]:
model_knn.effective_metric_

'cosine'

In [0]:
#Obtengo una observación al azar
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])


In [0]:
query_index

289

In [0]:
us_canada_user_rating_pivot.iloc[query_index, :]
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1,-1), n_neighbors = 6) 

## Obteniendo recomendaciones

In [0]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for In the Fall (Vintage Contemporaries (Paperback)):

1: Anger: Wisdom for Cooling the Flames, with distance of 0.8623912930687566:
2: Anna Karenina (Wordsworth Classics), with distance of 0.8701082935203155:
3: Angela's Ashes: A Memoir, with distance of 0.9108255382247769:
4: Forbidden Fruit, with distance of 0.9121921459063264:
5: Dead Run, with distance of 0.9198215245185651:


#Otro ejemplo de música

https://beckernick.github.io/music_recommender/ 