In [157]:
import os
import pandas as pd
import numpy as np

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [158]:
path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/raw/")

In [159]:
os.listdir(path)

['BX-NewBooksUsers.csv',
 'BX-Books.csv',
 'BX-Ratings.csv',
 'BX-Users.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooks.csv']

In [198]:
users = pd.read_csv(path + 'BX-Users.csv')
ratings = pd.read_csv(path + 'BX-Ratings.csv')
books = pd.read_csv(path + 'BX-Books.csv')

In [209]:
ratings_user = ratings.merge(users)
ratings_user = ratings_user[['User-ID', 'ISBN', 'Book-Rating', 'User-Age']]

In [210]:
ratings_user

Unnamed: 0,User-ID,ISBN,Book-Rating,User-Age
0,276744,038550120X,7,
1,276754,0684867621,8,19
2,276755,0451166892,5,32
3,276762,0380711524,5,25
4,276772,0553572369,7,
...,...,...,...,...
204159,276688,0892966548,10,
204160,276688,1551669315,6,
204161,276704,0345386108,6,
204162,276704,0743211383,7,


In [211]:
books_read = ratings_user.groupby(['User-ID'])["ISBN"].count().reset_index()
books_read[books_read['ISBN'] > 25]

Unnamed: 0,User-ID,ISBN
40,254,37
81,507,28
97,638,45
148,882,30
252,1424,34
...,...,...
47894,276680,64
48014,277427,87
48143,278026,36
48215,278418,37


In [212]:
ratings_user = ratings_user[ratings_user['User-ID'].isin(books_read[books_read['ISBN'] > 20]['User-ID'])]

In [213]:
ratings_user = pd.pivot(ratings_user, index = ['User-ID', 'User-Age'], values = 'Book-Rating', columns = 'ISBN').reset_index()

In [214]:
ratings_user = ratings_user.T.fillna(ratings_user[ratings_user.columns[2:]].mean(axis = 1)).T

  ratings_user = ratings_user.T.fillna(ratings_user[ratings_user.columns[2:]].mean(axis = 1)).T


In [215]:
ratings_user["User-Age"] = ratings_user["User-Age"].str.rstrip('"')
ratings_user['User-Age'] = ratings_user['User-Age'].astype('Int64')

In [216]:
features = ratings_user[ratings_user.columns[2:]]
target = ratings_user['User-Age']

In [217]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [218]:
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(features)

In [219]:
def impute_age(row):
    if pd.isna(row['User-Age']):
        # Find indices of k-nearest neighbors
        distances, indices = knn.kneighbors([features.iloc[row.name]])
        nearest_neighbors_ages = target.iloc[indices.flatten()].dropna()
        if not nearest_neighbors_ages.empty:
            imputed_age = nearest_neighbors_ages.median()
            return imputed_age
    return row['User-Age']

In [220]:
ratings_user['User-Age-Imputed'] = ratings_user.apply(impute_age, axis=1)



In [221]:
user_age = ratings_user[['User-ID', 'User-Age', 'User-Age-Imputed']]

In [222]:
l = list(user_age.iloc[knn.kneighbors([features.iloc[1]])[1][0]]['User-ID'])
l.append(user_age.iloc[1]['User-ID'])
l



[507.0, 61028, 182838, 18082, 81121.0, 507.0]

In [223]:
books_read[books_read['User-ID'].isin(l)]

Unnamed: 0,User-ID,ISBN
81,507,28
2930,18082,31
10630,61028,30
14017,81121,23
31682,182838,29


In [224]:
users[users['User-ID'].isin(l)]

Unnamed: 0,User-ID,User-City,User-State,User-Country,User-Age
81,507,dumas,arkansas,"usa""",
2930,18082,southern,california,"usa""",38.0
10630,61028,charleston,south carolina,"usa""",50.0
14017,81121,chesterfield,missouri,"usa""",
31682,182838,cumberland,maryland,"usa""",26.0


In [225]:
user_age[user_age['User-Age'].isna()]['User-Age-Imputed'].unique()

array([ 38. ,  25. ,  24. ,  37. ,  20. ,  39. ,  48. ,  43.5,  30. ,
        33. ,  45. ,  39.5,  28.5,  55. ,  36. ,  28. ,  34. ,  25.5,
        30.5,  46.5,  57. ,  49.5,  54.5,  29. ,  37.5,  49. ,  50. ,
        35.5,  21. ,  44.5,  36.5,  41.5,  32. , 101. ,  51. ,  35. ,
        41. ,  29.5,  38.5,  53. ,  27. ,  22. ,  40. ,  23. ,  31. ,
        32.5,  34.5,  33.5,  68.5,  42. ,  55.5,  42.5,  47. ,  51.5,
        19. ,  26.5,  18. ,  27.5,  23.5,  45.5,  44. ,  46. ,  22.5])

In [226]:
book_ratings = ratings.merge(books)

In [227]:
book_ratings = book_ratings[book_ratings['User-ID'].isin(l)].sort_values('Book-Author')

In [228]:
book_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher
137232,182838,0553579002,8,"Solo Command (Star Wars: X-Wing Series, Book 7)",Aaron Allston,1999,Bantam
61737,81121,0060830956,9,Brave New World,Aldous Huxley,1984,HarperCollins Publishers
16787,18082,0425117383,8,At Risk,Alice Hoffman,1989,Berkley Publishing Group
137210,182838,0345409329,8,"Blood and Gold (Rice, Anne, Vampire Chronicles.)",Anne Rice,2002,Ballantine Books
16792,18082,0446604275,8,Primary Colors: A Novel of Politics,Anonymous,1996,Warner Books
...,...,...,...,...,...,...,...
61746,81121,0441117732,8,Count Zero (Ace Science Fiction),William Gibson,1987,Ace Books
61749,81121,0441569595,10,Neuromancer (Remembering Tomorrow),William Gibson,1995,Ace Books
61751,81121,0553281747,8,Mona Lisa Overdrive (Bantam Spectra Book),William Gibson,1989,Bantam
61758,81121,0812694333,9,The Simpsons and Philosophy: The D'oh! of Homer,William Irwin,2001,Open Court Publishing Company


In [229]:
book_ratings[book_ratings['Book-Title'].duplicated()]

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher
16807,18082,0767912918,9,Passing for Thin : Losing Half My Weight and F...,FRANCES KUFFEL,2004,Broadway
1435,507,055357695X,7,Out of the Shadows (Shadows Trilogy (Paperback)),Kay Hooper,2000,Bantam Books


In [230]:
book_ratings[book_ratings['ISBN'] == '0767912918']

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher
1446,507,767912918,7,Passing for Thin : Losing Half My Weight and F...,FRANCES KUFFEL,2004,Broadway
16807,18082,767912918,9,Passing for Thin : Losing Half My Weight and F...,FRANCES KUFFEL,2004,Broadway


In [231]:
book_ratings[book_ratings['Book-Title'] == 'The King of Torts']

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher


In [234]:
a = ratings_user[ratings_user['User-ID'].isin(l)].T

In [248]:
a[347]['0002005018']

8.129032258064516

In [244]:
a[a[1] != 8.142857142857142]
a[a[1] != 8.129032258064516]

Unnamed: 0_level_0,1,92,347,464,1081
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
User-ID,507.0,18082.0,61028.0,81121.0,182838.0
User-Age,,38.0,50.0,,26.0
0020427859,9.0,8.129032,8.133333,8.173913,8.137931
0060558865,8.0,8.129032,8.133333,8.173913,8.137931
0140230165,7.0,8.129032,8.133333,8.173913,8.137931
0142001740,10.0,8.129032,8.133333,8.173913,8.137931
0312986769,9.0,8.129032,8.133333,8.173913,8.137931
0345369068,8.0,8.129032,8.133333,8.173913,8.137931
0425168948,7.0,8.129032,8.133333,8.173913,8.137931
0451184718,9.0,8.129032,8.133333,8.173913,8.137931


In [242]:
a[1]['0002005018']

8.142857142857142