In [1]:
import pandas as pd
import numpy as np
# pipeline, sparse matrix and nearest-neighbour model
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
#preprocessors
from sklearn.base import BaseEstimator, TransformerMixin
from feature_engine.selection import DropFeatures

In [16]:
# Load the ratings export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave a column with mixed
# types and emits a DtypeWarning (NOTE(review): the recorded run shows a
# warning pointing at this read_csv call - confirm it was the dtype warning).
data = pd.read_csv('train.csv', low_memory=False)
# Overwrite the header with the column names the pipeline steps refer to.
data.columns = ['userID', 'ISBN', 'bookRating', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL', 'Location', 'Age']
data.head(20)

  data = pd.read_csv('train.csv')


Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,Location,Age
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"tyler, texas, usa",
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"cincinnati, ohio, usa",23.0
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"strafford, missouri, usa",34.0
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"st. charles county, missouri, usa",2.0
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"beaverton, oregon, usa",
5,23768,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"st. louis, missouri, usa",45.0
6,28266,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"portland, oregon, usa",
7,28523,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"springfield, missouri, usa",24.0
8,39002,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"san jose, ,",
9,50403,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"conway, arkansas, usa",


In [9]:
class NullVariableTransformer(BaseEstimator, TransformerMixin):
    """Drop repeated (user, subject) rows and rows that have no subject.

    Parameters
    ----------
    subject : str
        Name of the column identifying the rated item.
    user : str
        Name of the column identifying the user.
    rating_variable : str
        Name of the rating column. Stored on the instance but not read by
        this step.
    """

    def __init__(self, subject, user, rating_variable):
        self.subject = subject
        self.user = user
        self.rating_variable = rating_variable

    def fit(self, X, y=None):
        """Do nothing and return ``self``; required by the Pipeline API."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the caller's frame is not modified.

        Rows sharing the same (user, subject) pair are reduced to their first
        occurrence, then rows whose subject is missing are removed.
        """
        # Work on a copy so that we do not over-write the original dataframe.
        X = X.copy()
        X = X.drop_duplicates([self.user, self.subject])
        # Pass the label inside a list: pandas releases older than 1.5 only
        # accept a list-like for `subset` and fail on a bare string.
        X = X.dropna(axis=0, subset=[self.subject])

        return X

In [10]:
class AddVariableTransformer(BaseEstimator, TransformerMixin):
    """Append a per-subject count of non-null ratings as a new column.

    Parameters
    ----------
    subject : str
        Name of the column the ratings are grouped by.
    user : str
        Name of the user column. Stored on the instance but not read here.
    rating_variable : str
        Name of the column whose non-null values are counted per subject.
    new_feature : str
        Name given to the resulting count column.
    """

    def __init__(self, subject, user, rating_variable, new_feature):
        self.subject = subject
        self.user = user
        self.rating_variable = rating_variable
        self.new_feature = new_feature

    def fit(self, X, y=None):
        """Do nothing and return ``self``; required by the Pipeline API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` left-joined with the per-subject counts."""
        # Work on a copy so the caller's dataframe is left untouched.
        frame = X.copy()

        # One row per subject holding the number of non-null ratings.
        rating_counts = frame.groupby(by=[self.subject])[self.rating_variable].count()
        per_subject = rating_counts.reset_index()
        per_subject = per_subject.rename(columns={self.rating_variable: self.new_feature})
        per_subject = per_subject[[self.subject, self.new_feature]]

        # Attach the count to every row carrying that subject.
        return frame.merge(
            per_subject,
            left_on=self.subject,
            right_on=self.subject,
            how='left',
        )

In [11]:
class RestrictVariablesTransformer(BaseEstimator, TransformerMixin):
    """Keep only rows that are popular enough and match a location pattern.

    Parameters
    ----------
    location : str
        Name of the text column searched for ``specific_location``.
    popularity_threshold : int or float
        Minimum value of ``new_feature`` (inclusive) a row needs to be kept.
    new_feature : str
        Name of the column compared against ``popularity_threshold``.
    specific_location : str
        Pattern handed to ``Series.str.contains`` (a regular expression by
        default, so ``"a|b"`` matches either alternative).
    """

    def __init__(self, location, popularity_threshold, new_feature, specific_location):
        self.location = location
        self.popularity_threshold = popularity_threshold
        self.new_feature = new_feature
        self.specific_location = specific_location

    def fit(self, X, y=None):
        """Do nothing and return ``self``; required by the Pipeline API."""
        return self

    def transform(self, X):
        """Return the filtered copy of ``X``; the caller's frame is not modified."""
        # Work on a copy so that we do not over-write the original dataframe.
        X = X.copy()
        X = X[X[self.new_feature] >= self.popularity_threshold]
        # na=False treats a missing location as "no match". Without it the
        # mask contains NaN and boolean indexing raises a ValueError.
        X = X[X[self.location].str.contains(self.specific_location, na=False)]
        return X

In [12]:
class PrepareVariablesTransformer(BaseEstimator, TransformerMixin):
    """Pivot long-format ratings into a subject x user table and sparsify it.

    Parameters
    ----------
    subject : str
        Name of the column that becomes the row index of the pivot table.
    user : str
        Name of the column whose values become the pivot table's columns.
    rating_variable : str
        Name of the column providing the cell values.

    The most recent pivot table is kept on the instance and can be read back
    with ``get_prepared_data``.
    """

    def __init__(self, subject, user, rating_variable):
        self.subject = subject
        self.user = user
        self.rating_variable = rating_variable
        # Filled in by transform(); None until then.
        self.X_pivot = None

    def fit(self, X, y=None):
        """Do nothing and return ``self``; required by the Pipeline API."""
        return self

    def transform(self, X):
        """Build the pivot table, remember it, and return it as a CSR matrix."""
        # Work on a copy so the caller's dataframe is left untouched.
        frame = X.copy()
        wide = frame.pivot(
            index=self.subject,
            columns=self.user,
            values=self.rating_variable,
        )
        # Missing (subject, user) combinations are stored as 0.
        self.X_pivot = wide.fillna(0)
        return csr_matrix(self.X_pivot.values)

    def get_prepared_data(self):
        """Return the pivot table stored by the last ``transform`` call."""
        return self.X_pivot

In [13]:
# Columns handed to the DropFeatures step of the pipeline.
features_to_drop = [
    'yearOfPublication',
    'publisher',
    'bookAuthor',
    'imageUrlS',
    'imageUrlM',
    'imageUrlL',
    'Age',
]

# Column names the custom transformers operate on.
user = 'userID'
subject = 'bookTitle'
rating_variable = 'bookRating'
location = 'Location'

# Name of the derived per-subject rating count and the minimum count to keep.
new_feature = 'totalRatingCount'
popularity_threshold = 50

# Pattern matched against the location column (either alternative matches).
specific_location = "usa|canada"

In [14]:
# Recommender pipeline: drop unused columns, clean the rows, add the
# per-subject rating count, filter on it and on location, pivot to a sparse
# matrix, then fit a brute-force cosine nearest-neighbours model on it.
book_pipe = Pipeline(steps=[
    ('drop_features', DropFeatures(features_to_drop=features_to_drop)),
    ('null_variable', NullVariableTransformer(
        subject=subject,
        user=user,
        rating_variable=rating_variable,
    )),
    ('add_variable', AddVariableTransformer(
        subject=subject,
        user=user,
        rating_variable=rating_variable,
        new_feature=new_feature,
    )),
    ('restrict_variable', RestrictVariablesTransformer(
        location=location,
        popularity_threshold=popularity_threshold,
        new_feature=new_feature,
        specific_location=specific_location,
    )),
    ('prepare', PrepareVariablesTransformer(
        subject=subject,
        user=user,
        rating_variable=rating_variable,
    )),
    ('knn', NearestNeighbors(metric='cosine', algorithm='brute')),
])

In [17]:
# Run every transformer step on `data` in order and fit the final 'knn' step
# on the sparse matrix returned by 'prepare'. As a side effect the 'prepare'
# step keeps the pivot table that later cells read via get_prepared_data().
book_pipe.fit(data)

Pipeline(steps=[('drop_features',
                 DropFeatures(features_to_drop=['yearOfPublication',
                                                'publisher', 'bookAuthor',
                                                'imageUrlS', 'imageUrlM',
                                                'imageUrlL', 'Age'])),
                ('null_variable',
                 NullVariableTransformer(rating_variable='bookRating',
                                         subject='bookTitle', user='userID')),
                ('add_variable',
                 AddVariableTransformer(new_feature='totalRatingCount',
                                        rating_variable='boo...
                                        subject='bookTitle', user='userID')),
                ('restrict_variable',
                 RestrictVariablesTransformer(location='Location',
                                              new_feature='totalRatingCount',
                                              popularity_thresh

In [27]:
# Ask the fitted pipeline for the titles closest to `input_data` and collect
# the answer in a plain dict.
results = {"preds": None, "dist": None, "version": '0.0.1', "errors": []}
preds = []
dist = []
input_data = '16 Lighthouse Road'
# Pivot table (titles as rows) kept by the 'prepare' step when it last ran.
matrix = book_pipe.named_steps['prepare'].get_prepared_data()

if input_data not in matrix.index:
    # An unknown title is reported through "errors" instead of surfacing as a
    # KeyError from matrix.loc.
    results["errors"].append(
        '{0} is not present in the prepared rating matrix'.format(input_data))
else:
    query_row = matrix.loc[input_data, :]
    distances, indices = book_pipe.named_steps['knn'].kneighbors(
        query_row.values.reshape(1, -1), n_neighbors=6)
    # Flatten once instead of on every loop iteration.
    flat_distances = distances.flatten()
    flat_indices = indices.flatten()

    preds.append('Recommendations for {0}:\n'.format(query_row.name))
    # Neighbour 0 is not listed (presumably it is the query title itself -
    # TODO confirm); the remaining neighbours are numbered from 1.
    for i in range(1, len(flat_distances)):
        dist.append('{0}: {1}, with distance of {2}:'.format(
            i, matrix.index[flat_indices[i]], flat_distances[i]))

# Fill the results dict
results["preds"] = preds
results["dist"] = dist
print(results)

{'preds': ['Recommendations for 16 Lighthouse Road:\n'], 'dist': ['1: Hurricane Bay, with distance of 0.7278809052310367:', '2: 204 Rosewood Lane, with distance of 0.7535885568384382:', '3: Dangerous, with distance of 0.7698546937657823:', '4: Macgregor Brides (Macgregors), with distance of 0.7713655197576696:', '5: Until You, with distance of 0.7733383455783607:'], 'version': '0.0.1', 'errors': []}


In [14]:
# Title stored in the row labelled 0 of the raw data.
data.loc[0, 'bookTitle']

'Flesh Tones: A Novel'

In [15]:
# Pivot table (one row per title, one column per user) that the 'prepare'
# step stored when the pipeline was fitted; displayed for inspection.
matrix = book_pipe.named_steps['prepare'].get_prepared_data()
matrix

userID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# The name of a row selected by label is the label itself.
matrix.loc['16 Lighthouse Road'].name

'16 Lighthouse Road'

In [38]:
# Second row of the pivot table, selected by position.
matrix.iloc[1]

userID
8         0.0
9         0.0
14        0.0
16        0.0
17        0.0
         ... 
278843    0.0
278844    0.0
278846    0.0
278851    0.0
278854    0.0
Name: 16 Lighthouse Road, Length: 39924, dtype: float64

In [23]:
# Shallow copy of `results`.
# NOTE(review): the ValueError recorded below this cell implies `results` was
# not a dict when the cell last ran; the recommendation cell now builds it as
# a dict, so the recorded output looks stale - re-run to confirm.
a = dict(results)

ValueError: dictionary update sequence element #0 has length 40; 2 is required

In [None]:
# Debug check of the type of `a`; only works once the cell that assigns `a`
# has completed without error.
print(type(a))