In [12]:
import pandas as pd
from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import RidgeCV, LinearRegression, SGDRegressor, Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from collections import defaultdict

In [5]:
# One-hot encode the categories. To vectorize them with DictVectorizer(),
# each column must first be transformed into a sequence of dictionaries.

class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    """Encode a column of label collections as per-row ``{label: weight}`` dicts.

    Each cell of ``X[col]`` is expected to be an iterable of hashable labels.
    Every truthy label becomes a key whose value is ``10**(-imp)``, so a larger
    ``imp`` gives the column a smaller weight. The output is a pandas Series of
    plain dicts, suitable as input to ``DictVectorizer``.

    Parameters
    ----------
    col : label
        Name of the column of ``X`` to encode.
    imp : number, default 0
        Negative base-10 exponent of the weight given to each label.
    """

    def __init__(self, col, imp = 0):
        self.col = col
        self.imp = imp

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.col]`` with every cell replaced by a label->weight dict.

        Cells that cannot be iterated (e.g. a float NaN or None) and cells that
        contain an unhashable label are encoded as an empty dict.
        """
        # The weight is the same for every row, so compute it once.
        weight = 10**(-self.imp)

        def to_dict(l):
            # A bare string is ONE label. Iterating it would emit one feature
            # per character (e.g. a "nan" fill value would become the labels
            # 'n' and 'a'), so wrap it before the loop.
            if isinstance(l, (str, bytes)):
                l = [l]
            try:
                return {x: weight for x in l if x}
            except TypeError:
                # Not iterable, or an element is unhashable.
                return {}

        return X[self.col].apply(to_dict)

In [6]:
class DictEncoder_l(base.BaseEstimator, base.TransformerMixin):
    """Encode a column of single labels as per-row ``{label: weight}`` dicts.

    Each cell of ``X[col]`` is treated as one label. A truthy cell becomes a
    one-entry mapping whose value is ``10**(-imp)``; a falsy cell becomes an
    empty mapping.

    Parameters
    ----------
    col : label
        Name of the column of ``X`` to encode.
    imp : number, default 0
        Negative base-10 exponent of the weight given to the label.
    """

    def __init__(self, col, imp=0):
        self.col = col
        self.imp = imp

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.col]`` with every cell replaced by its mapping."""

        def encode_cell(value):
            encoded = defaultdict(int)
            try:
                if not value:
                    return encoded
                encoded[value] = 10**(-self.imp)
            except TypeError:
                # Raised e.g. when the cell is unhashable and cannot be a key.
                return {}
            return encoded

        column = X[self.col]
        return column.apply(encode_cell)

# Scale the professional rating (1-100) to rating/100, multiplied by 10**imp
class DictEncoder_scale(base.BaseEstimator, base.TransformerMixin):
    """Encode a numeric score column as per-row ``{str(score): value}`` dicts.

    For a score ``s`` in ``X[col]``:

    * ``1 <= s <= 100`` -> ``{str(s): (s / 100.0) * 10**imp}``
    * ``s == 0``        -> ``{str(s): 1}``
    * anything else comparable to a number -> empty mapping
    * a cell whose comparison raises ``TypeError`` -> ``{}``

    Parameters
    ----------
    col : label
        Name of the column of ``X`` to encode.
    label : list, default []
        Stored on the instance; ``transform`` does not read it.
        NOTE(review): this is a mutable default shared by every instance
        created without an explicit ``label`` - do not modify it in place.
    imp : number, default 0
        Base-10 exponent applied to the scaled score.
        NOTE(review): DictEncoder and DictEncoder_l use ``10**(-imp)``, i.e. the
        opposite sign - confirm that the difference is intended.
    """

    def __init__(self, col, label = [], imp=0):
        self.col = col
        self.label = label
        self.imp = imp

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.col]`` with every cell replaced by its mapping."""

        def encode_score(score):
            encoded = defaultdict(int)
            try:
                in_range = 1 <= score <= 100
                if in_range:
                    encoded[str(score)] = (score/100.0)*10**self.imp
                elif score == 0:
                    encoded[str(score)] = 1
            except TypeError:
                # The cell cannot be compared with a number.
                return {}
            return encoded

        scores = X[self.col]
        return scores.apply(encode_score)


In [8]:
def unioned_features(weight = [0,0,0,0,0,0]):
    """Build the FeatureUnion that turns the wine table into one feature matrix.

    Six branches are created, each an ``encoder`` step followed by a
    ``DictVectorizer`` step. ``weight[k]`` is passed as ``imp`` to the encoder
    of the k-th branch, in this order: vineyard, varietal, wine type, region,
    attributes, rating.

    Parameters
    ----------
    weight : sequence of 6 numbers, default all zeros
        Per-branch ``imp`` values. The sequence is only read, never modified.

    Returns
    -------
    FeatureUnion
        Unfitted union with the branches named 'vineyard', 'varietal',
        'wineType', 'Region', 'Attr' and 'rating'.
    """
    # (branch name, encoder class, source column), listed in `weight` order.
    branch_specs = [
        ('vineyard', DictEncoder_l, 'Vineyard/Name'),
        ('varietal', DictEncoder_l, 'Varietal/Name'),
        ('wineType', DictEncoder_l, 'Varietal/WineType/Name'),
        ('Region', DictEncoder_l, 'Appellation/Region/Name'),
        ('Attr', DictEncoder, 'Attr'),
        ('rating', DictEncoder_scale, 'Ratings/HighestScore'),
    ]

    branches = []
    for position, (branch_name, encoder_cls, column) in enumerate(branch_specs):
        branch = Pipeline([('encoder', encoder_cls(column, imp = weight[position])),
                           ('vectorizer', DictVectorizer())])
        branches.append((branch_name, branch))

    return FeatureUnion(branches)

In [94]:
# content results
def content_recommend(wine_n, wine_id, df):
    """Content-based recommendation: nearest neighbours of the given wines.

    The frame is encoded with ``unioned_features()``, a ``NearestNeighbors``
    model is fitted on the result, and for every id in ``wine_id`` the
    ``wine_n`` closest rows are collected (the query wine itself is one of the
    fitted rows, so it normally appears among them).

    Parameters
    ----------
    wine_n : int
        Number of neighbours requested per query wine.
    wine_id : iterable
        Values of the ``"Id"`` column identifying the query wines.
    df : pandas.DataFrame
        Wine table; must contain ``"Id"`` and the columns read by
        ``unioned_features()``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Neighbour rows of all query wines, de-duplicated on ``"Id"``. Rows come
        from the copy in which missing values were replaced by the string
        ``"nan"``. Empty when ``wine_id`` is empty.

    Raises
    ------
    IndexError
        If an id in ``wine_id`` is not present in ``df["Id"]``.
    """
    # Missing values become the string "nan" before encoding.
    df = df.fillna(value = "nan")

    wine_id = list(wine_id)
    if not wine_id:
        # pd.concat([]) would raise; an empty result is the natural answer.
        return df.iloc[0:0]

    union = unioned_features()
    features = union.fit_transform(df)
    nn = NearestNeighbors(n_neighbors=wine_n).fit(features)

    res = []
    for i in wine_id:
        # Rows of `features` are positional, so look up the POSITION of the
        # wine rather than its index label (they differ unless the frame has a
        # default RangeIndex).
        positions = np.flatnonzero((df["Id"] == i).values)
        if positions.size == 0:
            raise IndexError("wine id %r not found in df['Id']" % (i,))
        _, indices = nn.kneighbors(features[int(positions[0])])
        # kneighbors returns row positions -> select with iloc, not loc.
        res.append(df.iloc[indices[0]])

    return pd.concat(res).drop_duplicates(subset = ["Id"])

In [93]:
def bayes_sum(N, mu):
    """Return an aggregator computing a prior-smoothed mean.

    The returned callable takes an object ``x`` exposing ``sum()`` and
    ``count()`` (e.g. a pandas Series or group) and evaluates
    ``(x.sum() + mu*N) / (x.count() + N)``, i.e. the mean of ``x`` after adding
    ``N`` pseudo-observations of value ``mu``.

    Parameters
    ----------
    N : number
        Number of pseudo-observations.
    mu : number
        Value of each pseudo-observation.
    """
    return lambda x: (mu*N + x.sum()) / float(N + x.count())

In [105]:
# collaborative-filtering results
def collabrative_filtering(rev, wine_n_col, user_name):
    """Collaborative filtering: wines liked by users with similar ratings.

    Every user is represented by a ``{wine_id: rating}`` mapping, vectorized
    with ``DictVectorizer``. The ``wine_n_col`` users closest to ``user_name``
    under cosine distance are looked up; the queried user is removed from that
    set, and the remaining neighbours' ratings are averaged per wine with
    ``bayes_sum(5, 3)``.

    Parameters
    ----------
    rev : pandas.DataFrame
        Reviews with columns ``'wine_id'``, ``'rating'`` and ``'user_name'``.
    wine_n_col : int
        Number of nearest users requested (this count includes the queried
        user, so at most ``wine_n_col - 1`` neighbours are used).
    user_name : label
        User to recommend for; must occur in ``rev['user_name']``.

    Returns
    -------
    list
        Up to 10 wine ids, best smoothed rating first.

    Raises
    ------
    KeyError
        If ``user_name`` has no reviews in ``rev``.
    """
    user_ratings = rev[['wine_id', 'rating', 'user_name']]
    # One {wine_id: rating} dict per user. Selecting the two value columns
    # explicitly keeps the grouping column out of the applied frame.
    by_user_rating = user_ratings.groupby('user_name')[['wine_id', 'rating']].apply(
        lambda items: dict(zip(items['wine_id'], items['rating'])))

    features = DictVectorizer().fit_transform(by_user_rating)
    nn = NearestNeighbors(n_neighbors = wine_n_col, metric='cosine', algorithm='brute').fit(features)

    user_pos = by_user_rating.index.get_loc(user_name)
    _, indices = nn.kneighbors(features[user_pos, :])

    # Drop the queried user BY NAME: when another user is at the same distance
    # (e.g. identical ratings) the user is not guaranteed to come first, so
    # discarding position 0 could keep the user and drop a real neighbour.
    candidates = [by_user_rating.index[i] for i in indices[0]]
    neighbors = [name for name in candidates if name != user_name][:max(wine_n_col - 1, 0)]

    ratings_grp = rev[rev['user_name'].isin(neighbors)].groupby('wine_id')['rating']
    bsum = ratings_grp.aggregate(bayes_sum(5, 3)).sort_values(ascending=False).iloc[:10]
    return bsum.index.tolist()

In [100]:
#final result
def recommend_result(wine_n, wine_id, df, rev, wine_n_col, user_name, content_only = True, collab_only = True):
    """Combine content-based and collaborative recommendations.

    Despite their names, the two flags act as on/off switches for the two
    recommenders: ``content_only`` enables the content-based part and
    ``collab_only`` enables the collaborative part.

    Parameters
    ----------
    wine_n : int
        Neighbours per query wine for the content-based recommender.
    wine_id : iterable
        Ids of the wines to find similar wines for.
    df : pandas.DataFrame
        Wine table passed to ``content_recommend``.
    rev : pandas.DataFrame
        Review table passed to ``collabrative_filtering``.
    wine_n_col : int
        Number of nearest users for the collaborative recommender.
    user_name : label
        User to recommend for in the collaborative recommender.
    content_only, collab_only : bool, default True
        Enable the respective recommender.

    Returns
    -------
    list or pandas.Series
        * both enabled: list of distinct ids, content-based first, then the
          collaborative ones not already present;
        * content only: the ``"Id"`` column returned by ``content_recommend``;
        * collaborative only: the list returned by ``collabrative_filtering``;
        * neither: an empty list.
    """
    if not content_only and not collab_only:
        return []

    # Run only the recommenders that were asked for.
    content = collab = None
    if content_only:
        content = content_recommend(wine_n, wine_id, df)["Id"]
    if collab_only:
        # The collaborative step has its own neighbour count.
        collab = collabrative_filtering(rev, wine_n_col, user_name)

    if content_only and collab_only:
        # Order-preserving union, so the result is deterministic.
        merged, seen = [], set()
        for source in (content, collab):
            for item in source:
                if item not in seen:
                    seen.add(item)
                    merged.append(item)
        return merged

    return content if content_only else collab

In [103]:
# `df` is the whole wine DataFrame and `rev` is the review-list DataFrame.
# Neither is defined in the cells visible here, so both must already exist in
# the kernel before this cell runs.
# content_only keeps its default (True) and collab_only is False, so only the
# content-based Ids for wines 109052 and 109373 are returned.
rec = recommend_result(5, [109052, 109373], df, rev, 5, 'Lisa Strutz', collab_only = False)