# Content-based

## Use the attributes in the dataset

In [2]:
import pandas as pd
import glob
import json
from collections import defaultdict
from flatten_dict import flatten
from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import RidgeCV, LinearRegression, SGDRegressor, Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
import numpy as np

In [4]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
fd = pd.read_pickle("feature.pkl")

In [20]:
# One-hot encode the catergories. In order to vectorize them using DictVectorizer()
#need to transform the column into list of dictionaries

class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col, imp = 0):
        self.col = col
        self.imp = imp
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(l):
            dic = defaultdict(int)
            try:
                for x in l:
                    if x:
                        dic[x] = 10**(-self.imp)
                return dic
            except TypeError:
                return {}
        
        return X[self.col].apply(to_dict)

### Customize labels in columns

In [31]:
# this function is to transform the single features per row, which means not in a list
# One-hot encode the catergories. In order to vectorize them using DictVectorizer()
#need to transform the column into list of dictionaries
# old class: try to add weight to each specific label
'''
import collections
class DictEncoder_l(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col, label = [], imp=0): # if not specify label, use the whole features
        self.col = col
        self.label = label
        self.imp = imp
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(x):
            dic = defaultdict(int)
            try:
                if self.label:
                    if x in self.label:
                        dic[x] = 10**(-self.imp)
                    else:
                        dic[x] = 1
                    return dic
                else:
                    if x:
                        dic[x] = 1
                    return dic
            except TypeError:
                return {}
        
        return X[self.col].apply(to_dict)
'''

In [8]:
import collections
class DictEncoder_l(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col, imp=0): # if not specify label, use the whole features
        self.col = col
        self.imp = imp
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(x):
            dic = defaultdict(int)
            try:
                if x:
                    dic[x] = 10**(-self.imp)
                return dic
            except TypeError:
                return {}
        
        return X[self.col].apply(to_dict)

In [10]:
# scale professional rating into numbers between 1 to 10
class DictEncoder_scale(base.BaseEstimator, base.TransformerMixin): 
    
    def __init__(self, col, label = [], imp=0): # if not specify label, use the whole features
        self.col = col
        self.label = label
        self.imp = imp
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(x):
            dic = defaultdict(int)
            try:
                if 1 <= x <= 100:
                    dic[str(x)] = (x/100.0)*10**self.imp
                elif x == 0:
                    dic[str(x)] = 1
                return dic
            except TypeError:
                return {}
        
        return X[self.col].apply(to_dict)


In [11]:
# scale selected ranged price into numbers between 0 to 1
class Price_scale(base.BaseEstimator, base.TransformerMixin): 
    
    def __init__(self, col, Prange = (4.79, 100)): # if not specify label, use the whole features
        self.col = col
        self.range = Prange
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(x):
            dic = defaultdict(int)
            try:
                if self.range[0]<= x <= self.range[1]:
                    dic[str(x)] = x/(self.range[1] - self.range[0])
                else:
                    dic[str(x)] = 1
                return dic
            except TypeError:
                return {}
        
        return X[self.col].apply(to_dict)

In [17]:
# adding all feature columns: 'Vintage', 'Vineyard/Name', 'Varietal/Name'(grape), 'Varietal/WineType/Name','Attr'
vintage_pipe = Pipeline([('encoder', DictEncoder_l('Vintage')),
                     ('vectorizer', DictVectorizer())])
vineyard_pipe = Pipeline([('encoder', DictEncoder_l('Vineyard/Name')),
                     ('vectorizer', DictVectorizer())])
varietal_pipe = Pipeline([('encoder', DictEncoder_l('Varietal/Name', imp = 2)),
                     ('vectorizer', DictVectorizer())])
wineType_pipe = Pipeline([('encoder', DictEncoder_l('Varietal/WineType/Name')),
                     ('vectorizer', DictVectorizer())])
PriceRetail_pipe = Pipeline([('encoder', Price_scale('PriceRetail', Prange = (10, 30))),
                     ('vectorizer', DictVectorizer())])
region_pipe = Pipeline([('encoder', DictEncoder_l('Appellation/Region/Name' )),
                     ('vectorizer', DictVectorizer())])
Attr_pipe = Pipeline([('encoder', DictEncoder('Attr')),
                     ('vectorizer', DictVectorizer())])
rating_p = Pipeline([('encoder', DictEncoder_scale('Ratings/HighestScore')),
                     ('vectorizer', DictVectorizer())])

In [18]:
union = FeatureUnion([('vintage', vintage_pipe),
                     ('vineyard', vineyard_pipe),
                      ('varietal', varietal_pipe),
                      ('wineType', wineType_pipe),
                      ('Region', region_pipe),
                      ('PriceRetail', PriceRetail_pipe),
                      ('Attr', Attr_pipe),
                      ('rating', rating_p)
                     ])

In [19]:
features = union.fit_transform(fd)

In [15]:
fd[fd['Id'] == 119903]

Unnamed: 0,Attr,Vintage,Vineyard/Name,Varietal/Name,Varietal/WineType/Name,PriceRetail,Appellation/Region/Name,Ratings/HighestScore,Id,Name
33156,"[Smooth &amp; Supple, Green Wines]",2010,Frog's Leap,Merlot,Red Wines,34,California,93,119903,Frog's Leap Merlot 2010


In [30]:
sum(fd['PriceRetail']>100)/110152.0

0.091255719369598379

In [16]:
nn = NearestNeighbors(n_neighbors=20).fit(features)

# features[1] is the second observation in df, which is used as the point to find its 20 neareast neighbours
dists, indices = nn.kneighbors(features[fd[fd["Id"]==119903].index.tolist()[0]])
fd.loc[indices[0],:]


Unnamed: 0,Attr,Vintage,Vineyard/Name,Varietal/Name,Varietal/WineType/Name,PriceRetail,Appellation/Region/Name,Ratings/HighestScore,Id,Name
33156,"[Smooth &amp; Supple, Green Wines]",2010,Frog's Leap,Merlot,Red Wines,34.0,California,93,119903,Frog's Leap Merlot 2010
99360,"[Smooth &amp; Supple, Green Wines]",2010,Frog's Leap,Merlot,Red Wines,76.0,California,0,121126,Frog's Leap Merlot (1.5 Liter Magnum) 2010
32976,"[Smooth &amp; Supple, Green Wines]",2010,Frog's Leap,Cabernet Sauvignon,Red Wines,42.0,California,0,121128,Frog's Leap Estate Grown Cabernet Sauvignon 2010
71011,"[Green Wines, Smooth &amp; Supple]",2010,Frog's Leap,Cabernet Sauvignon,Red Wines,85.0,California,0,125119,Frog's Leap Estate Grown Cabernet Sauvignon (1...
30744,"[Smooth &amp; Supple, Green Wines]",2010,Twomey Cellars by Silver Oak,Pinot Noir,Red Wines,49.99,California,93,121792,Twomey Cellars by Silver Oak Russian River Pin...
71358,"[Smooth &amp; Supple, Green Wines]",2010,Frog's Leap,Cabernet Sauvignon,Red Wines,25.99,California,0,121708,Frog's Leap Estate Grown Cabernet Sauvignon (3...
1090,"[Smooth &amp; Supple, Green Wines, Has Large L...",2009,Frog's Leap,Merlot,Red Wines,34.0,California,90,114337,Frog's Leap Merlot 2009
78907,[Smooth &amp; Supple],2010,Schug Estate Winery,Pinot Noir,Red Wines,16.99,California,93,131071,Schug Carneros Pinot Noir (375ML half-bottle) ...
11972,"[Green Wines, Smooth &amp; Supple, Has Large L...",2008,Frog's Leap,Merlot,Red Wines,34.0,California,0,109148,Frog's Leap Merlot 2008
81348,"[Smooth &amp; Supple, Green Wines, Has Large L...",2012,Frog's Leap,Cabernet Sauvignon,Red Wines,19.99,California,93,135623,Frog's Leap Estate Grown Cabernet Sauvignon (3...


In [None]:
from functools import reduce
res = []
for i in selected['Id']:
    dists, indices = nn.kneighbors(features[fd[fd["Id"]==i].index.tolist()[0]])
    res.append(fd.loc[indices[0]])
df = pd.concat(res)

In [None]:
df[df.duplicated(subset =["Id"])]

In [None]:
#from sklearn.cluster import KMeans
#km = KMeans(n_clusters = 5).fit(features)

In [None]:
#km.cluster_centers_[0]

In [None]:
#select wine for myself to use for prediction
res = []
for i in range(len(fdf['Attr'])):
    try:
        if fd.loc[i]['Attr'][0] == 'Smooth &amp; Supple' and fd.loc[i]['Varietal/Name']=='Merlot':
            res.append(fd.loc[i]["Id"])
    except:
        continue
#sort by both features
fd[fd["Id"].isin(res)].sort_values(["Ratings/HighestScore", "PriceRetail"], ascending = [0, 1])

In [None]:
#selected wine ids
sd = [109052, 109373, 113234, 119903]
#get rows correspondent to the selected ids
selected = fd[fd["Id"].isin(sd)]

In [None]:
#assign the ratings for each wine that I picked
selected["rating"] = [4,5,3,4]

In [None]:
selected

In [None]:
#the recommended results (wine ids) from collabrotive filtering
collab = [139662,
 142616,
 138343,
 135501,
 131991,
 94950,
 131981,
 145266,
 122774,
 167243,
 126553,
 124908,
 123602,
 167882,
 141174,
 117411,
 116212,
 98683,
 94958,
 94073]

In [None]:
#concatenate the results from both systems
col = fd[fd["Id"].isin(collab)]
l = [col, selected]
recommend = pd.concat(l)

In [None]:
recommend