In [4]:
import pandas as pd
import glob
import json
from collections import defaultdict
from flatten_dict import flatten
from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import RidgeCV, LinearRegression, SGDRegressor, Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [5]:
import numpy as np

In [6]:
# load rev dataframe
rev = pd.read_pickle('review.pkl')

In [8]:
df_content = pd.read_pickle("feature.pkl")

In [11]:
# combine the rev and content table to make a hybrid table as reference
df_hybrid = pd.DataFrame.merge(rev, df_content, on = "Id")
df_hybrid.to_pickle("hybrid.pkl")

In [12]:
df_hybrid.iloc[0]['rating']

4

In [7]:
user_ratings = rev[['Id', 'rating', 'author', 'date']]

In [8]:
#group the rating by user, then convert the dataframe into series to fit into DictVectorizor later in the format of 
#user name as index, the value will be dicts with id(wine id) as key, rating as values

by_user_rating = user_ratings.groupby('author').apply(lambda items : {i[1]:i[2] for i in items.itertuples()})

In [10]:
trial =by_user_rating.to_frame().T

In [79]:
trial['"mowine"']

0    {97933: 4}
Name: "mowine", dtype: object

In [77]:
trial['"Q"']

0    {136338: 5}
Name: "Q", dtype: object

In [11]:
trial['Yudong']=[{109052:5, 109373:4, 113234:3, 119903:4}]

In [13]:
series1=trial.iloc[0,:] #convert the dataframe back to series

In [14]:
type(series1)

pandas.core.series.Series

In [15]:
# take wine id as features and ratings as its corresponding values, 
# which can be used to calculate the distance for similarity later in knn. 8757 unique users and 26624 wine ids.
features = DictVectorizer().fit_transform(series1)

In [16]:
features

<8758x17251 sparse matrix of type '<class 'numpy.float64'>'
	with 36603 stored elements in Compressed Sparse Row format>

In [17]:
nn = NearestNeighbors(n_neighbors=20, metric='cosine', algorithm='brute').fit(features)

In [18]:
rate = DictVectorizer()
f = rate.fit_transform(series1)
#rate.get_feature_names()

In [19]:
dists, indices = nn.kneighbors(features[series1.index.get_loc('Yudong'), :])

In [20]:
indices[0][:]

array([8757, 5835, 5821, 5834, 5836, 5837, 5838, 5839, 5840, 5841, 5842,
       5843, 5844, 5845, 5846, 5847, 5848, 5849, 5822, 5823])

In [21]:
neighbors = [series1.index[i] for i in indices[0]][1:]
ratings_grp = df_hybrid[df_hybrid['author'].isin(neighbors)] \
    .groupby('Id')['rating']
ratings_grp

NameError: name 'df_hybrid' is not defined

In [104]:
# Naive Mean
ratings_grp.agg(['mean', 'count']).sort_values('mean', ascending=False)[:20]

Unnamed: 0_level_0,mean,count
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
135501,5,1
94950,5,1
131991,5,1
138343,5,1
139662,5,1
142616,5,1
145266,5,1
131981,5,1
159530,4,1
158574,4,1


In [106]:
#bayesian smoothing
def bayes_sum(N, mu):
    return lambda x: (x.sum() + mu*N) / float(x.count() + N)

bsum = ratings_grp.aggregate(bayes_sum(5, 3)).sort_values(ascending=False)[:20]

In [120]:
bsum.to_frame().index.tolist()

[139662,
 142616,
 138343,
 135501,
 131991,
 94950,
 131981,
 145266,
 122774,
 167243,
 126553,
 124908,
 123602,
 167882,
 141174,
 117411,
 116212,
 98683,
 94958,
 94073]

In [44]:
# convert user names into integer id
user_ratings = user_ratings.assign(user_id=(user_ratings['author']).astype('category').cat.codes)

In [47]:
user_ratings = user_ratings.rename(index = str, columns = {"Id":"wine_id", "author":"user_name"})

In [49]:
user_ratings.to_csv("review_list.csv")

In [None]:
user = user_ratings[[usr_ratings[]]]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(user_ratings, user_ratings['rating'])

In [28]:
class Dictizer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col):
        self.col = col
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.col].apply(lambda x: {x: 1})

In [29]:
user_dict = Pipeline([("users", Dictizer('user_id')),
                     ('vect', DictVectorizer())])
wine_dict = Pipeline([("wines", Dictizer('Id')),
                     ('vect', DictVectorizer())])
union = FeatureUnion([("users", user_dict),
                     ("wines", wine_dict)])
lr = Pipeline([("features", union),
              ("lr_model", Ridge())])

In [30]:
lr.fit(X_train, y_train)

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('users', Pipeline(steps=[('users', Dictizer(col='user_id')), ('vect', DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])), ('wines', Pipeline(steps=[('wines', Dictizer(col='Id')), ('vect'...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [31]:
def rmse(model, X, y):
    return np.sqrt(mean_squared_error(model.predict(X), y))

In [32]:
np.sqrt(y_test.var())

1.5398523000234718

In [33]:
rmse(lr, X_test, y_test)

1.0969321501510341

In [34]:
rmse(lr, X_train, y_train)

0.63573593076395474

In [35]:
gs = GridSearchCV(lr, {'lr_model__alpha': np.logspace(-1,2,20)}, n_jobs=-1)

In [36]:
gs.fit(X_train, y_train)
rmse(gs, X_test, y_test)

1.0904123043979514

In [37]:
best_lr = gs.best_params_

In [38]:
# Algorithm adapted from http://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/
# Modeling Interaction by matrix factorization

class ResidualMatrixFactorization(base.BaseEstimator, base.RegressorMixin):
    
    def __init__(self, base_est, n_factors, n_iters, alpha):
        self.base_est = base_est
        self.n_factors = n_factors
        self.n_iters = n_iters
        self.alpha = alpha
        
        self.user_index = self.rating_index = self.Q = self.W = self.user_f = self.movie_f = None
    
    def fit_step(self):
        if self.user_index is None:
            self.init_fit(X, y)
        
        alphas = self.alpha * np.eye(self.n_factors)
        for u, Wu in enumerate(self.W):
            movie_f = self.movie_f[:,Wu]
            self.user_f[u] = np.linalg.solve(
                np.dot(movie_f, movie_f.T) + alphas,
                np.dot(movie_f, self.Q[u, Wu])).T
        for i, Wi in enumerate(self.W.T):
            user_f = self.user_f[Wi,:]
            self.movie_f[:, i] = np.linalg.solve(
                np.dot(user_f.T, user_f) + alphas,
                np.dot(user_f.T, self.Q[Wi, i]))
        
        return self
    
    def fit(self, X, y):
        self.base_est.fit(X, y)
        residuals = y - self.base_est.predict(X)
        df = pd.DataFrame({'user_id': X['user_id'], 'movie_id': X['Id'], 'residuals': residuals})
        
        rating_mat = df.pivot_table('residuals', 'user_id', 'movie_id')
        self.user_index = rating_mat.index
        self.movie_index = rating_mat.columns
        
        self.Q = rating_mat.fillna(0).values
        self.W = (~rating_mat.isnull()).values
        self.user_f = np.random.rand(self.Q.shape[0], self.n_factors)
        self.movie_f = np.random.rand(self.n_factors, self.Q.shape[1])

        for n in xrange(self.n_iters):
            self.fit_step()
        
        return self
    
    def interaction_prediction(self, row):
        uid = row['user_id']
        mid = row['movie_id']
        if uid in self.user_index and mid in self.movie_index:
            return np.dot(self.user_f[self.user_index.get_loc(uid), :],
                          self.movie_f[:, self.movie_index.get_loc(mid)])
        return 0
        
    def predict(self, X):
        return self.base_est.predict(X) + X.apply(self.interaction_prediction, axis=1).values

In [49]:
df = pd.DataFrame({'user_id': X_train['user_id'], 'movie_id': X_train['Id']})
        
rating_mat = df.pivot_table('user_id', 'movie_id')
user_index = rating_mat.index
movie_index = rating_mat.columns

In [50]:
df.head()

Unnamed: 0,movie_id,user_id
19391,160280,6691
48214,139789,6627
13838,133839,7900
50364,117169,1769
28374,136001,6663


In [51]:
rating_mat.head()

Unnamed: 0_level_0,user_id
movie_id,Unnamed: 1_level_1
315,5177.0
316,6155.0
515,2405.0
1231,3863.25
1486,5146.0


In [52]:
gs = GridSearchCV(ResidualMatrixFactorization(best_lr, 20, 10, 1),
                  {'alpha': [1,3,10,30,100]})
gs.fit(X_train, y_train)
gs.best_params_

AttributeError: 'dict' object has no attribute 'fit'