In [1]:
# coding: utf-8

# # Assignment 3:  Recommendation systems
#
# Here we'll implement a content-based recommendation algorithm.
# It will use the list of genres for a movie as the content.
# The data come from the MovieLens project: http://grouplens.org/datasets/movielens/
# Note that I have not provided many doctests for this one. I strongly
# recommend that you write your own for each function to ensure your
# implementation is correct.

# Please only use these imports.
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile

def download_data():
    """ DONE. Download and unzip data.

    Fetches the MovieLens "ml-latest-small" archive into the current working
    directory as 'ml-latest-small.zip' and extracts it in place.
    """
    url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile('ml-latest-small.zip') as zfile:
        zfile.extractall()


def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.

    Lowercases my_string and returns the list of maximal runs of word
    characters and hyphens, so 'Sci-Fi' stays a single token.

    >>> tokenize_string('Horror|Sci-Fi')
    ['horror', 'sci-fi']
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.findall(r'[\w\-]+', my_string.lower())


In [2]:
def tokenize(movies):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genres' field of each movie using tokenize_string.
    Note: the movies parameter is modified directly; no copy is made.
    Params:
      movies...The movies DataFrame (must have a 'genres' column of strings)
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.
    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    # Series.map builds the object column element by element, so token lists
    # of different (or identical) lengths are stored as-is. Going through
    # np.array(list_of_lists) instead fails on ragged input with recent NumPy
    # and yields a 2-D array when every movie has the same number of genres.
    movies['tokens'] = movies['genres'].map(tokenize_string)
    return movies

In [3]:
# Sanity check for tokenize(): build a two-movie frame, add the 'tokens'
# column, and display the resulting token lists as the cell's value.
movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
movies = tokenize(movies)
movies['tokens'].tolist()

[['horror', 'romance'], ['sci-fi']]

In [4]:
def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i
    A movie with no tokens gets an all-zero feature row.
    Params:
      movies...The movies DataFrame (must already have a 'tokens' column)
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int, sorted alphabetically (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    token_lists = movies['tokens'].tolist()
    num_docs = len(token_lists)

    # df(i): number of distinct documents that contain each term.
    doc_freq = Counter()
    for tokens in token_lists:
        doc_freq.update(set(tokens))

    # Alphabetical term -> column index, built in a single pass.
    vocab = {term: col for col, term in enumerate(sorted(doc_freq))}
    num_features = len(vocab)

    # Fill an object array one element at a time so that each sparse row is
    # stored as a single opaque value rather than being coerced by numpy/pandas.
    features = np.empty(num_docs, dtype=object)
    for row, tokens in enumerate(token_lists):
        term_freq = Counter(tokens)
        if not term_freq:
            # No genres: max() would fail on an empty document, so emit zeros.
            features[row] = csr_matrix((1, num_features))
            continue
        max_tf = max(term_freq.values())
        cols = [vocab[term] for term in term_freq]
        vals = [tf / max_tf * math.log10(num_docs / doc_freq[term])
                for term, tf in term_freq.items()]
        features[row] = csr_matrix((vals, ([0] * len(cols), cols)),
                                   shape=(1, num_features))

    movies['features'] = pd.Series(features, index=movies.index)
    return movies, vocab

            
        
    
   
    

In [5]:
# Sanity check for featurize() on a three-movie frame with disjoint genres.
movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi|Action'],[789,'Comedy']],columns=['movieId', 'genres'])
movies = tokenize(movies)
# Flatten the token lists and sort the distinct terms to show the expected vocab keys.
vocab_keys = sorted(set(sum(movies['tokens'].tolist(), [])))
print ('keys - ',vocab_keys)
movies, vocab = featurize(movies)
print('\nvocab:')
# Only the first ten (term, index) pairs are printed.
print(sorted(vocab.items())[:10])

keys -  ['action', 'comedy', 'horror', 'romance', 'sci-fi']

vocab:
[('action', 0), ('comedy', 1), ('horror', 2), ('romance', 3), ('sci-fi', 4)]


In [6]:
def train_test_split(ratings):
    """DONE.
    Split the ratings frame into a training and a testing set.

    Every 1000th row by position (0, 1000, 2000, ...) goes to the test set;
    all remaining rows, in their original order, form the training set.
    Returns:
      A tuple (ratings_train, ratings_test) of DataFrames.
    """
    num_rows = len(ratings)
    test_positions = list(range(0, num_rows, 1000))
    held_out = set(test_positions)
    train_positions = [pos for pos in range(num_rows) if pos not in held_out]
    return ratings.iloc[train_positions], ratings.iloc[test_positions]

In [7]:
def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents the tf-idf feature vector of a movie.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      A float. The cosine similarity, defined as: dot(a, b) / (||a|| * ||b||)
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
      If either vector has zero norm the similarity is defined as 0.0.
    """
    dot = a.dot(b.T).toarray()[0][0]
    # Sum of squared entries computed on the sparse data; no dense copy needed.
    norm_a = np.sqrt(a.multiply(a).sum())
    norm_b = np.sqrt(b.multiply(b).sum())
    denom = norm_a * norm_b
    if denom == 0:
        # An all-zero vector (e.g. only terms with idf 0) would otherwise give
        # 0/0 = NaN together with a RuntimeWarning.
        return 0.0
    return dot / denom

In [10]:
def make_predictions(movies, ratings_train, ratings_test):
    """
    Predict a rating for every (user, movie) row of ratings_test.

    For a target movie m and user u, the prediction is the weighted average
    of u's ratings in ratings_train, where the weight of each rated movie is
    its cosine similarity to m. Only movies with positive similarity take
    part. If none of the user's rated movies has positive similarity, the
    prediction is the mean of that user's training ratings; if the user has
    no training ratings at all, it is the mean of all training ratings.

    Example: with movies 123 'Horror|Romance', 456 'Sci-Fi|Action' and
    789 'Comedy' (no shared genres), a user who rated 456 -> 4 and
    789 -> 1.5 in training gets a prediction of 2.75 for movie 123.

    Params:
      movies..........The movies DataFrame with 'movieId' and 'features' columns.
      ratings_train...DataFrame with 'userId', 'movieId' and 'rating' columns.
      ratings_test....DataFrame with 'userId' and 'movieId' columns.
    Returns:
      A numpy array with one predicted rating per row of ratings_test,
      in the same order.
    """
    # Build the movieId -> feature-vector lookup once instead of scanning the
    # whole movies frame for every similarity computation.
    features_by_movie = dict(zip(movies['movieId'], movies['features']))
    global_mean = ratings_train['rating'].mean()

    predictions = []
    for user_id, movie_id in zip(ratings_test['userId'], ratings_test['movieId']):
        target = features_by_movie[movie_id]
        user_ratings = ratings_train[ratings_train['userId'] == user_id]

        weighted_sum = 0.
        sim_total = 0.
        for rated_id, rating in zip(user_ratings['movieId'], user_ratings['rating']):
            sim = cosine_sim(target, features_by_movie[rated_id])
            if sim > 0:
                sim_total += sim
                weighted_sum += sim * rating

        if sim_total > 0:
            predictions.append(weighted_sum / sim_total)
        elif len(user_ratings) > 0:
            predictions.append(user_ratings['rating'].mean())
        else:
            # Cold-start user: the mean of an empty selection would be NaN.
            predictions.append(global_mean)

    return np.array(predictions)

In [11]:
# End-to-end check on a toy data set: 3 movies with disjoint genres and
# 12 users who each rated all three movies (36 ratings in total).
movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi|Action'],[789,'Comedy']], columns=['movieId', 'genres'])
ratings = pd.DataFrame([[1,123,2],[1,456,4],[1,789,1.5],[2,123,3],[2,456,1],[2,789,3],[3,123,3],[3,456,4],[3,789,2],[4,123,2],[4,456,3],[4,789,2.5],
                            [5,123,2.5],[5,456,4.5],[5,789,5],[6,123,3.5],[6,456,1.5],[6,789,5],[7,123,3.5],[7,456,4.5],[7,789,4],[8,123,2.5],[8,456,3.5],[8,789,4.5],
                            [9,123,4],[9,456,4],[9,789,2.5],[10,123,3],[10,456,2],[10,789,4],[11,123,3],[11,456,4],[11,789,3.5],[12,123,2],[12,456,1],[12,789,3]],
                           columns=['userId','movieId','rating'])
movies = tokenize(movies)
# Flatten the token lists and sort the distinct terms to show the expected vocab keys.
vocab_keys = sorted(set(sum(movies['tokens'].tolist(), [])))
print ('keys - ',vocab_keys)
movies, vocab = featurize(movies)
print('vocab:')
print(sorted(vocab.items())[:10])
ratings_train, ratings_test = train_test_split(ratings)

print('%d training ratings; %d testing ratings' % (len(ratings_train), len(ratings_test)))
# The result is stored but not displayed by this cell.
predictions = make_predictions(movies, ratings_train, ratings_test)


keys -  ['action', 'comedy', 'horror', 'romance', 'sci-fi']
vocab:
[('action', 0), ('comedy', 1), ('horror', 2), ('romance', 3), ('sci-fi', 4)]
35 training ratings; 1 testing ratings


In [12]:
def mean_absolute_error(predictions, ratings_test):
    """DONE.
    Return the mean absolute error of the predictions.

    Params:
      predictions....Array of predicted ratings, one per test row.
      ratings_test...DataFrame whose 'rating' column holds the true ratings.
    Returns:
      The average of |prediction - true rating| over all test rows.
    """
    actual = np.array(ratings_test.rating)
    residuals = predictions - actual
    return np.abs(residuals).mean()

In [13]:
def main():
    """Run the full pipeline on the MovieLens small data set.

    Downloads the data, tokenizes and featurizes the movies, splits the
    ratings, predicts the held-out ratings and prints the first ten vocab
    entries, the split sizes, the mean absolute error and the first ten
    predictions.
    """
    download_data()
    data_dir = 'ml-latest-small'
    ratings = pd.read_csv(os.path.join(data_dir, 'ratings.csv'))
    movies = pd.read_csv(os.path.join(data_dir, 'movies.csv'))

    movies, vocab = featurize(tokenize(movies))
    first_terms = sorted(vocab.items())[:10]
    print('vocab:')
    print(first_terms)

    ratings_train, ratings_test = train_test_split(ratings)
    split_sizes = (len(ratings_train), len(ratings_test))
    print('%d training ratings; %d testing ratings' % split_sizes)

    predictions = make_predictions(movies, ratings_train, ratings_test)
    error = mean_absolute_error(predictions, ratings_test)
    print('error=%f' % error)
    print(predictions[:10])


# Run the full pipeline when this code is executed as the main module.
if __name__ == '__main__':
    main()

vocab:
[('action', 0), ('adventure', 1), ('animation', 2), ('children', 3), ('comedy', 4), ('crime', 5), ('documentary', 6), ('drama', 7), ('fantasy', 8), ('film-noir', 9)]
99903 training ratings; 101 testing ratings
error=0.787455
[2.7945653  2.62385286 2.76558239 4.24064843 3.22115155 4.11423684
 3.92040563 3.98286924 3.21565818 3.32692774]
