In [3]:
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile

In [4]:
def download_data():
    """ DONE. Download and unzip data.
    """
    url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    zfile = zipfile.ZipFile('ml-latest-small.zip')
    zfile.extractall()
    zfile.close()

In [5]:
def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.
    """
    return re.findall('[\w\-]+', my_string.lower())

In [6]:
def tokenize(movies):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genre' field of each movie. Use the tokenize_string method above.

    Note: you may modify the movies parameter directly; no need to make
    a new copy.
    Params:
      movies...The movies DataFrame
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.

    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    movies['tokens'] = movies['genres'].apply(tokenize_string)
    return movies

In [7]:
movies = pd.DataFrame([[123, 'Horror|Romance|Sci-Fi'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
movies = tokenize(movies)
movies['tokens'].tolist()

[['horror', 'romance', 'sci-fi'], ['sci-fi']]

In [18]:
def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    #create vocab
    vocab = dict()
    vocabulary = sorted(list(set([voc for vocs in movies.tokens.values for voc in vocs])))
    for voc in vocabulary:
        vocab[voc] = vocabulary.index(voc)
    
    
    doc_occurences = sum((Counter(set(doc)) for doc in movies.tokens.values), Counter())
    
    def create_csr(doc):
        csr = csr_matrix((1,len(vocab)))
        
        for token in doc:
            freq = doc.count(token)
            tfidf = freq*np.log10(len(movies)/doc_occurences[token])
            csr[0,vocab[token]] = tfidf
        return csr

    movies['features'] = movies["tokens"].apply(create_csr)
        
    return (movies, vocab)

In [19]:
featurize(movies)



(   movieId                 genres                     tokens  \
 0      123  Horror|Romance|Sci-Fi  [horror, romance, sci-fi]   
 1      456                 Sci-Fi                   [sci-fi]   
 
                                             features  
 0    (0, 0)\t0.301029995664\n  (0, 1)\t0.30102999...  
 1                                        (0, 2)\t0.0  ,
 {'horror': 0, 'romance': 1, 'sci-fi': 2})

In [21]:
def train_test_split(ratings):
    """DONE.
    Returns a random split of the ratings matrix into a training and testing set.
    """
    test = set(range(len(ratings))[::1000])
    train = sorted(set(range(len(ratings))) - test)
    test = sorted(test)
    return ratings.iloc[train], ratings.iloc[test]

In [22]:
def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents the tf-idf feature vector of a movie.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      A float. The cosine similarity, defined as: dot(a, b) / ||a|| * ||b||
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
    """
    normalize_a = np.sqrt(np.sum([item ** 2 for item in list(a.toarray())[0]]))
    normalize_b = np.sqrt(np.sum([item ** 2 for item in list(b.toarray())[0]]))
    cosine_sim = a.dot(b.T) / (normalize_a*normalize_b)
    return cosine_sim.toarray()[0][0]

In [23]:
def make_predictions(movies, ratings_train, ratings_test):
    """
    Using the ratings in ratings_train, predict the ratings for each
    row in ratings_test.

    To predict the rating of user u for movie i: Compute the weighted average
    rating for every other movie that u has rated.  Restrict this weighted
    average to movies that have a positive cosine similarity with movie
    i. The weight for movie m corresponds to the cosine similarity between m
    and i.

    If there are no other movies with positive cosine similarity to use in the
    prediction, use the mean rating of the target user in ratings_train as the
    prediction.

    Params:
      movies..........The movies DataFrame.
      ratings_train...The subset of ratings used for making predictions. These are the "historical" data.
      ratings_test....The subset of ratings that need to predicted. These are the "future" data.
    Returns:
      A numpy array containing one predicted rating for each element of ratings_test.
    """
    predictions = list()
    for index, rating in ratings_test.iterrows():
        cosine_sims = list()
        user_ratings = ratings_train[ratings_train.userId == int(rating.userId)]
        reference_features = movies[movies.movieId == int(rating.movieId)].features.iloc()[0]
        for index, rating in user_ratings.iterrows():
            movie_features = movies[movies.movieId == int(rating.movieId)].features.iloc()[0]
            cosine_sims.append(cosine_sim(reference_features, movie_features))
        user_ratings.loc[index, "cosine"] = cosine_sims
        usefull_ratings = user_ratings[user_ratings.cosine != 0]
        usefull_ratings.loc[index, "weight"] = usefull_ratings.cosine * usefull_ratings.rating
        if len(usefull_ratings) != 0:
            pred = usefull_ratings.weight.sum() / usefull_ratings.cosine.sum()
        else:
            pred = user_ratings.rating.mean()
        predictions.append(pred)
    return np.array(predictions)

In [24]:
def mean_absolute_error(predictions, ratings_test):
    """DONE.
    Return the mean absolute error of the predictions.
    """
    return np.abs(predictions - np.array(ratings_test.rating)).mean()

In [25]:
download_data()
path = 'ml-latest-small'
ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')
movies = pd.read_csv(path + os.path.sep + 'movies.csv')

In [26]:
movies = tokenize(movies)
movies, vocab = featurize(movies)
print('vocab:')
print(sorted(vocab.items())[:10])
ratings_train, ratings_test = train_test_split(ratings)
print('%d training ratings; %d testing ratings' % (len(ratings_train), len(ratings_test)))



vocab:
[('action', 0), ('adventure', 1), ('animation', 2), ('children', 3), ('comedy', 4), ('crime', 5), ('documentary', 6), ('drama', 7), ('fantasy', 8), ('film-noir', 9)]
99903 training ratings; 101 testing ratings


In [27]:
predictions = make_predictions(movies, ratings_train, ratings_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


ValueError: setting an array element with a sequence.

In [207]:
print('error=%f' % mean_absolute_error(predictions, ratings_test))
print(predictions)

error=0.787455
[ 2.7945653   2.62385286  2.76558239  4.24064843  3.22115155  4.11423684
  3.92040563  3.98286924  3.21565818  3.32692774  4.27272727  3.31632777
  3.37410419  3.6070913   4.19457837  3.11741553  3.9041065   3.47972898
  3.54681362  3.99956908  2.61430437  3.83085137  3.41293997  3.08717938
  2.22706562  3.86792458  1.92705103  4.15639473  3.54094255  2.72964384
  2.42921253  3.75428664  3.8156648   3.75633539  3.3759458   4.17401655
  2.70290168  3.50817296  4.32120579  3.19255656  3.94989633  3.65050843
  3.5248113   3.40363323  3.05135337  2.05252246  3.77652343  3.70554955
  2.71055541  3.11391527  3.34696535  3.07691756  3.39595805  3.18173131
  3.60142465  3.16071363  3.51946964  3.53780451  4.1255733   4.28514229
  3.68441942  4.21954433  3.0047321   2.36989936  3.75551908  3.4704615
  2.69670497  3.51192033  3.80634344  4.21522321  3.44948935  3.9882542
  3.24598924  3.60530866  3.85374624  3.5997006   3.5862512   3.28802589
  4.41838202  3.2957547   3.40068384  