### Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import *
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import hstack

### Define working directories

In [2]:
path_raw_data = 'C:/users/lbros/documents/mids/w207/final_project/raw_data/'
path_clean_data = 'C:/users/lbros/documents/mids/w207/final_project/clean_data/'

### Load clean data

#### Ratings

In [3]:
# load ratings dataframe
ratings_df = pd.read_csv(path_clean_data + 'ratings_final.csv')

In [4]:
# filter usable columns
ratings_df = ratings_df[['userId', 'imdbId', 'rating', 'timestamp']]

In [5]:
# rename imdbId
ratings_df.columns = ['userId', 'imdb_id', 'rating', 'timestamp']

In [6]:
# print dataframe shape
ratings_df.shape

(22040570, 4)

In [7]:
# print dataframe columns
ratings_df.columns

Index(['userId', 'imdb_id', 'rating', 'timestamp'], dtype='object')

#### Movies

In [139]:
# load movies dataframe
movies_df = pd.read_csv(path_clean_data + 'movies_final.csv')

In [140]:
# exclude imdb_id duplicates
movies_df = movies_df[~movies_df['imdb_id'].duplicated(keep='last')]

In [141]:
# set imdb_id as index
movies_df.set_index('imdb_id', inplace=True)

In [142]:
# filter usable columns
movies_df = movies_df.iloc[:,3:]

In [143]:
# print dataframe shape
movies_df.shape

(29399, 177)

In [144]:
# print dataframe columns
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'originally_english',
       'overview', 'popularity', 'production_companies',
       'production_countries', 'revenue', 'runtime',
       ...
       'zu', 'canceled', 'in-production', 'planned', 'post-production',
       'released', 'rumored', 'cast_names', 'crew_names', 'description'],
      dtype='object', length=177)

### Split data

#### Split users into dev and test sets

In [12]:
def user_split(ratings_df, dev_size=5000, random_state=100):
    
    '''Split users into development and test sets'''
    
    # randomly pick [dev_size] users
    unique_users = ratings_df['userId'].unique()
    dev = np.random.choice(unique_users, size=dev_size, replace=False)
    # split users into dev and test based on picked users
    dev_users = ratings_df[np.isin(ratings_df['userId'], dev)]
    test_users = ratings_df[~np.isin(ratings_df['userId'], dev)]
    
    return dev_users, test_users

In [13]:
# apply user_split to ratings_df
dev_users, test_users = user_split(ratings_df)

In [14]:
print('Development set has {} unique users and {} ratings in total.'.format(len(dev_users['userId'].unique()), dev_users.shape[0]))

Development set has 5000 unique users and 841019 ratings in total.


In [15]:
print('Test set has {} unique users and {} ratings in total.'.format(len(test_users['userId'].unique()), test_users.shape[0]))

Test set has 126880 unique users and 21199551 ratings in total.


#### Hold last rating by user for evaluation

In [16]:
def hold_last_out(user_data):
    
    '''Split ratings by user into train and test sets using the hold
    last out method. Test contains the last rated movie by user while
    training contains all the other rated movies.
    '''
    
    # find the indexes correspondent to maximum timestamp by user
    idx_test = user_data.groupby('userId', sort=False).idxmax()['timestamp']
    # filter test data with idx_test
    test = user_data.loc[idx_test]
    # drop idx_test to get train data
    train = user_data.drop(idx_test, axis=0)
    
    return train, test

In [17]:
# apply hold_last_out to dev_users
dev_train, dev_test = hold_last_out(dev_users)

In [18]:
print('Dev_train set has {} unique users and {} ratings in total.'.format(len(dev_train['userId'].unique()), dev_train.shape[0]))

Dev_train set has 5000 unique users and 836019 ratings in total.


In [19]:
print('Dev_test set has {} unique users and {} ratings in total.'.format(len(dev_test['userId'].unique()), dev_test.shape[0]))

Dev_test set has 5000 unique users and 5000 ratings in total.


In [20]:
# apply hold_last_out to test_users
test_train, test_test = hold_last_out(test_users)

In [21]:
print('Test_train set has {} unique users and {} ratings in total.'.format(len(test_train['userId'].unique()), test_train.shape[0]))

Test_train set has 126880 unique users and 21072671 ratings in total.


In [22]:
print('Test_test set has {} unique users and {} ratings in total.'.format(len(test_test['userId'].unique()), test_test.shape[0]))

Test_test set has 126880 unique users and 126880 ratings in total.


### Vectorize text fields

#### Start with movie description as a pilot

In [145]:
# initiatize TfidVectorizer
vectorizer = TfidfVectorizer()
# fit_transform movie description
word_vector = vectorizer.fit_transform(movies_df['description'])
# print number of features
print('Each movie was converted into {} word tokens.'.format(len(vectorizer.get_feature_names())))

Each movie was converted into 9506 word tokens.


### Test different classifiers

In [208]:
def run_clf(user_id, clf=knn, train_ratings_data=dev_train, test_ratings_data=dev_test, movies_data=movies_df, word_vector=word_vector):
    
    '''Run a classifier for a single user and return hit score'''
    
    # get movies and ratings from user training data
    train_movies = train_ratings_data[train_ratings_data['userId']==user_id]['imdb_id']
    train_ratings = train_ratings_data[train_ratings_data['userId']==user_id]['rating']
    #print('User {} rated {} movies in training set.'.format(user_id, len(train_movies)))
    #print('User {} gave {} positive ratings and {} negative ratings.'.format(user_id, (train_ratings==1).sum(), (train_ratings==0).sum()), '\n')
    
    # get movie and rating for hold out user test data
    hold_out_movie = test_ratings_data[test_ratings_data['userId']==user_id]['imdb_id']
    hold_out_rating = test_ratings_data[test_ratings_data['userId']==user_id]['rating']
    #print('User {} rated {} movie in the hold-out set.'.format(user_id, len(hold_out_movie)))
    #print('User {} gave a {} rating for the hold-out movie.'.format(user_id, int(hold_out_rating)), '\n')
    
    # complement test data with other 99 randomly selected movies
    allowed_list = movies_data.loc[~movies_data.index.isin(train_movies.append(hold_out_movie))]
    rd_movies = allowed_list.sample(n=99, replace=False).index.to_series()
    test_movies = hold_out_movie.append(rd_movies)
    
    # extract X_train and X_test matrices
    X_train = word_vector[[movies_data.index.get_loc(x) for x in train_movies], :]
    #print('X_train shape:', X_train.shape)
    X_test = word_vector[[movies_data.index.get_loc(x) for x in test_movies], :]
    #print('X_test shape:', X_test.shape)
    
    # extrac y_train vector
    y_train = train_ratings.values
    #print('y_train shape:', y_train.shape, '\n')
    
    # fit training data
    clf.fit(X_train, y_train)
    # compute probabilities for each class
    proba = clf.predict_proba(X_test)
    # compute the ranking for class==test_ratings
    ranking = np.argsort(proba, axis=0)[:,clf.classes_[clf.classes_==int(hold_out_rating)]]
    # apply a positive hit if test example ranked on top-10
    hit_score = ranking[0] > 89
    
    return hit_score

In [298]:
def get_hit_rate(user_list, clf=knn, train_ratings_data=dev_train, test_ratings_data=dev_test, movies_data=movies_df, word_vector=word_vector):

    '''Compute hit rate across diferent users'''
    
    hit_list = []
    for user_id in user_list:
        train_ratings = train_ratings_data[train_ratings_data['userId']==user_id]['rating']
        #print('User {} rated {} movies in training set.'.format(user_id, len(train_ratings)))
        #print('User {} gave {} positive ratings and {} negative ratings.'.format(user_id, (train_ratings==1).sum(), (train_ratings==0).sum()), '\n')
        if train_ratings.sum()==0 or train_ratings.sum()==len(train_ratings):
            continue
        else:
            hit_score = run_clf(user_id, clf=clf, train_ratings_data=train_ratings_data, test_ratings_data=test_ratings_data, movies_data=movies_data, word_vector=word_vector)
            hit_list.append(bool(hit_score))
    return sum(hit_list) / len(hit_list)

#### K-Nearest Neighbors

In [299]:
user_list = dev_train['userId'].unique()
knn = KNeighborsClassifier(n_neighbors=5)
get_hit_rate(user_list, clf=knn)

0.12073806658644204