### Step 1: Import packages

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import *
from sklearn.metrics.pairwise import linear_kernel

### Step 2: Define working directories

In [2]:
path_raw_data = 'C:/users/lbros/documents/mids/w207/final_project/raw_data/'
path_clean_data = 'C:/users/lbros/documents/mids/w207/final_project/clean_data/'

### Step 3: Load clean data

#### Split ratings

In [3]:
# load ratings dataframes
dev_train = pd.read_csv(path_clean_data + 'dev_train.csv')
dev_test = pd.read_csv(path_clean_data + 'dev_test.csv')
test_train = pd.read_csv(path_clean_data + 'test_train.csv')
test_test = pd.read_csv(path_clean_data + 'test_test.csv')

In [4]:
# drop 'Unnamed 0' column
dev_train.drop('Unnamed: 0', axis=1, inplace=True)
dev_test.drop('Unnamed: 0', axis=1, inplace=True)
test_train.drop('Unnamed: 0', axis=1, inplace=True)
test_test.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
# print dataframes shapes
print(dev_train.shape)
print(dev_test.shape)
print(test_train.shape)
print(test_test.shape)

(834600, 3)
(5000, 3)
(17948251, 3)
(107150, 3)


In [6]:
# print dataframes columns
print(dev_train.columns)
print(dev_test.columns)
print(test_train.columns)
print(test_test.columns)

Index(['userId', 'imdb_id', 'rating'], dtype='object')
Index(['userId', 'imdb_id', 'rating'], dtype='object')
Index(['userId', 'imdb_id', 'rating'], dtype='object')
Index(['userId', 'imdb_id', 'rating'], dtype='object')


#### Movies

In [30]:
# load movies dataframe
movies_df = pd.read_csv(path_clean_data + 'movies_final.csv')

In [32]:
# print remaining duplicated movies (if any)
print('Number of duplicated imdb_id: ', movies_df[movies_df['imdb_id'].duplicated()].shape[0])

Number of duplicated imdb_id:  546


In [33]:
# exclude remaining imdb_id duplicates
movies_df = movies_df[~movies_df['imdb_id'].duplicated(keep='last')]

In [34]:
# set imdb_id as index
movies_df.set_index('imdb_id', verify_integrity=True, inplace=True)

In [35]:
# filter usable columns
movies_df = movies_df.iloc[:,3:]

In [36]:
# print dataframe shape
movies_df.shape

(29399, 176)

In [37]:
# print dataframe columns
movies_df.columns

Index(['belongs_to_collection', 'budget', 'originally_english', 'overview',
       'popularity', 'production_companies', 'production_countries', 'revenue',
       'runtime', 'tagline',
       ...
       'zu', 'canceled', 'in-production', 'planned', 'post-production',
       'released', 'rumored', 'cast_names', 'crew_names', 'description'],
      dtype='object', length=176)

### Step 4: Vectorize text fields

#### Start with movie description as a pilot

In [113]:
# initiatize TfidVectorizer
vectorizer = TfidfVectorizer()
# fit_transform movie description
word_vector = vectorizer.fit_transform(movies_df['description'])
# print number of features
print('Each movie was converted into {} word tokens.'.format(len(vectorizer.get_feature_names())))

Each movie was converted into 9506 word tokens.


### Step 5: Test and tune hyperparameters for different classifiers

#### Define auxiliary functions

In [114]:
def run_clf(user_id, clf, train_ratings_data=dev_train, test_ratings_data=dev_test, movies_data=movies_df, word_vector=word_vector):
    
    '''Run a classifier for a single user and return the score'''
    
    # get movies and ratings from user training data
    train_movies = train_ratings_data[train_ratings_data['userId']==user_id]['imdb_id']
    train_ratings = train_ratings_data[train_ratings_data['userId']==user_id]['rating']
    #print('User {} rated {} movies in training set.'.format(user_id, len(train_movies)))
    #print('User {} gave {} positive ratings and {} negative ratings.'.format(user_id, (train_ratings==1).sum(), (train_ratings==0).sum()), '\n')
    
    # get movie and rating for hold out user test data
    hold_out_movie = test_ratings_data[test_ratings_data['userId']==user_id]['imdb_id']
    hold_out_rating = test_ratings_data[test_ratings_data['userId']==user_id]['rating']
    #print('User {} rated {} movie in the hold-out set.'.format(user_id, len(hold_out_movie)))
    #print('User {} gave a {} rating for the hold-out movie.'.format(user_id, int(hold_out_rating)), '\n')
    
    # complement test data with other 99 randomly selected movies
    allowed_list = movies_data.loc[~movies_data.index.isin(train_movies.append(hold_out_movie))]
    rd_movies = allowed_list.sample(n=99, replace=False).index.to_series()
    test_movies = hold_out_movie.append(rd_movies)
    
    # extract X_train and X_test matrices
    X_train = word_vector[[movies_data.index.get_loc(x) for x in train_movies], :]
    #print('X_train shape:', X_train.shape)
    X_test = word_vector[[movies_data.index.get_loc(x) for x in test_movies], :]
    #print('X_test shape:', X_test.shape)
    
    # extrac y_train vector
    y_train = train_ratings.values
    #print('y_train shape:', y_train.shape, '\n')
    
    # fit training data
    clf.fit(X_train, y_train)
    # compute probabilities for each class
    proba = clf.predict_proba(X_test)
    # compute the ranking for class==test_ratings
    ranking = np.argsort(proba, axis=0)[:,clf.classes_[clf.classes_==int(hold_out_rating)]]
    # apply a positive hit if test example ranked on top-10 in descending order
    score = ranking[0] > 89
    
    return score

In [115]:
def get_hit_rate(user_list, clf, train_ratings_data=dev_train, test_ratings_data=dev_test, movies_data=movies_df, word_vector=word_vector):

    '''Compute hit rate across diferent users'''
    
    hit_list = []
    for user_id in user_list:
        score = run_clf(user_id, clf=clf, train_ratings_data=train_ratings_data, test_ratings_data=test_ratings_data, 
                            movies_data=movies_data, word_vector=word_vector)
        hit_list.append(bool(score))
    return sum(hit_list) / len(hit_list)

In [116]:
def get_hit_rate_by_rating(user_list, clf, train_ratings_data=dev_train, test_ratings_data=dev_test, movies_data=movies_df, word_vector=word_vector):
    
    '''Compute hit rate by user rating in the hold out movie'''
    
    # for different ratings in the hold out movie
    ratings = [0, 1]
    hit_rate_by_rating = []
    for r in ratings:
        # define the user_list as the subset of users who gave rate r in the hold out movie
        user_list = test_ratings_data[test_ratings_data['rating']==r]['userId']
        # compute the hit rate
        hit_rate = get_hit_rate(user_list=user_list, clf=clf, train_ratings_data=train_ratings_data, test_ratings_data=test_ratings_data, 
                                movies_data=movies_df, word_vector=word_vector)
        # append hit rate to the hit_rate_by_rating
        hit_rate_by_rating.append(hit_rate)
        # print results
        print('For rating=={} hit rate=={:.3f}'.format(r, hit_rate))
    
    return hit_rate_by_rating

In [117]:
def get_hit_rate_by_n_ratings(user_list, clf, train_ratings_data=dev_train, test_ratings_data=dev_test, movies_data=movies_df, word_vector=word_vector):
    
    '''Compute hit rate by number of ratings in the user training set'''
    
    # split users by number of ratings
    bins = [0, 50, 100, 150, 200, 20000]
    user_list_by_n_ratings = pd.cut(train_ratings_data.groupby('userId').count()['rating'], bins).reset_index('userId')
    intervals = user_list_by_n_ratings['rating'].unique()
    # for different intervals of number of ratings
    hit_rate_by_n_ratings = []
    for i in intervals:
        # define the user_list as the subset of users within the interval
        user_list = user_list_by_n_ratings[user_list_by_n_ratings['rating']==i]['userId']
        # compute the hit rate
        hit_rate = get_hit_rate(user_list=user_list, clf=clf, train_ratings_data=train_ratings_data, test_ratings_data=test_ratings_data, 
                                movies_data=movies_df, word_vector=word_vector)
        # append hit rate to the hit_rate_by_n_ratings
        hit_rate_by_n_ratings.append(hit_rate)
        # print results
        print('For interval=={} hit rate=={:.3f}'.format(i, hit_rate))
        
    return hit_rate_by_n_ratings

#### K-Nearest Neighbors

In [118]:
# define user_list | limiting to first 500 users to speed up simulations
user_list = dev_train['userId'].unique()[:500]

In [120]:
# define range for k (n_neighbors) parameter
param_range = [5, 10, 15, 20, 25]
# for different values of parameter k (n_beighbors)
hit_rate_list = []
for param in param_range:
    # initialize classifier
    knn = KNeighborsClassifier(n_neighbors=param)
    # compute the hit rate
    hit_rate = get_hit_rate(user_list, clf=knn)
    # append hit rate to the hit_rate_list
    hit_rate_list.append(hit_rate)
    # print results
    print('For k=={:2} hit rate=={:.3f}'.format(param, hit_rate))

For k== 5 hit rate==0.132
For k==10 hit rate==0.138
For k==15 hit rate==0.116
For k==20 hit rate==0.102
For k==25 hit rate==0.134


Parameter **n_neighbors (k) will be set as 10**

**Hit rate of 13.2%** slighly above 10% baseline of a random ranking

In [121]:
# evalute hit rate by user rating in the hold out movie
knn = KNeighborsClassifier(n_neighbors=10)
hit_rate_by_rating = get_hit_rate_by_rating(user_list, knn)

For rating==0 hit rate==0.118
For rating==1 hit rate==0.125


Hit rate **higher for positive scores** in the hold-out movie

In [122]:
# evaluate hit rate by number of ratings in the user training set
knn = KNeighborsClassifier(n_neighbors=10)
hit_rate_by_n_ratings = get_hit_rate_by_n_ratings(user_list, knn)

For interval==(0, 50] hit rate==0.128
For interval==(50, 100] hit rate==0.105
For interval==(200, 20000] hit rate==0.121
For interval==(100, 150] hit rate==0.091
For interval==(150, 200] hit rate==0.126


**Hit rate decreases** for users with **n_ratings above 50**

#### BernoulliNB

In [123]:
# define range for alpha parameter
param_range = [0.001, 0.01, 0.1, 1]
# for different values of parameter alpha
hit_rate_list = []
for param in param_range:
    # initialize classifier
    bnb = BernoulliNB(alpha=param)
    # compute the hit rate
    hit_rate = get_hit_rate(user_list, clf=bnb)
    # append hit rate to the hit_rate_list
    hit_rate_list.append(hit_rate)
    # print results
    print('For alpha=={:.4f} hit rate=={:.3f}'.format(param, hit_rate))

For alpha==0.0010 hit rate==0.096
For alpha==0.0100 hit rate==0.080
For alpha==0.1000 hit rate==0.102
For alpha==1.0000 hit rate==0.058


Parameter **alpha will be set as 0.1**

**Hit rate of 10.2%** equal to 10% baseline of a random ranking

In [131]:
# evalute hit rate by user rating in the hold out movie
bnb = BernoulliNB(alpha=0.1)
hit_rate_by_rating = get_hit_rate_by_rating(user_list, bnb)

For rating==0 hit rate==0.103
For rating==1 hit rate==0.099


**Hit rate similar** for positive and negative scores in the hold-out movie

In [132]:
# evaluate hit rate by number of ratings in the user training set
bnb = BernoulliNB(alpha=0.1)
hit_rate_by_n_ratings = get_hit_rate_by_n_ratings(user_list, bnb)

For interval==(0, 50] hit rate==0.089
For interval==(50, 100] hit rate==0.092
For interval==(200, 20000] hit rate==0.086
For interval==(100, 150] hit rate==0.099
For interval==(150, 200] hit rate==0.102


**Hit rate usually increases** as n_ratings increase, except for the more than 200 segment

#### Logistic Regression

In [124]:
# define range for C parameter
param_range = [0.001, 0.01, 0.1, 1]
# for different values of C parameter
hit_rate_list = []
for param in param_range:
    # initialize classifier
    lr = LogisticRegression(C=param)
    # compute the hit rate
    hit_rate = get_hit_rate(user_list, clf=lr)
    # append hit rate to the hit_rate_list
    hit_rate_list.append(hit_rate)
    # print results
    print('For C=={:.3f} hit rate=={:.3f}'.format(param, hit_rate))

For C==0.001 hit rate==0.122
For C==0.010 hit rate==0.116
For C==0.100 hit rate==0.128
For C==1.000 hit rate==0.124


Parameter **C will be set as 0.1**

**Hit rate of 12.8%** slightly better than 10% baseline of a random ranking

In [133]:
# evalute hit rate by user rating in the hold out movie
lr = LogisticRegression(C=0.1)
hit_rate_by_rating = get_hit_rate_by_rating(user_list, lr)

For rating==0 hit rate==0.131
For rating==1 hit rate==0.139


Hit rate **higher for positive scores** in the hold-out movie

In [134]:
# evaluate hit rate by number of ratings in the user training set
lr = LogisticRegression(C=0.1)
hit_rate_by_n_ratings = get_hit_rate_by_n_ratings(user_list, lr)

For interval==(0, 50] hit rate==0.111
For interval==(50, 100] hit rate==0.134
For interval==(200, 20000] hit rate==0.134
For interval==(100, 150] hit rate==0.144
For interval==(150, 200] hit rate==0.114


**Hit rate usually increases** as n_ratings increase

#### SVM

In [130]:
# define range for C parameter
param_range = [0.001, 0.01, 0.1, 1, 10]
# for different values of C parameter
hit_rate_list = []
for param in param_range:
    # initialize classifier
    svm = SVC(C=param, kernel='rbf', probability=True)
    # compute the hit rate
    hit_rate = get_hit_rate(user_list, clf=svm)
    # append hit rate to the hit_rate_list
    hit_rate_list.append(hit_rate)
    # print results
    print('For C=={:.3f} hit rate=={:.3f}'.format(param, hit_rate))

For C==0.001 hit rate==0.126
For C==0.010 hit rate==0.124
For C==0.100 hit rate==0.156
For C==1.000 hit rate==0.108
For C==10.000 hit rate==0.130


Parameter **alpha will be set as 0.1**

**Hit rate of 15.6%** slightly better than 10% baseline of a random ranking

In [135]:
# evalute hit rate by user rating in the hold out movie
svm = SVC(C=0.1, kernel='rbf', probability=True)
hit_rate_by_rating = get_hit_rate_by_rating(user_list, svm)

For rating==0 hit rate==0.149
For rating==1 hit rate==0.129


Hit rate **higher for negative scores** in the hold-out movie

In [136]:
# evaluate hit rate by number of ratings in the user training set
svm = SVC(C=0.1, kernel='rbf', probability=True)
hit_rate_by_n_ratings = get_hit_rate_by_n_ratings(user_list, svm)

For interval==(0, 50] hit rate==0.109
For interval==(50, 100] hit rate==0.134
For interval==(200, 20000] hit rate==0.142
For interval==(100, 150] hit rate==0.163
For interval==(150, 200] hit rate==0.143


Hit rate **higher** for n_ratings **larger than 50**

#### Random Forest

In [128]:
# define range for n_estimators parameter
param_range = [5, 10, 50, 100]
# for different values of n_estimators parameter
hit_rate_list = []
for param in param_range:
    # initialize classifier
    rfc = RandomForestClassifier(n_estimators=param, criterion='entropy')
    # compute the hit rate
    hit_rate = get_hit_rate(user_list, clf=rfc)
    # append hit rate to the hit_rate_list
    hit_rate_list.append(hit_rate)
    # print results
    print('For n_estimators=={:3} hit rate=={:.3f}'.format(param, hit_rate))

For C==  5 hit rate==0.122
For C== 10 hit rate==0.120
For C== 50 hit rate==0.138
For C==100 hit rate==0.120


Parameter **n_estimators will be set as 50**

**Hit rate of 13.8%** slightly better than 10% baseline of a random ranking

In [137]:
# evalute hit rate by user rating in the hold out movie
rfc = RandomForestClassifier(n_estimators=50, criterion='entropy')
hit_rate_by_rating = get_hit_rate_by_rating(user_list, rfc)

For rating==0 hit rate==0.112
For rating==1 hit rate==0.128


Hit rate **higher for positive scores** in the hold-out movie

In [138]:
# evaluate hit rate by number of ratings in the user training set
rfc = RandomForestClassifier(n_estimators=50, criterion='entropy')
hit_rate_by_n_ratings = get_hit_rate_by_n_ratings(user_list, rfc)

For interval==(0, 50] hit rate==0.128
For interval==(50, 100] hit rate==0.135
For interval==(200, 20000] hit rate==0.121
For interval==(100, 150] hit rate==0.146
For interval==(150, 200] hit rate==0.124


**Hit rate usually increases** as n_ratings increase