## Data loading and cleaning

In [1]:
import pandas as pd
import numpy as np
import re
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import random
from collections import Counter

In [4]:
# Load the raw movie metadata and the (small) user ratings table.
raw_meta = pd.read_csv('resource/movies_metadata.csv')
raw_rt_sml = pd.read_csv('resource/ratings_small.csv')

# Drop every metadata row whose id occurs more than once (all copies, not
# just the extras). `duplicated(keep=False)` flags every member of a
# duplicate group and, unlike `pd.concat` over a generator of groups, does
# not raise when the file happens to contain no duplicates. Missing ids are
# ignored here, as they were by the original groupby.
dup_mask = raw_meta['id'].notna() & raw_meta['id'].duplicated(keep=False)
dpl_list = raw_meta.loc[dup_mask, 'id'].unique().tolist()
raw_meta = raw_meta.loc[~raw_meta['id'].isin(dpl_list)]

# Keep only the columns of interest and the rows where all of them are
# present. Chaining `dropna` builds a new frame instead of mutating a slice
# of `raw_meta` in place.
meta_df = raw_meta[
    ['id', 'overview', 'runtime', 'title', 'release_date', 'genres', 'vote_average']
].dropna(how='any')

# id -> title table; `.copy()` makes the dtype cast act on an independent
# frame rather than on a slice of `meta_df`.
movieid_title = meta_df[['id', 'title']].copy()
movieid_title['id'] = movieid_title['id'].astype(int)

# Keep only movies that can find names by id, i.e. ids present in both the
# metadata and the ratings.
# NOTE(review): if these files come from Kaggle's "The Movies Dataset", the
# metadata `id` and the ratings `movieId` may be different id systems --
# confirm that matching them directly is intended.
inter_movies_sml = list(
    set(movieid_title['id'].unique().tolist()).intersection(
        set(raw_rt_sml['movieId'].unique().tolist())
    )
)
movieid_title = movieid_title.loc[movieid_title['id'].isin(inter_movies_sml)]
# movie id -> movie name lookup
movie_lookup_dict = dict(zip(movieid_title['id'].tolist(), movieid_title['title'].tolist()))

# Ratings restricted to the movies we can name. `.copy()` so the column
# assignments below write to this frame and not to a view of `raw_rt_sml`.
rt_df_sml = raw_rt_sml.loc[raw_rt_sml['movieId'].isin(inter_movies_sml)].copy()
# Binarise the rating: above 3 counts as a like (10), 3 or below as a dislike (-10).
rt_df_sml.loc[rt_df_sml['rating'] > 3, 'like'] = 10
rt_df_sml.loc[rt_df_sml['rating'] <= 3, 'like'] = -10
# Prefix user ids with 'U' so they are strings.
rt_df_sml['userId'] = 'U' + rt_df_sml['userId'].astype(str)

# Convert to a user x movie pivot table; movies a user has not rated become 0.
user_movie_pivot = rt_df_sml.pivot_table(index='userId', columns='movieId', values='like').fillna(0)
# Convert the pivot table to a scipy sparse matrix.
user_movie_features = csr_matrix(user_movie_pivot.values)
# Build and fit the nearest-neighbour model on the user rows.
model_nn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=7, n_jobs=-1)
model_nn.fit(user_movie_features)
# Movie ids in pivot-column order; a position in this list is a column index.
movie_list = user_movie_pivot.columns.tolist()

## Function to generate 10 random movies for the user to rate (like / dislike / don't know)

In [5]:
# Returns the names and the ids of randomly chosen movies. Keep the result in
# a variable: the ids are needed later by the recommending function.
def provide_movies_to_user(num):
    """Draw `num` distinct random movies for the user to rate.

    Parameters
    ----------
    num : int
        How many movies to draw from the global `movie_list`.

    Returns
    -------
    tuple
        `(names, ids)` -- two parallel lists, where `names[i]` is the title
        that the global `movie_lookup_dict` holds for `ids[i]`.
    """
    sampled_ids = random.sample(movie_list, num)
    sampled_names = []
    for movie_id in sampled_ids:
        sampled_names.append(movie_lookup_dict[movie_id])
    return sampled_names, sampled_ids

## Function to take in user feedback and recommend accordingly

In [6]:
# Helper function 1 -- names of the movies the user has already seen, so they
# can be left out of the recommendation list
def exclude_seen_movies(u_select):
    """Return the titles of every movie the user gave non-zero feedback on.

    Parameters
    ----------
    u_select : iterable of (movie_id, rating) pairs
        The user's feedback; a rating of 0 means the movie was not rated.

    Returns
    -------
    list
        Titles (looked up in the global `movie_lookup_dict`) of the movies
        whose rating is not 0, in input order.
    """
    seen_titles = []
    for movie_id, rating in u_select:
        if rating != 0:
            seen_titles.append(movie_lookup_dict[movie_id])
    return seen_titles

In [7]:
# Helper function 2 -- to get users with similar preferences for the rated movies
def get_similar_users(u_select, n_users=3):
    """Find the users whose rating rows are closest to the given feedback.

    Parameters
    ----------
    u_select : iterable of (movie_id, rating) pairs
        The user's feedback. A positive rating is encoded as 10, a negative
        one as -10, and 0 leaves the movie's slot at 0. Every movie id with a
        non-zero rating must be present in the global `movie_list`.
    n_users : int, optional
        How many neighbours to ask `model_nn` for (default 3).

    Returns
    -------
    list
        Labels from `user_movie_pivot.index` for the neighbours returned by
        `model_nn.kneighbors`, in the order the model returned them.

    Raises
    ------
    ValueError
        If a rated movie id is not in `movie_list`.
    """
    # Split the feedback into liked / disliked movie ids.
    liked_ids = [m for m, r in u_select if r > 0]
    disliked_ids = [m for m, r in u_select if r < 0]

    # Build a single query row with one slot per movie column. The width is
    # taken from `movie_list` rather than hard-coded, so it always matches
    # the matrix the model was fitted on.
    user_fb_arr = np.zeros((1, len(movie_list)))
    for movie_id in liked_ids:
        user_fb_arr[0, movie_list.index(movie_id)] = 10
    # Dislikes are written second, so they win if an id appears in both lists.
    for movie_id in disliked_ids:
        user_fb_arr[0, movie_list.index(movie_id)] = -10

    # Find users who have similar preferences for these movies.
    neigh_idx = model_nn.kneighbors(user_fb_arr, n_users, return_distance=False)
    # `neigh_idx` has one row per query; take that row *before* indexing,
    # because a pandas Index does not accept a 2-D indexer in recent versions.
    return user_movie_pivot.index[neigh_idx[0]].tolist()

In [8]:
# Helper function 3 -- to get the movies liked by one similar user
def get_sim_user_like(sim_user_idx):
    """Return the titles of the movies that one user liked.

    Parameters
    ----------
    sim_user_idx : label
        A row label of the global `user_movie_pivot`.

    Returns
    -------
    list
        Titles of the movies whose value in that user's row equals 10, in
        column order. An empty list (after printing a notice) when the user
        has no such movie, so callers can always iterate over the result.
    """
    # One row of the pivot table, flattened to a plain list of scores whose
    # positions line up with `movie_list`.
    user_row = user_movie_pivot.loc[[sim_user_idx]].values.flatten().tolist()
    liked_names = [
        movie_lookup_dict[movie_list[pos]]
        for pos, score in enumerate(user_row)
        if score == 10
    ]
    if not liked_names:
        print('no similar user found')
    return liked_names


In [10]:
# Recommendation function
def recommend_to_user(u_fb, shown_ids=None, top_n=5):
    """Recommend movies based on the user's feedback on the movies shown.

    Parameters
    ----------
    u_fb : sequence of numbers
        Feedback values, positionally matched to `shown_ids`. `zip` stops at
        the shorter of the two, so movies without a feedback value are
        ignored.
    shown_ids : sequence, optional
        Ids of the movies the feedback refers to. When omitted, the ids
        stored in the global `provide[1]` are used.
    top_n : int, optional
        Maximum number of titles to return (default 5).

    Returns
    -------
    list
        Up to `top_n` titles, most frequently liked among the similar users
        first, excluding movies the user gave non-zero feedback on.
    """
    if shown_ids is None:
        shown_ids = provide[1]

    feedback_list = list(zip(shown_ids, u_fb))
    sim_user_ids = get_similar_users(feedback_list)
    # A set makes the per-title exclusion check constant-time.
    movie_exclude = set(exclude_seen_movies(feedback_list))

    counter = Counter()
    for user_id in sim_user_ids:
        # Tolerate a helper that yields nothing for a user with no likes.
        liked = get_sim_user_like(user_id) or []
        counter.update(title for title in liked if title not in movie_exclude)

    return [title for title, _ in counter.most_common(top_n)]

### Examples of how to call these functions

In [11]:
# Draw the 10 starter movies. The (names, ids) pair is kept in the global
# variable `provide` so the ids are available when recommending later;
# only the names are displayed here.
provide = provide_movies_to_user(num=10)
provide[0]

['Within the Woods',
 'Persepolis',
 'The Pebble and the Penguin',
 'Dog Eat Dog',
 'Fighting Elegy',
 'Shopgirl',
 'Children of the Corn IV: The Gathering',
 'The Blue Angel',
 'St. Louis Blues',
 'Monsters, Inc.']

In [12]:
# Upon receiving user feedback, make recommendations. Pass one value per movie
# shown above, in the same order: positive = like, negative = dislike,
# 0 = don't know. The last two 0s are spelled out so all 10 movies have an
# explicit value; omitting them gives the same result because missing
# trailing values are simply ignored.
recommend_to_user([10,0,-10,10,0,10,0,0,0,0])

['Solaris',
 'Rebecca',
 'Young and Innocent',
 'The Searchers',
 'Sleepless in Seattle']