In [36]:
import numpy as np
import pandas as pd
import string
import kmodes
from sklearn.cluster import KMeans
from scipy import stats

In [37]:
tags_table = pd.read_csv('../data/ml-latest-small/tags.csv', index_col=['userId', 'movieId'])
movies_table = pd.read_csv('../data/ml-latest-small/movies.csv', index_col='movieId')
ratings_table = pd.read_csv('../data/ml-latest-small/ratings.csv', index_col=['userId', 'movieId'])

tag_table = tags_table.drop('timestamp', axis='columns')
rating_table = ratings_table.drop('timestamp', axis='columns')

# Data Cleaning and Exploration

Here we simply lower the case of the tags so that tags like 'funny' and 'Funny' are considered the same. We also transform the tags into a matrix of binary values instead of a string. This is probably too large and sparse to work with directly. We also calculate the mean rating for each movie and find the difference between the user's rating and the mean. This difference could maybe be used to find like minded users in the case that they both like/dislike a movie that the general population of users disliked/liked. 

In [38]:
# noticed that some tags are upper case and some are lower
tags_table.tag = tags_table.tag.apply(lambda x: x.lower())

# transform tags into binary features
tags_matrix = tag_table
tags_matrix = tags_matrix.pivot_table(index=['userId', 'movieId'], columns='tag', aggfunc=len, fill_value=0)
tags_matrix

Unnamed: 0_level_0,tag,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2,60756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,89774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,106782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,48516,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,7382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,7936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
610,3265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# turn ratings into a mean rating for each move id
mean_ratings = rating_table.reset_index()
mean_ratings = mean_ratings[['movieId', 'rating']].groupby('movieId').mean()

# get the difference between the user's rating and the average rating for a movie

ratings_diff = rating_table.join(mean_ratings, on='movieId', rsuffix='_mean')
ratings_diff['rating_diff'] = ratings_diff['rating'] - ratings_diff['rating_mean']
ratings_diff = ratings_diff.drop('rating_mean', axis='columns')
ratings_diff

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,rating_diff
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,0.079070
1,3,4.0,0.740385
1,6,4.0,0.053922
1,47,5.0,1.024631
1,50,5.0,0.762255
...,...,...,...
610,166534,4.0,0.666667
610,168248,5.0,0.857143
610,168250,5.0,1.366667
610,168252,5.0,0.720000


# Problems

Here we create a table that contains data pertaining to both ratings and tags. This table however is filled with missing data, because users either don't tag movies they rate or don't rate movies they tag. This presents a problem when trying to use this data in a k-means model. 

In [40]:
# this now contains data if a user either rated a movie or added one or more tags to that movie
rating_tag_table = ratings_diff.merge(tags_matrix, on=['userId', 'movieId'], how='outer')

In [41]:
rating_diff_table = pd.DataFrame(rating_tag_table['rating_diff'])
uid_mid_diff_table = rating_diff_table.unstack(-1)
uid_mid_diff_table.columns = uid_mid_diff_table.columns.map(lambda x: x[1])
uid_mid_diff_table

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.07907,,0.740385,,,0.053922,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.07907,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.42093,,,,,,-0.685185,,,,...,,,,,,,,,,
607,0.07907,,,,,,,,,,...,,,,,,,,,,
608,-1.42093,-1.431818,-1.259615,,,,,,,0.503788,...,,,,,,,,,,
609,-0.92093,,,,,,,,,0.503788,...,,,,,,,,,,


# Creating the model

We went with a simple model that simply used userIds, movieIds, and ratings as features. The number of clusters is just randomly chosen, and the model is given an initial state in order to keep the results deterministic. A better way of determining it would be to use the Elbow method to find a point where the variance between points and their centers isn't decreasing as quickly. I couldn't come up with a good measure of variance here. 

In [42]:
flat_rating_table = rating_table.reset_index()

In [43]:
kmeans = KMeans(n_clusters=20, random_state=0)

clusters = kmeans.fit(flat_rating_table.values)

# Final Predictor

The final prediction function takes as input a user. It first gets all points of data related to the user. Then the data is given to the model and an array is returned that gives each data point a cluster number. Then the mode of this array is taken as the cluster that the user most belongs to. The rating data is then scraped for all movies that belong to that cluster and have a rating score above 4. That list of movies is then compared to all of the movies that the user has rated and any ones that haven't been rated are added to the recommendation list. 

In [44]:
# copy the table used as input to the clustering method and add the returned labels to it
frtcp = flat_rating_table.copy()
cluster_labels = pd.Series(clusters.labels_, name='cluster_labels')
frtcp['cluster_labels'] = cluster_labels

def provide_recommendations(userId):
    user_data = flat_rating_table[flat_rating_table['userId'] == userId]
    
    # each data point from the user is predicted to be in a cluster
    preds = clusters.predict(user_data.values)
    
    # use the mode of the possible clusters to decide where to box the user into
    predicted_cluster = stats.mode(preds)[0][0]

    # gather the data points within the chosen cluster
    matching_points = frtcp[frtcp['cluster_labels'] == predicted_cluster]
    
    # only choose very highly rated movies
    highrated_points = matching_points[matching_points['rating'] > 4]
    
    movies = highrated_points['movieId'].unique()
    user_movies = user_data['movieId'].unique()
    
    recs = []
    for m in movies:
        if m not in user_movies:
            recs.append(movies_table['title'][m])
    
    return recs

In [45]:
provide_recommendations(7)

['M*A*S*H (a.k.a. MASH) (1970)',
 'Hangar 18 (1980)',
 'Galaxy of Terror (Quest) (1981)',
 'Looker (1981)',
 'Android (1982)',
 'Alien Contamination (1980)',
 'Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)',
 'Death Race 2000 (1975)',
 "No Man's Land (2001)",
 'Austin Powers in Goldmember (2002)',
 'Adaptation (2002)',
 'Chasing Liberty (2004)',
 'Troy (2004)',
 'Notebook, The (2004)',
 'First Daughter (2004)',
 'Love Actually (2003)',
 'City of God (Cidade de Deus) (2002)',
 'Laputa: Castle in the Sky (Tenkû no shiro Rapyuta) (1986)',
 'Nausicaä of the Valley of the Wind (Kaze no tani no Naushika) (1984)',
 'Witness for the Prosecution (1957)',
 'Sleuth (1972)',
 'Pianist, The (2002)',
 'Flickering Lights (Blinkende lygter) (2000)',
 'Barton Fink (1991)',
 "Monty Python's The Meaning of Life (1983)",
 'Big Fish (2003)',
 'The Butterfly Effect (2004)',
 'Lammbock (2001)',
 'Gosford Park (2001)',
 'Chorus Line, A (1985)',
 'Die Another Day (2002)',
 'You Only 

# Closing thoughts

Overall the model is probably really bad. Something feels wrong about using the user and movie ids as inputs to k-means since they're categorical data, but I can't put my finger on it. Another thing that bothers me is that the model uses userId when, in reality, we should only need an array of movies and ratings that a theoretical user is giving. Maybe an improvement would be to only pass the movieId and the rating. 

In [46]:
# final little experiment 
flat_rating_table2 = rating_table.reset_index()[['movieId', 'rating']]

kmeans2 = KMeans(n_clusters=20, random_state=0)
clusters2 = kmeans2.fit(flat_rating_table2.values)

frtcp2 = flat_rating_table2.copy()
cluster_labels2 = pd.Series(clusters2.labels_, name='cluster_labels')
frtcp2['cluster_labels'] = cluster_labels2

def provide_recommendations2(mar):   
    preds = clusters2.predict(mar)
    
    # use the mode of the possible clusters to decide where to box the user into
    predicted_cluster = stats.mode(preds)[0][0]

    # gather the data points within the chosen cluster
    matching_points = frtcp[frtcp['cluster_labels'] == predicted_cluster]
    
    # only choose very highly rated movies
    highrated_points = matching_points[matching_points['rating'] > 4]
    
    movies = highrated_points['movieId'].unique()
    user_movies = mar['movieId'].unique()
    
    recs = []
    for m in movies:
        if m not in user_movies:
            recs.append(movies_table['title'][m])
    
    return recs



In [47]:
user7 = flat_rating_table[flat_rating_table['userId'] == 7]
user7_movie_rating = user7[['movieId', 'rating']]
user7_movie_rating

recs = provide_recommendations2(user7_movie_rating)
recs

['Rogue One: A Star Wars Story (2016)',
 'Logan (2017)',
 'Split (2017)',
 'John Wick: Chapter Two (2017)',
 'The Godfather Trilogy: 1972-1990 (1992)',
 'Black Mirror: White Christmas (2014)',
 'Storks (2016)',
 'Maximum Ride (2016)',
 "A Dog's Purpose (2017)",
 'Dana Carvey: Straight White Male, 60 (2016)',
 'Arrival (2016)',
 'Get Out (2017)',
 'Winnie the Pooh Goes Visiting (1971)',
 'Your Name. (2016)',
 'Winnie the Pooh and the Day of Concern (1972)',
 'Hacksaw Ridge (2016)',
 'Wings, Legs and Tails (1986)',
 'Alesha Popovich and Tugarin the Dragon (2004)',
 'Junior and Karlson (1968)',
 'A Plasticine Crow (1981)',
 'Band of Brothers (2001)',
 'There Once Was a Dog (1982)',
 'Planet Earth II (2016)',
 'Death Note: Desu nôto (2006–2007)',
 "Last Year's Snow Was Falling (1983)",
 'Investigation Held by Kolobki (1986)',
 'Karlson Returns (1970)',
 'Vacations in Prostokvashino (1980)',
 'Winter in Prostokvashino (1984)',
 'Priklyucheniya Kapitana Vrungelya (1979)',
 'Vovka in the King