In [91]:
import pandas as pd
import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn

import pickle
from fuzzywuzzy import process

In [2]:
def watched_movies(userId):
    '''
    this function creates a user-item-matrix of the ratings_long table
    and returns a list of all watched movies by a user (userId)
    '''
    ratings = pd.pivot_table(ratings_long, values='rating', index='userId', columns='movieId')
    watched_movies = ratings.loc[userId].dropna().index
    return list(watched_movies)

def favorite_genres(userId):
    '''
    this function returns a users n favorite genres
    needs the movie table with one hot encoded genres
    '''
    watched_movies_list = watched_movies(userId)
    top_genres = movies.loc[watched_movies_list,'romance':].sum().sort_values(ascending=False).index[:3]
    return top_genres


def popularity_recommender(userId, k):
    '''
    this function checks, wether a user has seen one of the most popular movies
    if not, he gets k of the most popular movies recommended
    '''
    liked_items = watched_movies(userId)
    recommend = top_movies[~top_movies.index.isin(liked_items)][:k]
    return recommend

def popularity_genre_recommender(userId, k):
    '''
    this function checks, wether a user has seen one of the most popular movies
    if not, he gets k of the most popular movies recommended
    '''
    liked_items = watched_movies(userId)
    fav_genres = favorite_genres(userId)
    recommend = top_movies[~top_movies.index.isin(liked_items)]
    genre_filter = (recommend[fav_genres[0]]==1)|(recommend[fav_genres[2]]==1)|(recommend[fav_genres[0]]==1)
    return recommend[genre_filter].iloc[:k]

def create_neighborhood(userId):
    # like pivot_tablet but much more memory efficient 
    user_item = csr_matrix((ratings_long['rating'], (ratings_long['userId'], ratings_long['movieId'])))
    # initialize the unsupervised model
    model = NearestNeighbors(metric='cosine')
    # fit it to the user-item matrix
    model.fit(user_item)
    # find the neighborhood
    user_vec = user_item[userId,:]
    distances, user_ids = model.kneighbors(user_vec, n_neighbors=10)
    return user_ids
    

def neighbor_recommender(userId):
    neighbor_ids = create_neighborhood(userId)
    neighborhood = ratings_long.set_index('userId').loc[neighbor_ids[0]]
    recommendations = neighborhood.groupby('movieId')['rating'].sum().sort_values(ascending=False)
    item_filter = ~recommendations.index.isin(watched_movies(userId))
    recommendations = recommendations.loc[item_filter]
    return movies.loc[recommendations.head(10).index]



In [3]:
# read in ratings in a long table format
ratings_long = pd.read_csv('./data/ml-latest-small/ratings.csv')
print(ratings_long.shape)
ratings_long.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [21]:
ratings_by_user = ratings_long.set_index('userId')

In [25]:
ratings_by_user.loc[15]['movieId']

userId
15         1
15        44
15        47
15       158
15       172
       ...  
15    152081
15    158872
15    160980
15    166528
15    166635
Name: movieId, Length: 135, dtype: int64

In [4]:
# how many movies per user?
ratings_long['userId'].value_counts()

414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
406      20
595      20
569      20
431      20
442      20
Name: userId, Length: 610, dtype: int64

In [5]:
# how many movies has been rated?
ratings_long['movieId'].value_counts()

356       329
318       317
296       307
593       279
2571      278
         ... 
5986        1
100304      1
34800       1
83976       1
8196        1
Name: movieId, Length: 9724, dtype: int64

In [6]:
# calculate the mean rating of all movies
ratings_mean = ratings_long.groupby('movieId').mean()
ratings_mean.drop(columns=['userId','timestamp'], inplace=True)
ratings_mean.columns=['mean_rating']
ratings_mean.head()

Unnamed: 0_level_0,mean_rating
movieId,Unnamed: 1_level_1
1,3.92093
2,3.431818
3,3.259615
4,2.357143
5,3.071429


In [7]:
# calvulate the count of ratings per movie
ratings_count = ratings_long.groupby('movieId').count()
ratings_count.drop(columns=['userId','timestamp'], inplace=True)
ratings_count.columns = ['count']
ratings_count.head()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49


In [115]:
# read in movies
movies = pd.read_csv('./data/ml-latest-small/movies.csv', index_col=0)
print(movies.shape)
movies.head()

(9742, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [116]:
genres = pd.read_csv('./data/movies_genres.csv', index_col=0)
genres.shape

(9742, 21)

In [117]:
genres.loc[:,'romance':]

Unnamed: 0_level_0,romance,sci-fi,animation,film-noir,musical,adventure,thriller,horror,documentary,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
193583,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
193587,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [118]:
movies = pd.concat([movies, ratings_mean, ratings_count, genres.loc[:,'romance':]], axis=1)
movies.head()

Unnamed: 0,title,genres,mean_rating,count,romance,sci-fi,animation,film-noir,musical,adventure,...,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0,0,0,1,0,0,1,...,1,0,1,1,0,0,0,0,0,0
2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110.0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52.0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7.0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
5,Father of the Bride Part II (1995),Comedy,3.071429,49.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [119]:
print(movies.shape)
print(movies.isna().sum())

(9742, 23)
title           0
genres          0
mean_rating    18
count          18
romance         0
sci-fi          0
animation       0
film-noir       0
musical         0
adventure       0
thriller        0
horror          0
documentary     0
fantasy         0
mystery         0
children        0
comedy          0
crime           0
western         0
imax            0
war             0
drama           0
action          0
dtype: int64


In [18]:
top_movies = movies[(movies['mean_rating']>3)&(movies['count']>100)]
top_movies.sort_values(by=['mean_rating'], ascending=False)
top_movies

Unnamed: 0,title,genres,mean_rating,count,title.1,year,romance,sci-fi,animation,film-noir,...,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,215.0,Toy Story,1995.0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0
2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110.0,Jumanji,1995.0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
6,Heat (1995),Action|Crime|Thriller,3.946078,102.0,Heat,1995.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
10,GoldenEye (1995),Action|Adventure|Thriller,3.496212,132.0,GoldenEye,1995.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,3.983051,177.0,Twelve Monkeys (a.k.a. 12 Monkeys),1995.0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48516,"Departed, The (2006)",Crime|Drama|Thriller,4.252336,107.0,"Departed, The",2006.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.238255,149.0,"Dark Knight, The",2008.0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,1
60069,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi,4.057692,104.0,WALL·E,2008.0,1,1,1,0,...,0,0,1,0,0,0,0,0,0,0
68954,Up (2009),Adventure|Animation|Children|Drama,4.004762,105.0,Up,2009.0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [13]:
#creating a user-movie-matrix
ratings = pd.pivot_table(ratings_long, values='rating', index='userId', columns='movieId')
ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [15]:
favorite_genres(15)

Index(['action', 'sci-fi', 'adventure'], dtype='object')

In [16]:
popularity_genre_recommender(15,5)

Unnamed: 0,title,genres,mean_rating,count,title.1,year,romance,sci-fi,animation,film-noir,...,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
110,Braveheart (1995),Action|Drama|War,4.031646,237.0,Braveheart,1995.0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
541,Blade Runner (1982),Action|Sci-Fi|Thriller,4.100806,124.0,Blade Runner,1982.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.161765,136.0,Monty Python and the Holy Grail,1975.0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,4.232394,142.0,"Princess Bride, The",1987.0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
1208,Apocalypse Now (1979),Action|Drama|War,4.219626,107.0,Apocalypse Now,1979.0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


In [17]:
popularity_recommender(15,5)

Unnamed: 0,title,genres,mean_rating,count,title.1,year,romance,sci-fi,animation,film-noir,...,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.237745,204.0,"Usual Suspects, The",1995.0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
110,Braveheart (1995),Action|Drama|War,4.031646,237.0,Braveheart,1995.0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
111,Taxi Driver (1976),Crime|Drama|Thriller,4.105769,104.0,Taxi Driver,1976.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
541,Blade Runner (1982),Action|Sci-Fi|Thriller,4.100806,124.0,Blade Runner,1982.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279.0,"Silence of the Lambs, The",1991.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [77]:
neighbor_recommender(15)

Unnamed: 0_level_0,title,genres,mean_rating,count,title,year,romance,sci-fi,animation,film-noir,...,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,4.118919,185.0,"Lord of the Rings: The Return of the King, The",2003.0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
48516,"Departed, The (2006)",Crime|Drama|Thriller,4.252336,107.0,"Departed, The",2006.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
44191,V for Vendetta (2006),Action|Sci-Fi|Thriller|IMAX,3.885,100.0,V for Vendetta,2006.0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279.0,"Silence of the Lambs, The",1991.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
110,Braveheart (1995),Action|Drama|War,4.031646,237.0,Braveheart,1995.0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
33794,Batman Begins (2005),Action|Crime|IMAX,3.862069,116.0,Batman Begins,2005.0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
1291,Indiana Jones and the Last Crusade (1989),Action|Adventure,4.046429,140.0,Indiana Jones and the Last Crusade,1989.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1258,"Shining, The (1980)",Horror,4.082569,109.0,"Shining, The",1980.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4011,Snatch (2000),Comedy|Crime|Thriller,4.155914,93.0,Snatch,2000.0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75,238.0,Jurassic Park,1993.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [81]:
top_movies.sort_values('mean_rating', ascending=False).head(20)

Unnamed: 0,title,genres,mean_rating,count,title.1,year,romance,sci-fi,animation,film-noir,...,fantasy,mystery,children,comedy,crime,western,imax,war,drama,action
318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317.0,"Shawshank Redemption, The",1994.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
858,"Godfather, The (1972)",Crime|Drama,4.289062,192.0,"Godfather, The",1972.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2959,Fight Club (1999),Action|Crime|Drama|Thriller,4.272936,218.0,Fight Club,1999.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
1221,"Godfather: Part II, The (1974)",Crime|Drama,4.25969,129.0,"Godfather: Part II, The",1974.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
48516,"Departed, The (2006)",Crime|Drama|Thriller,4.252336,107.0,"Departed, The",2006.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1213,Goodfellas (1990),Crime|Drama,4.25,126.0,Goodfellas,1990.0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,4.238255,149.0,"Dark Knight, The",2008.0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,1
50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.237745,204.0,"Usual Suspects, The",1995.0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,4.232394,142.0,"Princess Bride, The",1987.0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251.0,Star Wars: Episode IV - A New Hope,1977.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [82]:
genres = ['romance',
       'sci-fi', 'animation', 'film-noir', 'musical', 'adventure', 'thriller',
       'horror', 'documentary', 'fantasy', 'mystery', 'children', 'comedy',
       'crime', 'western', 'imax', 'war', 'drama', 'action']

Index(['title', 'genres', 'mean_rating', 'count', 'title', 'year', 'romance',
       'sci-fi', 'animation', 'film-noir', 'musical', 'adventure', 'thriller',
       'horror', 'documentary', 'fantasy', 'mystery', 'children', 'comedy',
       'crime', 'western', 'imax', 'war', 'drama', 'action'],
      dtype='object')

In [137]:
# this is the input to the backend

liked_movies = {
    'iron man': 5,
    'titanic': 5,
    'star wars': 5
}

In [164]:
liked_movies

{'titanic': [5, 1721], 'star wars': [5, 260], 'test': [5, 59315]}

In [162]:
liked_movies['test'] = liked_movies.pop('iron man')

In [138]:
# find the movie ids for the search strings above (with Fuzzywuzzy)
liked_movie_ids = []
for key in liked_movies.keys():
    print(key)
    match = process.extractBests(key, movies['title'])
    print(match[0][2])
    liked_movie_ids.append(matches[0][2])
    liked_movies[key] = [liked_movies[key], match[0][2]]

iron man
59315
titanic
1721
star wars
260


In [148]:
# create a new user vector
new_user_vector = np.repeat(0, 193610)
for value in liked_movies.values():
    new_user_vector[value[1]] = value[0]
#liked_movie_ids
print(new_user_vector)

# load in the fitted neighborhood model
model = pickle.load(open('./models/NN_cosine.sav', 'rb'))

# find neighbors for the new user vector
neighbors = model.kneighbors([new_user_vector], n_neighbors=20)

# calculate the (weighted) average rating


# recommend some movies


# put everything into a function, test it and move it into the flask project

[0 0 0 ... 0 0 0]


(array([[0.7361203 , 0.76690547, 0.78865901, 0.79110681, 0.79227435,
         0.79524291, 0.81849664, 0.82592234, 0.82592234, 0.82746459,
         0.83048412, 0.83432472, 0.83816128, 0.83931488, 0.84033516,
         0.84321319, 0.84612539, 0.84978956, 0.85032891, 0.8504128 ]]),
 array([[407, 515,  25,  86,  30, 128, 557, 120, 481,  69, 529, 344, 123,
         582, 248, 531, 493,  75, 516, 551]]))

In [241]:
user_movie = ['titanic', 'iron man', 'medicus']
rating = [3, 4, 5]

In [242]:
user_ratings = dict(zip(user_movie, rating))

In [243]:
user_ratings

{'titanic': 3, 'iron man': 4, 'medicus': 5}

In [244]:
user_genres = ['action', 'thriller', 'children', 'crime']

In [245]:
user = {
    'ratings':user_ratings,
    'genres':user_genres
}

In [246]:
user

{'ratings': {'titanic': 3, 'iron man': 4, 'medicus': 5},
 'genres': ['action', 'thriller', 'children', 'crime']}

In [285]:
def lookup_movie(search_query, titles):
    """
    given a search query, uses fuzzy string matching to search for similar 
    strings in a pandas series of movie titles

    returns a list of search results. Each result is a tuple that contains 
    the title, the matching score and the movieId.
    """
    matches = process.extractBests(search_query, titles)
    # [(title, score, movieId), ...]
    return matches

def extract_movie_ids(user, titles):
    for key in user['ratings'].keys():
        movie_id = lookup_movie(key, titles)[0][2]
        print(f"{key}: {movie_id}")
        user['ratings'][movie_id] = user['ratings'].pop(key)
    print('The users dictionary has been changed! You cannot execute this function again on the same user')
    return user

def create_new_user(user):
    vector = np.repeat(0, 193610)
    for key,val in user['ratings'].items():
        vector[key] = val
    return vector

def load_model(path):
    model = pickle.load(open(path, 'rb'))
    return model

def create_neighborhood(user_vector):
    model = load_model('./models/NN_cosine.sav')
    distances, neighbor_ids = model.kneighbors(user_vector, n_neighbors=20)
    return neighbor_ids

def neighbor_recommender(user):
    user_vector = create_new_user(user)
    #print(user_vector[1721])
    neighbor_ids = create_neighborhood([user_vector])
    neighborhood = ratings_long.set_index('userId').loc[neighbor_ids[0]]
    #print(neighborhood)
    recommendations = neighborhood.groupby('movieId')['rating'].sum().sort_values(ascending=False)
    #print(recommendations)
    #print(list(user['ratings'].keys()))
    item_filter = ~recommendations.index.isin(list(user['ratings'].keys()))
    recommendations = recommendations.loc[item_filter]
    return recommendations

In [248]:
extract_movie_ids(user,movies['title'])

titanic: 1721
iron man: 59315
medicus: 2822
The users dictionary has been changed! You cannot execute this function again on the same user


{'ratings': {1721: 3, 59315: 4, 2822: 5},
 'genres': ['action', 'thriller', 'children', 'crime']}

In [250]:
user_vector = create_new_user(user)

In [291]:
reco = neighbor_recommender(user)
reco

movieId
60069    47.0
58559    46.5
2028     46.0
2571     42.0
79132    41.5
         ... 
2953      0.5
185       0.5
6793      0.5
172       0.5
1327      0.5
Name: rating, Length: 947, dtype: float64

In [296]:
list(movies.loc[reco.index]['title'].head(10))

['WALL·E (2008)',
 'Dark Knight, The (2008)',
 'Saving Private Ryan (1998)',
 'Matrix, The (1999)',
 'Inception (2010)',
 'Up (2009)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Fight Club (1999)']

## tried one hot encoding of genres 

In [None]:
list_of_genres = []
for i in movies.iterrows():
    #print(i[1]['genres'].split('|'))
    new_genres = i[1]['genres'].split('|')
    list_of_genres = list_of_genres + new_genres[:]
#list_of_genres    

In [None]:
df_all_genres = pd.DataFrame({
    'genres':list_of_genres
})

In [None]:
genres = df_all_genres['genres'].unique()[:-1]

In [None]:
movie_genres = pd.DataFrame(0, index=movies.index, columns=genres)
print(movie_genres.shape)
movie_genres.head(10)

In [None]:
movie_genres['genres'] = movies['genres']

In [None]:
movie_genres_test = movie_genres.loc[:20,:]
movie_genres_test

In [None]:
for i in movie_genres_test.iterrows():
    print(i[1].index[:-1])
    new_genres = i[1]['genres'].split('|')
    print(new_genres)
    print(i[1].index[:-1].isin(new_genres))
    i[1]['Adventure']=1

In [None]:
for i in movie_genres_test.index:
    #print(movie_genres_test.loc[i,'genres'])
    #print(movie_genres_test.columns[:-1])
    print(movie_genres_test.columns[:-1].isin(movie_genres_test.loc[i,'genres']))
    for j in movie_genres_test.columns[:-1]:
        if j in movie_genres_test.loc[i,'genres']:
            movie_genres_test.loc[i,j]=1

In [None]:
movie_genres_test