In [None]:
import numpy as np
import pandas as pd
import matplotlib
import scipy.sparse as sp
# import flurs

# from flurs.recommender.fm import FMRecommender
# from flurs.evaluator import Evaluator
# from flurs.data.entity import User, Item, Event

import pandas_profiling

%matplotlib inline

In [None]:
# MovieLens 1M movie catalogue: "id::name::genre" records with no header row.
# Forward slashes keep the relative path valid on POSIX as well as on Windows.
# NOTE(review): the ml-1m files are commonly distributed as ISO-8859-1 rather than UTF-8
# (accented titles) - confirm against your copy of the dataset.
movies_df = pd.read_csv("ml-1m/movies.dat", delimiter="::", engine="python", encoding="latin-1",
                        names=["id", "name", "genre"], index_col=0, header=None)

In [None]:
# Ratings: "user_id::movie_id::rating::timestamp" records with no header row.
# Forward slashes keep the relative path valid on POSIX as well as on Windows.
ratings_df = pd.read_csv("ml-1m/ratings.dat", delimiter="::", engine="python",
                         names=["user_id", "movie_id", "rating", "timestamp"], header=None)

In [None]:
# User profiles: "id::gender::age::occupation::zip" records with no header row; indexed by id.
# Forward slashes keep the relative path valid on POSIX as well as on Windows.
users_df = pd.read_csv("ml-1m/users.dat", delimiter="::", engine="python",
                       names=["id", "gender", "age", "occupation", "zip"], index_col=0, header=None)

### EDA

#### Movies

In [None]:
# Preview the first rows of the movie catalogue (indexed by movie id).
movies_df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the movie catalogue.
movies_df.info()

The genre field isn't very convenient to work with as a `|`-separated string, so let's replace it with a set of genre names.

In [None]:
# Replace the "A|B|C" genre string with a frozenset of genre names.
# The isinstance guard keeps this cell re-runnable: after the first run the column already
# holds frozensets, which have no .split().
movies_df["genre"] = movies_df.genre.apply(
    lambda x: x if isinstance(x, frozenset) else frozenset(x.split("|")))

In [None]:
# Number of distinct genre combinations that occur in the catalogue.
movies_df["genre"].nunique(dropna=False)

In [None]:
class set_plus(set):
    """Set wrapper whose ``+`` means union, so a column of sets can be reduced with ``.sum()``.

    ``set_plus(s) + other`` and ``other + set_plus(s)`` both evaluate to ``s | other``; the
    result therefore has the type produced by ``s | other`` (not ``set_plus``).
    """

    def __init__(self, s):
        # Populate the underlying set too. Without this the instance itself is an empty
        # set, so when it appears on the right-hand side of ``a + b`` (evaluated as
        # ``a.__set | b``) its elements were silently dropped from the union.
        super().__init__(s)
        self.__set = s

    def __add__(self, other):
        """Return the union of the wrapped set and ``other``."""
        return self.__set | other

    def __radd__(self, other):
        """Reflected ``+``: union is commutative, so delegate to ``__add__``."""
        return self.__add__(other)

In [None]:
# Union of every movie's genre set. frozenset.union accepts any number of iterables, so the
# column can be reduced directly instead of through a "+"-overloading wrapper.
all_unique_genres = frozenset().union(*movies_df.genre)
# Sets are unordered; sort so the printed list is stable between runs.
print(" ".join(sorted(all_unique_genres)))
print(len(all_unique_genres))

In [None]:
# Distribution of the number of (known) genres attached to each movie.
movies_df["genre"].map(lambda genres: len(all_unique_genres & genres)).hist()

#### Users

In [None]:
# Preview the first rows of the user profiles (indexed by user id).
users_df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the user profiles.
users_df.info()

In [None]:
# Number of distinct occupation codes among the users.
users_df["occupation"].nunique(dropna=False)

In [None]:
# Number of users per age value, as a bar chart.
users_df.groupby("age")["age"].agg(["count"]).plot.bar()

In [None]:
# Number of users per occupation code, as a bar chart.
users_df.groupby("occupation")["occupation"].agg(["count"]).plot.bar()

In [None]:
# Number of users per gender value, as a bar chart.
users_df.groupby("gender")["gender"].agg(["count"]).plot.bar()

#### Ratings

In [None]:
# Preview the first rating records (user_id, movie_id, rating, timestamp).
ratings_df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the ratings table.
ratings_df.info()

In [None]:
# Histogram of the number of ratings submitted per user.
ratings_df.groupby("user_id")["user_id"].agg(["count"]).hist(bins=100)

In [None]:
# Same per-user counts on a natural-log scale.
ratings_df.groupby("user_id")["user_id"].agg(["count"]).apply(np.log).hist(bins=100)

In [None]:
# Histogram of the number of ratings received per movie.
ratings_df.groupby("movie_id")["movie_id"].agg(["count"]).hist(bins=100)

In [None]:
# Same per-movie counts on a natural-log scale.
ratings_df.groupby("movie_id")["movie_id"].agg(["count"]).apply(np.log).hist(bins=100)

In [None]:
# Global mean over every rating record; used below as the baseline for per-movie and
# per-user deviations.
mean_rating = ratings_df["rating"].mean()
mean_rating

In [None]:
# Each movie's mean rating minus the global mean, and the distribution of that offset.
movies_diff_with_mean = ratings_df.groupby("movie_id")["rating"].mean().sub(mean_rating)
movies_diff_with_mean.hist(bins=100)

In [None]:
# Summary statistics of the per-movie offsets from the global mean rating.
movies_diff_with_mean.describe()

In [None]:
# Share of movies whose mean rating lies within 0.5 of the global mean.
# The mean of a boolean mask is the fraction of True values; it is computed vectorised
# instead of iterating the Series with the builtin sum(). float() keeps a plain Python float.
float((movies_diff_with_mean.abs() <= 0.5).mean())

In [None]:
# Each user's mean rating minus the global mean, and the distribution of that offset.
users_diff_with_mean = ratings_df.groupby("user_id")["rating"].mean().sub(mean_rating)
users_diff_with_mean.hist(bins=100)

In [None]:
# Summary statistics of the per-user offsets from the global mean rating.
users_diff_with_mean.describe()

In [None]:
# Share of users whose mean rating lies within 0.5 of the global mean.
# The mean of a boolean mask is the fraction of True values; it is computed vectorised
# instead of iterating the Series with the builtin sum(). float() keeps a plain Python float.
float((users_diff_with_mean.abs() <= 0.5).mean())

In [None]:
# One row per rating, enriched with the rater's profile columns and the movie's name/genre.
df = (
    users_df
    .merge(ratings_df, left_index=True, right_on="user_id")
    .merge(movies_df, left_on="movie_id", right_index=True)
)

In [None]:
def get_interactions(df):
    """Build the user x movie rating matrix from a frame of rating records.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns. The ids are used
        directly as row/column indices.

    Returns
    -------
    scipy.sparse.csr_matrix of float32 with shape ``(max user_id + 1, max movie_id + 1)``;
    entry ``[u, m]`` holds the rating user ``u`` gave movie ``m``.

    The matrix is assembled in one vectorised COO construction rather than by assigning
    cell by cell, which is very slow for a large number of rating rows.
    (An alternative that was tried earlier binarised ratings: >= 4.0 -> 1.0, otherwise -1.0.)
    """
    n_users = int(df.user_id.max()) + 1
    n_movies = int(df.movie_id.max()) + 1

    # COO -> CSR sums duplicate coordinates; keep only the last rating of each
    # (user, movie) pair so the result matches "last assignment wins".
    pairs = df[["user_id", "movie_id", "rating"]].drop_duplicates(["user_id", "movie_id"], keep="last")

    interactions = sp.coo_matrix(
        (pairs["rating"].values.astype(np.float32),
         (pairs["user_id"].values, pairs["movie_id"].values)),
        shape=(n_users, n_movies),
        dtype=np.float32,
    ).tocsr()
    # A rating of 0 must not become a stored entry (cell-wise assignment never stored zeros).
    interactions.eliminate_zeros()
    return interactions

In [None]:
def get_users_meta(df):
    """Build the sparse user-feature matrix: one-hot gender, age, occupation + per-user identity.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``gender``, ``age`` and ``occupation`` columns. A user
        may appear on many rows (e.g. once per rating).

    Returns
    -------
    (features, maps)
        ``features`` is a float32 CSR matrix with ``max user_id + 1`` rows whose columns are,
        in order: gender one-hot, age one-hot, occupation one-hot, identity block.
        ``maps`` is ``{"genders": ..., "ages": ..., "occupations": ...}``, each mapping a raw
        value to its column index *within its own block* (values are ordered by ``sorted``).
    """
    max_user_id_plus_one = df.user_id.max() + 1

    def index_map(column):
        # raw value -> position in the sorted list of distinct values
        ordered = sorted(column.unique())
        return dict(zip(ordered, range(len(ordered))))

    genders_map = index_map(df.gender)
    ages_map = index_map(df.age)
    occupations_map = index_map(df.occupation)

    # All blocks share float32 so the stacked matrix is not upcast to float64.
    genders = sp.lil_matrix((max_user_id_plus_one, len(genders_map)), dtype=np.float32)
    ages = sp.lil_matrix((max_user_id_plus_one, len(ages_map)), dtype=np.float32)
    occupations = sp.lil_matrix((max_user_id_plus_one, len(occupations_map)), dtype=np.float32)
    identity = sp.identity(max_user_id_plus_one, format='csr', dtype=np.float32) # identity matrix makes model more expressive

    # Setting a cell to 1.0 is idempotent, so visiting each distinct
    # (user, gender, age, occupation) combination once yields the same matrix as
    # visiting every row of df, at a fraction of the iterations.
    profiles = df[["user_id", "gender", "age", "occupation"]].drop_duplicates()
    for row in profiles.itertuples(index=False):
        genders[row.user_id, genders_map[row.gender]] = 1.0
        ages[row.user_id, ages_map[row.age]] = 1.0
        occupations[row.user_id, occupations_map[row.occupation]] = 1.0

    # Leaving `identity` out of the stack would give a pure-metadata feature matrix.
    features = sp.hstack([genders, ages, occupations, identity]).tocsr()
    return features, {"genders": genders_map, "ages": ages_map, "occupations": occupations_map}

In [None]:
def get_movies_meta(df):
    """Build the sparse movie-feature matrix: multi-hot genres + per-movie identity.

    Parameters
    ----------
    df : DataFrame with ``movie_id`` and ``genre`` columns, where ``genre`` is an iterable of
        genre names. A movie may appear on many rows (e.g. once per rating).

    Returns
    -------
    (features, maps)
        ``features`` is a float32 CSR matrix with ``max movie_id + 1`` rows whose columns are
        the genre indicators followed by an identity block.
        ``maps`` is ``{"genre": {genre_name: column_index}}`` with genres in sorted order.
    """
    max_movie_id_plus_one = df.movie_id.max() + 1

    # Collapse the (possibly repeated) rows to one genre set per movie first, so the sparse
    # matrix is touched once per movie instead of once per row of df.
    genres_by_movie = {}
    for movie_id, genre in zip(df.movie_id, df.genre):
        genres_by_movie.setdefault(movie_id, set()).update(genre)

    # A plain set union sees every row's genres; it does not depend on any "+"-overloading
    # helper being defined (or behaving correctly) elsewhere in the notebook.
    unique_genres = sorted(set().union(*genres_by_movie.values()))
    genres_map = dict(zip(unique_genres, range(len(unique_genres))))

    genres = sp.lil_matrix((max_movie_id_plus_one, len(unique_genres)), dtype=np.float32)
    identity = sp.identity(max_movie_id_plus_one, format='csr', dtype=np.float32)

    for movie_id, movie_genres in genres_by_movie.items():
        for g in movie_genres:
            genres[movie_id, genres_map[g]] = 1.0

    # Returning only `genres.tocsr()` would give a pure-metadata feature matrix.
    return sp.hstack([genres, identity]).tocsr(), {"genre": genres_map}

In [None]:
# Build the three sparse inputs from the merged ratings frame: the user x movie rating
# matrix, and the user / movie feature matrices together with their value -> column maps.
interactions = get_interactions(df)
users_meta, user_meta_maps = get_users_meta(df)
movies_meta, movies_meta_maps = get_movies_meta(df)

In [None]:
from lightfm import LightFM, cross_validation, evaluation

In [None]:
def prepare_model(interactions, users_meta, movies_meta, model=None):
    """Train a LightFM model from scratch, or continue training an existing one.

    Parameters
    ----------
    interactions : sparse user x movie matrix of ratings.
    users_meta, movies_meta : sparse user / movie feature matrices.
    model : optional already-fitted model. When given, it is updated with ``fit_partial``
        on the full ``interactions`` and no evaluation is run.
        NOTE(review): ``fit_partial`` presumably updates and returns the same object, so the
        returned model and ``model`` would be aliases - confirm before comparing the two.

    Returns
    -------
    The fitted model. When training from scratch, AUC, precision@10 and recall@10 on an
    80/20 random split are printed as a side effect.
    """
    if model is not None:
        return model.fit_partial(interactions=interactions, user_features=users_meta, item_features=movies_meta)

    np.random.seed(0)
    train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.2, random_state=np.random)
    # Seeding NumPy's global generator does not seed the model: give LightFM its own
    # random_state so embedding initialisation and sampling are reproducible as well.
    lfm = LightFM(loss="warp", no_components=100, max_sampled=100, random_state=0)
    new_model = lfm.fit(interactions=train, user_features=users_meta, item_features=movies_meta, epochs=20)

    def train_test_means(metric, **metric_kwargs):
        """Mean of `metric` over users, on the train split and on the test split."""
        train_scores = metric(new_model, train,
                              user_features=users_meta, item_features=movies_meta, **metric_kwargs)
        # Known train positives are excluded from the test ranking; otherwise items the
        # user already rated in `train` compete with (and push down) the held-out ones.
        test_scores = metric(new_model, test, train_interactions=train,
                             user_features=users_meta, item_features=movies_meta, **metric_kwargs)
        return train_scores.mean(), test_scores.mean()

    train_auc, test_auc = train_test_means(evaluation.auc_score)
    train_precision_at_10, test_precision_at_10 = train_test_means(evaluation.precision_at_k, k=10)
    train_recall_at_10, test_recall_at_10 = train_test_means(evaluation.recall_at_k, k=10)

    print("AUC\n\ttrain: {}\n\ttest: {}".format(train_auc, test_auc))
    print("Precision (percentage of relevant items in prediction) at 10\n\ttrain: {}\n\ttest: {}"
          .format(train_precision_at_10, test_precision_at_10))
    print("Recall (number of  relevant items in prediction / total number of relevant items) at 10\n\ttrain: {}\n\ttest: {}"
          .format(train_recall_at_10, test_recall_at_10))

    return new_model

In [None]:
# Train the baseline model on the full interaction matrix; prepare_model also prints the
# train/test metrics of its internal split.
model = prepare_model(interactions, users_meta, movies_meta)
# --- earlier inline version of the split/fit, kept for reference only (not executed) ---
# lfm = LightFM(loss="warp", no_components=10, max_sampled=10)#, k=5, n=10, learning_schedule=’adagrad’, loss=’logistic’, learning_rate=0.05, rho=0.95, epsilon=1e-06, item_alpha=0.0, user_alpha=0.0, max_sampled=10, random_state=None)

# np.random.seed(0)
# train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.2, random_state=np.random)

# model = lfm.fit(interactions=train, user_features=users_meta, item_features=movies_meta, epochs=20)

In [None]:
def get_top_10(df, min_ratings):
    """Return the names of the 10 best-rated movies that have at least `min_ratings` ratings.

    `df` needs "name" and "rating" columns; movies are ranked by mean rating, best first.
    """
    stats = df.groupby("name").agg({"rating": ["mean", "count"]})
    enough_votes = stats[("rating", "count")] >= min_ratings
    ranked = stats[enough_votes].sort_values(("rating", "mean"), ascending=False)
    return list(ranked.index[:10])

In [None]:
def add_new_user(gender, age, occupation, users_meta, user_meta_maps, interactions, for_full_refit=False):
    """Append a new user (with no ratings yet) to the feature and interaction matrices.

    Parameters
    ----------
    gender, age, occupation : raw values; each must be a key of the matching map in
        `user_meta_maps` (a KeyError is raised otherwise).
    users_meta : sparse user-feature matrix laid out as gender | age | occupation | identity.
    user_meta_maps : dict with "genders", "ages" and "occupations" value -> column maps.
    interactions : sparse user x movie matrix.
    for_full_refit : when True, an extra identity column is added for the new user. This
        changes the number of user features, so it is only suitable for training a model
        from scratch, not for fit_partial on an existing one.

    Returns
    -------
    (new_user_id, new_users_meta, new_interactions) - the new user's row index and CSR copies
    of both matrices with one row appended. The inputs are not modified.
    """
    genders_map = user_meta_maps["genders"]
    ages_map = user_meta_maps["ages"]
    occupations_map = user_meta_maps["occupations"]

    # Column offsets of the age and occupation blocks inside the stacked feature matrix.
    ages_offset = len(genders_map)
    occupations_offset = ages_offset + len(ages_map)

    new_user = sp.lil_matrix((1, users_meta.shape[1]), dtype=np.float32)
    new_user[0, genders_map[gender]] = 1.0
    new_user[0, ages_offset + ages_map[age]] = 1.0
    new_user[0, occupations_offset + occupations_map[occupation]] = 1.0

    # Ask for CSR explicitly: stacking mixed formats otherwise returns COO, while the
    # inputs (and every consumer that indexes the result) work with CSR.
    new_users_meta = sp.vstack([users_meta, new_user], format="csr")
    # The empty row takes the dtype of `interactions` so stacking does not upcast it.
    empty_row = sp.csr_matrix((1, interactions.shape[1]), dtype=interactions.dtype)
    new_interactions = sp.vstack([interactions, empty_row], format="csr")

    # identity feature improves model's prediction ability, but makes fit_partial impossible
    if for_full_refit:
        new_user_identity = sp.lil_matrix((new_users_meta.shape[0], 1), dtype=np.float32)
        new_user_identity[-1, 0] = 1.0
        new_users_meta = sp.hstack([new_users_meta, new_user_identity], format="csr")

    return (new_users_meta.shape[0] - 1, new_users_meta, new_interactions)

In [None]:
def add_ratings_for_user_manual(user_id, interactions, movies_df):
    """Interactively collect ratings for `user_id` on randomly chosen movies.

    A random movie from `movies_df` (indexed by movie id, with a "name" column) is shown on
    each round. Typing 1-5 stores that rating, 0 skips the movie, and anything else
    (out-of-range number or non-numeric text) ends the session.

    Returns a CSR copy of `interactions` with the collected ratings written into row
    `user_id`; `interactions` itself is not modified.
    """
    print("1-5 to rate a movie, 0 to skip, anything else to stop")
    new_interactions = interactions.tolil()
    while True:
        movie_id = np.random.choice(movies_df.index)
        # Look the title up in the catalogue passed in, which is keyed by movie id
        # (not in the notebook-level merged ratings frame, whose index is unrelated).
        title = movies_df.loc[movie_id, "name"]
        try:
            rating = float(input(title + ": "))
        except ValueError:
            # Non-numeric input is one of the "anything else" ways to stop.
            break
        if rating == 0.0:
            continue
        # Negated range test so that NaN also ends the session.
        if not 1.0 <= rating <= 5.0:
            break

        new_interactions[user_id, movie_id] = rating
    return new_interactions.tocsr()

In [None]:
def add_ratings_for_user(n, user_id, interactions, movies_df):
    """Give `user_id` `n` random ratings (1-5) on randomly chosen movies.

    Movies are drawn with replacement from `movies_df.index`, so a movie can be drawn more
    than once (the last rating wins) and fewer than `n` distinct movies may end up rated.

    Returns a CSR copy of `interactions`; the input matrix is not modified.
    """
    new_interactions = interactions.tolil()
    for _ in range(n):
        movie_id = np.random.choice(movies_df.index)
        # randint's upper bound is exclusive: 6 is needed for 5-star ratings to occur.
        rating = np.random.randint(1, 6)
        new_interactions[user_id, movie_id] = rating

    return new_interactions.tocsr()

In [None]:
# Register a new user (gender "M", age value 25, occupation code 12) who has no ratings yet,
# then update the already-trained model via fit_partial on the extended matrices.
new_user_id, new_users_meta, new_interactions_empty = add_new_user("M", 25, 12, users_meta, user_meta_maps, interactions)
new_model = prepare_model(new_interactions_empty, new_users_meta, movies_meta, model)

In [None]:
# Score every column of the interaction matrix for the new user (column index == movie id).
predictions_without_ratigs = new_model.predict(new_user_id,
                                               np.arange(new_interactions_empty.shape[1], dtype=np.int32),
                                               user_features=new_users_meta,
                                               item_features=movies_meta)

print ("Top-10 recommended movies without any ratings from the new user\n")
# argsort is ascending, so walk it backwards for best-first order. The matrix has one column
# per integer up to the largest movie id, so columns without a catalogue entry are skipped
# rather than looked up (which would raise KeyError). Stop after 10 titles, as the heading says.
ranked_movie_ids = [movie_id for movie_id in predictions_without_ratigs.argsort()[::-1]
                    if movie_id in movies_df.index]
for ind, movie_id in enumerate(ranked_movie_ids[:10]):
    print(ind+1, movies_df.loc[movie_id]["name"])

In [None]:
# Start from the matrix that already contains the new user's (still empty) row.
# `dummy_interactions` is not defined anywhere in this notebook, so the cell failed on a
# fresh kernel.
new_interactions = add_ratings_for_user_manual(new_user_id, new_interactions_empty, movies_df)

In [None]:
# Update the model with the new user's ratings, then score every column of the interaction
# matrix for that user (column index == movie id).
new_model_with_ratings = prepare_model(new_interactions, new_users_meta, movies_meta, model)
# `new_new` is not defined anywhere in this notebook; the item range comes from the matrix
# the model was just updated with.
predictions_with_ratings = new_model_with_ratings.predict(new_user_id,
                                                          np.arange(new_interactions.shape[1], dtype=np.int32),
                                                          user_features=new_users_meta,
                                                          item_features=movies_meta)

print ("Top-10 recommended movies with ratings from the new user\n")
# argsort is ascending, so walk it backwards for best-first order. Columns without a
# catalogue entry are skipped rather than looked up (which would raise KeyError).
# Stop after 10 titles, as the heading says.
ranked_movie_ids_with_ratings = [movie_id for movie_id in predictions_with_ratings.argsort()[::-1]
                                 if movie_id in movies_df.index]
for ind, movie_id in enumerate(ranked_movie_ids_with_ratings[:10]):
    print(ind+1, movies_df.loc[movie_id]["name"])