In [70]:
import pandas as pd
import numpy as np
import json
import random
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import pickle

In [71]:
total_recom = 20
def extract_from_dict(x, key):
    if key not in x.keys():
        return None
    return x.get(key)

def imdb_rule_rating(x, C, M):
  v = x['vote_count']
  r = x['vote_average']
  return ( (v / (v + M)) * r +  (M / (v + M)) * C )

def preprocess(movie_data_df, user_rating_df):
    urd = user_rating_df.groupby(by=['user_id', 'title']).mean().reset_index()
    urd.set_index('user_id', inplace=True)

    movie_data_df['genres']   = movie_data_df['info'].apply(lambda x: extract_from_dict(x, 'genres'))
    movie_data_df['overview'] = movie_data_df['info'].apply(lambda x: extract_from_dict(x, 'overview'))
    movie_data_df['production_companies'] = movie_data_df['info'].apply(lambda x: extract_from_dict(x, 'production_companies'))
    movie_data_df['vote_average'] = movie_data_df['info'].apply(lambda x: extract_from_dict(x, 'vote_average')).astype('float')
    movie_data_df['vote_count']   = movie_data_df['info'].apply(lambda x: extract_from_dict(x, 'vote_count')).astype('float')
    movie_data_df['genres'] =movie_data_df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
    movie_data_df['production_companies'] =movie_data_df['production_companies'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
    mdd = movie_data_df.drop('info', 1)

    return mdd, urd


def train():
    user_rating_df = pd.read_csv("rating.csv", names=["user_id", "title", "rating"])

    movie_data_df = pd.read_csv("movie_meta_un.csv", names=["title", "info"])
    movie_data_df['info'] = movie_data_df['info'].apply(lambda x: json.loads(x))

    mdd, urd = preprocess(movie_data_df, user_rating_df)


    C = mdd['vote_average'].mean()
    M = mdd['vote_count'].quantile(0.75)

    best_movie_df = mdd[(mdd['vote_count'] >= M) & (mdd['vote_count'].notnull()) & (mdd['vote_average'].notnull())]
    best_movie_df.loc[:,'rating'] = best_movie_df.apply(lambda x: imdb_rule_rating(x, C, M), axis = 1)
    best_movie_df['genres']               = best_movie_df['genres'].apply(lambda x: ' '.join(x))
    best_movie_df['production_companies'] = best_movie_df['production_companies'].apply(lambda x: ' '.join(x))
    best_movie_df['description'] = best_movie_df['overview'] + best_movie_df['genres'] + best_movie_df['genres'] + best_movie_df['production_companies'] + best_movie_df['production_companies']

    porter_stemmer = PorterStemmer()
    best_movie_df.loc[:,'description'] = best_movie_df['description'].apply(lambda x: porter_stemmer.stem(x.lower()))

    tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(best_movie_df['description'])

    cosine_sim   = linear_kernel(tfidf_matrix, tfidf_matrix)

    #Dumping the files
#     urd.to_pickle('urd.p')
#     best_movie_df.to_pickle('bmd.p')
#     cosine_sim.dump('cs.p')
    return urd, best_movie_df, cosine_sim

def get_top_user_movies(user_id, urd, titles, best_movie_df):
    try:
        top_movie_list = urd.loc[user_id].sort_values('rating', ascending=False)['title'].tolist()
    except Exception:
        return []
    movie_list = titles.to_list()
    indices = [x for x in top_movie_list if x in movie_list]

    top_movies = best_movie_df.loc[best_movie_df['title'].isin(indices)]['title'].to_list()
    if(len(top_movies) > total_recom):
        return top_movies[:total_recom]
    return top_movies


def get_recommendations(title, recom_number, indices, titles, cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:61]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices].to_list()[0:recom_number]


def predict():
    best_movie_df = best_movie_df.reset_index()
    titles  = best_movie_df['title']
    indices = pd.Series(best_movie_df.index, index=best_movie_df['title'])

    suggestion = {}
    count = 0

    for user_id in urd.index.unique():
        recommended_movies = []
        top_movies       = self.get_top_user_movies(user_id, urd, titles, best_movie_df)
        #if the user has no record in the system, or all the rated movies are removed, return chef's special
        if len(top_movies) == 0:
            # print('chefs special for {}'.format(user_id))
            continue
            # return give_chefs_sepcial(genre_list)
        temp = []
        for idx, movie in enumerate(top_movies):
            temp.append(self.get_recommendations(movie, total_recom, indices, titles, cosine_sim))
        for i in range(total_recom):
            for j in range(len(temp)):
                if temp[j][i] not in recommended_movies:
                    recommended_movies.append(temp[j][i])
            if len(recommended_movies) == total_recom:
                break

        suggestion[user_id] = recommended_movies[:total_recom]
        count += 1

    pickle.dump(suggestion, open( "recommandations.p", "wb" ))
    print('wrote recom for {} users'.format(count))

In [72]:
%%time
urd, best_movie_df, cosine_sim = train()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_movie_df['genres']               = best_movie_df['genres'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a sli

CPU times: user 5.72 s, sys: 228 ms, total: 5.95 s
Wall time: 5.94 s


In [79]:
test_df = urd
print(test_df.head())
test_df = test_df.filter(['user_id', 'title', 'rating']).query("rating >= 5")
test_df = test_df.groupby('user_id').agg({'title':lambda x: set(x)}).reset_index()
test_df.head()

                           title  rating
user_id                                 
6               dragonheart+1996     4.0
6        executive+decision+1996     4.0
6          independence+day+1996     3.0
6          mighty+aphrodite+1995     5.0
6         mr.+hollands+opus+1995     5.0


Unnamed: 0,user_id,title
0,6,"{sabrina+1995, sense+and+sensibility+1995, the..."
1,9,{the+godfather+1972}
2,23,"{clerks+1994, swingers+1996, la+femme+nikita+1..."
3,39,"{heat+1995, toy+story+1995, star+wars+1977, de..."
4,51,"{the+usual+suspects+1995, pulp+fiction+1994, r..."


In [80]:
def f(x):
    """
    lambda function
    :param x:
    """
    movieIds = x['title']
    if len(movieIds) >= 10:
        return 'no'
    else:
        return 'yes'

test_df['drop'] = test_df.apply(lambda x: f(x), axis = 1)

test_df = test_df.filter(['user_id', 'title', 'drop']).query("drop == 'no'")
test_df.head()
print(len(test_df))

2997


In [81]:
best_movie_df = best_movie_df.reset_index()
titles = best_movie_df['title']
indices = pd.Series(best_movie_df.index, index=best_movie_df['title'])

In [82]:
test_df.head()

Unnamed: 0,user_id,title,drop
9,81,"{gone+in+sixty+seconds+2000, the+great+escape+...",no
130,741,"{enemy+at+the+gates+2001, femme+fatale+2002, d...",no
218,1390,"{an+american+werewolf+in+london+1981, enemy+at...",no
227,1479,{the+lord+of+the+rings+the+fellowship+of+the+r...,no
274,1809,"{speed+1994, forrest+gump+1994, mission+imposs...",no


In [143]:

def get_recommendations(title, recom_number):
    try:
        idx = indices[title]
    except Exception:
        return None
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:61]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices].to_list()[0:recom_number]

def get_recommendations_with_user_id(x):
    recommended_movies = []

    top_movies       = test_df[test_df['user_id'] == x['user_id']]['title'].apply(lambda x: list(x)).to_list()[0]
    #if the user has no record in the system, or all the rated movies are removed, return chef's special
    if len(top_movies) == 0:
        print('chefs special for you')
        return []
    temp = []
    for idx, movie in enumerate(top_movies):
        a = get_recommendations(movie, total_recom)
        if a is None:
            continue
        temp.append(a)
    for i in range(total_recom):
        for j in range(len(temp)):
            if temp[j][i] not in recommended_movies:
                recommended_movies.append(temp[j][i])
        if len(recommended_movies) == total_recom:
            break

    rec =  '.'.join(recommended_movies[:total_recom])
    for m in top_movies:
        if m in rec:
            return "yes"
        
    return "no"

In [144]:
test_df['yes_no'] = test_df.apply(lambda x: get_recommendations_with_user_id(x), axis = 1)
test_df.head()

Unnamed: 0,user_id,title,drop,yes_no
9,81,"{gone+in+sixty+seconds+2000, the+great+escape+...",no,no
130,741,"{enemy+at+the+gates+2001, femme+fatale+2002, d...",no,no
218,1390,"{an+american+werewolf+in+london+1981, enemy+at...",no,no
227,1479,{the+lord+of+the+rings+the+fellowship+of+the+r...,no,no
274,1809,"{speed+1994, forrest+gump+1994, mission+imposs...",no,yes


In [145]:
yes_df = test_df.filter(['userId', 'movieId', 'rating', 'yes_no']).query("yes_no == 'yes'")
no_df = test_df.filter(['userId', 'movieId', 'rating', 'yes_no']).query("yes_no == 'no'")

acc = len(yes_df) / (len(yes_df) + len(no_df))

print("Accuracy = " + str(acc))

Accuracy = 0.23790457123790457
