<h1 style='background-color:#ffff00;color:ivory;'><font size=100 color='black'>Movie Recommender System Using Content Based Filtering </h1>
<!-- <img src='https://miro.medium.com/max/1000/1*BME1JjIlBEAI9BV5pOO5Mg.png' style='height:500px'> -->

In [1]:
import numpy as np
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import cufflinks as cf
import plotly.figure_factory as ff
init_notebook_mode(connected=True)
cf.go_offline(connected=True)
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')
from scipy.sparse import csr_matrix
from math import log10
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse

Download the MovieLens latest-small dataset (1 MB) [here](http://files.grouplens.org/datasets/movielens/ml-latest-small.zip)

In [2]:
df_movies=pd.read_csv('movies.csv')
df_ratings=pd.read_csv('ratings.csv')

In [3]:
# Number of distinct movies in the catalogue and distinct users in the ratings file.
tot_mov=df_movies['movieId'].nunique()
tot_user=df_ratings['userId'].nunique()

In [4]:
# Show the two totals as a one-row table: the transpose turns the 2x1 array into a single
# row labelled 'Total' with the columns 'Movies' and 'Users'.
ff.create_table(pd.DataFrame(np.array([[tot_mov],[tot_user]]).T,index=['Total'],columns=['Movies','Users']),
                index=True,colorscale='YlOrRd').iplot()

<h2>DataSet Validation

In [5]:
movie_data=pd.merge(df_ratings,df_movies,on='movieId')

    Merge Ratings and Movies data 

In [6]:
print(df_movies.shape)
print(df_ratings.shape)
print(movie_data.shape)

(9742, 3)
(100836, 4)
(100836, 6)


In [7]:
movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
ratings_mean=movie_data.groupby(['title']).agg({'rating':'mean','movieId':'count'})

In [9]:
ratings_mean.columns=['Avg_rat','Tot_rat']

In [10]:
ratings_mean.head()

Unnamed: 0_level_0,Avg_rat,Tot_rat
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


    The table above shows, for each movie, its average rating (Avg_rat) and the number of users who rated it (Tot_rat).

In [11]:
# Rating count per title, most-rated first, so the bars trace out the long tail.
dfim=ratings_mean['Tot_rat'].sort_values(ascending=False)
# The random values only assign each bar a colour from the 'Portland' scale; one value per
# plotted bar.
trace=go.Bar(x=dfim.index,y=dfim,
             marker=dict(color=np.random.randn(dfim.size),colorscale='Portland'))
iplot(go.Figure(data=[trace],layout=go.Layout(title='Movie vs Total Ratings',yaxis=dict(title='Total Ratings'))))

    Long-Tail Phenomenon:
        Vertical axis    = Popularity
        Horizontal axis  = Movies ordered by popularity
        The plot should then show a long tail from left to right.
    The long-tail phenomenon encourages recommender systems to recommend movies to individual users who
    haven't watched them yet.

In [12]:
ratings_mean['Avg_rat'].iplot(kind='histogram',color='yellow',bins=50,xTitle='Avg_Ratings',yTitle='Tot_Movie_Count')

    Average ratings are mostly around 3-4.
    The distribution is also concentrated at ratings >= 3.
    So it is reasonable to recommend movies rated >= 3 to users who have not seen them yet.

In [13]:
# 2-D density of average rating (x) against number of ratings (y) per title, with the raw
# points overlaid as black markers.
fig=ff.create_2d_density(x=ratings_mean['Avg_rat'],
                         y=ratings_mean['Tot_rat'],
                         colorscale='Cividis',hist_color='rgb(0,0,0)')
fig.add_trace(go.Scatter(x=ratings_mean['Avg_rat'],
                         y=ratings_mean['Tot_rat'],
                         mode='markers',
                         marker=dict(color='black')))
# One layout update sets the figure size and both axis titles.
fig['layout'].update(height=1000,width=1000,
                     xaxis=dict(title='Avg_rat'),
                     yaxis=dict(title='Tot_rat'))
iplot(fig)

    A popular movie should have both many viewers and high ratings.
    The plot above shows that the most-viewed movies did receive higher ratings.

<h3 style='color:rgb(23,24,123)'>From the three plots and the analysis above, this dataset is suitable for building a recommender system</h3>

<h2 style='background-color:#abccdd;color:ivory'><font size=50>Build Item-Profile (Movie Genre TF-IDF)</font></h2>

In [14]:
# Split the pipe-delimited genre string into a list of genres. Values that are already
# lists are left untouched, so re-running this cell does not raise AttributeError.
df_movies['genres']=df_movies['genres'].apply(lambda x:x.split('|') if isinstance(x,str) else x)

In [15]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [16]:
genres=list(set([j for i in df_movies.genres for j in i]))

In [17]:
genres.sort()

In [18]:
genres

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

    There are 19 genres in total; movies that list no genre are represented as (no genres listed).

In [19]:
# Document frequency: for each genre, the number of movies tagged with it.
doc_freq=defaultdict(int)
for tags in df_movies.genres:
    for genre in tags:
        doc_freq[genre]+=1

In [20]:
iplot(go.Figure(data=[go.Bar(x=list(doc_freq.keys()),y=list(doc_freq.values()),
                             marker=dict(color=np.random.randn(len(doc_freq)),
                             colorscale=[[0,'rgb(32,220,130)'],[0.5,'rgb(234,232,12)'],[1,'rgb(32,153,54)']]),
                             text=list(doc_freq.values()),textposition='outside')
                     ]))

    Most movies are Drama,Comedy,Thriller.

<h6 style='background-color:black;color:ivory'><font size=5> Calculate TF-IDF for genres of Movies

In [21]:
# TF-IDF per movie over the sorted genre list. Term frequency is binary (the movie either
# carries the genre or not); IDF is log10(D / number of movies tagged with the genre).
D=df_movies.shape[0]
# IDF depends only on the genre, so compute it once rather than for every movie.
genre_idf=[log10(D/doc_freq[genre]) for genre in genres]
genre_tf_idf=[]
for tags in df_movies.genres:
    sp_mat=[idf if genre in tags else 0.0 for genre,idf in zip(genres,genre_idf)]
    sp_mat=csr_matrix(sp_mat,dtype='float64',shape=(1,len(genres)))
    genre_tf_idf.append(sp_mat)

In [22]:
df_movies['genres_tf_idf']=genre_tf_idf

In [23]:
df_movies['genres_tf_idf'].head()

0      (0, 2)\t0.8872447746804204\n  (0, 3)\t1.2026...
1      (0, 2)\t0.8872447746804204\n  (0, 4)\t1.1664...
2      (0, 5)\t0.41392254164167785\n  (0, 15)\t0.78...
3      (0, 5)\t0.41392254164167785\n  (0, 8)\t0.349...
4                          (0, 5)\t0.41392254164167785
Name: genres_tf_idf, dtype: object

In [24]:
giv_rat=df_ratings[df_ratings['rating']>=3].sort_values(['userId']).groupby('userId')['userId'].count()

In [25]:
fig=ff.create_distplot([giv_rat],['Users Rated-Movie Count Distribution'], bin_size=30, 
                       curve_type='normal',colors=['rgb(0,52,163)'],show_rug=True)
fig['layout'].update(xaxis=dict(title='Rated Movies',tickvals=[i for i in range(0,2400,200)]))
fig['layout'].update(yaxis=dict(title='PDF'))
iplot(fig)

    Most Users watched less than 350 films 

<h2 style='background-color:#abdddd;color:ivory'><font size=50>Build User-Profile (User Genre Weights)</font></h2>

In [26]:
dff=df_ratings.copy()

In [27]:
dff.set_index(['userId','movieId'],inplace=True)

In [28]:
rat_gt_3=dff[dff['rating']>=3]

In [29]:
rat_gt_3.index.get_level_values(0).nunique()

609

In [30]:
set(dff.index.get_level_values(0).unique()) - set(rat_gt_3.index.get_level_values(0).unique())

{442}

    Among 610 users, user 442 hasn't rated any movie >=3

<h6 style='background-color:black;color:ivory'><font size=5> Calculate the weighted average of genres for each user over movies rated >= 3.</font></h6>

In [31]:
# Per-user genre weights: for every movie the user rated >= 3, add the rating to each of the
# movie's genres, then divide by the number of such movies.
# Genres are looked up by movieId so that each rating is paired with its own movie; pairing
# by position is only correct when ratings and movies happen to share the same ordering.
movie_genres=dict(zip(df_movies['movieId'],df_movies['genres']))
user_imp=dict()
for userid,user_ratings in rat_gt_3.groupby(level=0,sort=False):
    genre_count=defaultdict(float)
    # Separate counter name: reusing `tot_mov` would overwrite the movie total computed earlier.
    n_rated=0
    for (_,movieid),rat in user_ratings['rating'].items():
        tags=movie_genres.get(movieid)
        if tags is None:
            # Rated movie is missing from df_movies: nothing to attribute the rating to.
            continue
        n_rated+=1
        for tag in tags:
            genre_count[tag]+=rat
    if n_rated==0:
        continue
    genre_imp=[genre_count[tag]/n_rated for tag in genres]
    genre_imp=csr_matrix(genre_imp,dtype='float64',shape=(1,len(genres)))
    user_imp[userid]=genre_imp

In [32]:
df_user_imp=pd.DataFrame(data=user_imp.items(),columns=['userId','genres_imp'])

In [33]:
fig=ff.create_table(df_user_imp.sample(5),height_constant=40,colorscale='YlGnBu')
fig['layout'].update(width=1500)
iplot(fig)

In [34]:
fig=ff.create_table(df_movies.sample(5),height_constant=40,colorscale='YlGnBu')
fig['layout'].update(width=2000)
iplot(fig)

<h2 style='background-color:#abefdd;color:ivory'><font size=7>Recommend Movies For User 414

In [35]:
dff.loc[414].index.values.size

2698

    User 414 rated 2698 movies in total.

In [36]:
user_414=df_user_imp[df_user_imp['userId']==414]

In [37]:
user_414

Unnamed: 0,userId,genres_imp
413,414,"(0, 0)\t0.00401511572980633\n (0, 1)\t0.876..."


<h6 style='background-color:black;color:ivory'><font size=5>Multiply User_Genre_Importance with Movie_Genre_importance

In [38]:
user_414_rated_movies=df_movies[df_movies['movieId'].isin(dff.loc[414].index.values)]

In [39]:
# Attach each rating by movieId rather than by position, and build a new frame instead of
# writing into a filtered slice of df_movies (which triggers SettingWithCopyWarning).
user_414_rated_movies=user_414_rated_movies.assign(
    rating=user_414_rated_movies['movieId'].map(dff.loc[414]['rating'].to_dict()))

In [40]:
user_414_rated_movies.reset_index(inplace=True,drop=True)

In [41]:
user_414_genre=user_414.iloc[0]['genres_imp'].toarray()

In [42]:
# Weight each rated movie's genre TF-IDF row element-wise by user 414's genre weights and
# keep the result as a 1 x len(genres) sparse row.
user_414_genre=user_414['genres_imp'].iloc[0].toarray()
cal_genre=[csr_matrix(movie_tf_idf.toarray()*user_414_genre,shape=(1,len(genres)))
           for movie_tf_idf in user_414_rated_movies['genres_tf_idf'].values]

    Here user 414's genre importance is multiplied into the genre vector of every movie the user has watched.
    These weighted vectors are the features used to build a model of user 414's genre preferences.

In [43]:
user_414_rated_movies['cal_genres_score']=cal_genre

In [44]:
user_414_rated_movies['rating'].unique()

array([4. , 3. , 2. , 5. , 1. , 3.5, 2.5, 4.5, 1.5, 0.5])

    User 414 used every value on the rating scale (0.5-5)

In [45]:
X=np.array([np.squeeze(i.toarray()) for i in user_414_rated_movies['cal_genres_score']])
Y=np.squeeze(user_414_rated_movies['rating'].values)

In [46]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

In [47]:
rfr=RandomForestRegressor(n_estimators=100,max_features='sqrt',max_depth=9,
                          random_state=0,bootstrap=True)

In [48]:
rfr.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [49]:
rf_pred=rfr.predict(X_test)

In [50]:
mse(Y_test,rf_pred)

0.7388036709678802

In [51]:
not_seen=list(set(df_movies['movieId'].values) -  set(dff.loc[414].index.values))

In [52]:
len(not_seen)

7044

    User 414 hasn't watched 7044 movies.
    Recommend from these 7044 movies by predicting, for each one, the rating user 414 would
    give after watching it.

In [53]:
# Unseen movies with the columns needed for scoring. Copy the slice so that the 'Rating'
# column added later is written to an independent frame (avoids SettingWithCopyWarning).
to_be=df_movies[df_movies.movieId.isin(not_seen)][['title','genres','genres_tf_idf']].copy()

In [54]:
cal_genre_score=[]
for i in to_be['genres_tf_idf']:
    cal_genre_score.append(i.toarray() * user_414_genre )

    Here user 414's genre importance is multiplied into the genre vector of every movie the user has not watched.

In [55]:
X_to_be=np.squeeze(np.array(cal_genre_score))

In [56]:
X_to_be.shape

(7044, 20)

In [57]:
pred_to_be=rfr.predict(X_to_be)

In [58]:
to_be['Rating']=pred_to_be

In [59]:
user_414_like_genres=[]
for i,j in zip(genres,list(np.squeeze(user_414['genres_imp'].iloc[0].toarray()))):
    user_414_like_genres.append([i,j])
sorted(user_414_like_genres,key=lambda x:x[1],reverse=True)

[['Drama', 2.014170996693434],
 ['Comedy', 1.3856872933396316],
 ['Action', 0.8769485120453472],
 ['Thriller', 0.8230987246102975],
 ['Romance', 0.7054794520547946],
 ['Adventure', 0.6471421823334907],
 ['Crime', 0.6126594237128011],
 ['Sci-Fi', 0.45606991025035426],
 ['Fantasy', 0.31577704298535664],
 ['Children', 0.25082664147378364],
 ['Mystery', 0.23311289560699103],
 ['War', 0.2135096835144072],
 ['Animation', 0.20736891828058573],
 ['Musical', 0.15824279641001418],
 ['Horror', 0.15470004723665565],
 ['Documentary', 0.10344827586206896],
 ['IMAX', 0.09352857817666509],
 ['Western', 0.079357581483231],
 ['Film-Noir', 0.02456306093528578],
 ['(no genres listed)', 0.00401511572980633]]

    User 414 likes mostly 
        * Drama
        * Comedy
        * Action
        * Thriller
        * Romance
        * Adventure
        * Crime

<h6 style='background-color:black;color:ivory'><font size=5>Top 20 Recommendations 

In [60]:
to_be.sort_values(by=['Rating'],ascending=False).iloc[0:20][['title','genres','Rating']]

Unnamed: 0,title,genres,Rating
5592,"Pure Formality, A (Pura formalità, Una) (1994)","[Crime, Film-Noir, Mystery, Thriller]",4.479307
3019,House of Games (1987),"[Crime, Film-Noir, Mystery, Thriller]",4.479307
137,Devil in a Blue Dress (1995),"[Crime, Film-Noir, Mystery, Thriller]",4.479307
7492,Little Big Soldier (Da bing xiao jiang) (2010),"[Action, Adventure, Comedy, Drama, War]",4.402939
5476,"White Sun of the Desert, The (Beloe solntse pu...","[Action, Adventure, Comedy, Drama, Romance, War]",4.382953
8988,Afro Samurai (2007),"[Action, Adventure, Animation, Drama, Fantasy]",4.333481
3680,Escaflowne: The Movie (Escaflowne) (2000),"[Action, Adventure, Animation, Drama, Fantasy]",4.333481
6624,I Served the King of England (Obsluhoval jsem ...,"[Comedy, Drama, Romance, War]",4.305049
1730,Life Is Beautiful (La Vita è bella) (1997),"[Comedy, Drama, Romance, War]",4.305049
6296,"Tiger and the Snow, The (La tigre e la neve) (...","[Comedy, Drama, Romance, War]",4.305049


<h1 style='background-color:#abcddd;color:ivory'><font size=100>Final Model

In [62]:
import numpy as np
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import cufflinks as cf
import plotly.figure_factory as ff
init_notebook_mode(connected=True)
cf.go_offline(connected=True)
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')
from scipy.sparse import csr_matrix
from math import log10
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse


def user_profile(df_movies,genres):
    """Attach a genre TF-IDF vector to every movie.

    Note: despite the function name, the vectors built here describe movies (items),
    not users.

    Parameters
    ----------
    df_movies : DataFrame with a 'genres' column holding an iterable of genre names per row.
    genres : ordered list of genre names; it fixes the column order of every vector.

    Returns
    -------
    The same `df_movies` object, modified in place, with a new 'genres_tf_idf' column.
    Each entry is a 1 x len(genres) csr_matrix whose value for a genre is
    log10(D / doc_freq) when the movie carries that genre and 0 otherwise, where D is
    the number of movies and doc_freq the number of movies tagged with the genre.
    """
    doc_freq=defaultdict(int)
    for tags in df_movies.genres:
        for genre in tags:
            doc_freq[genre]+=1
    D=df_movies.shape[0]
    # IDF depends only on the genre, so compute it once instead of once per movie.
    # A genre that no movie carries gets weight 0 instead of dividing by zero.
    genre_idf=[]
    for genre in genres:
        n_tagged=doc_freq.get(genre,0)
        genre_idf.append(log10(D/n_tagged) if n_tagged else 0.0)
    genre_tf_idf=[]
    for tags in df_movies.genres:
        sp_mat=[idf if genre in tags else 0.0 for genre,idf in zip(genres,genre_idf)]
        genre_tf_idf.append(csr_matrix(sp_mat,dtype='float64',shape=(1,len(genres))))
    df_movies['genres_tf_idf']=genre_tf_idf
    return df_movies

def item_profile(df_ratings,df_movies,genres):
    """Compute per-user genre weights from the movies each user rated >= 3.

    Note: despite the function name, the vectors built here describe users.

    For every user, each rating >= 3 is added to every genre of the rated movie; the
    per-genre totals are then divided by the number of such movies.

    Parameters
    ----------
    df_ratings : DataFrame with 'userId', 'movieId' and 'rating' columns.
    df_movies : DataFrame with 'movieId' and a 'genres' column holding an iterable of
        genre names per row.
    genres : ordered list of genre names; it fixes the column order of every vector.

    Returns
    -------
    (dff, df_user_imp)
        dff : copy of `df_ratings` indexed by ('userId', 'movieId').
        df_user_imp : DataFrame with columns 'userId' and 'genres_imp', where
            'genres_imp' is a 1 x len(genres) csr_matrix. Users with no rating >= 3 on
            a movie present in `df_movies` are omitted.
    """
    dff=df_ratings.set_index(['userId','movieId'])
    rat_gt_3=dff[dff['rating']>=3]
    # Genres are looked up by movieId so each rating is paired with its own movie;
    # pairing by position is only correct when both frames share the same ordering.
    movie_genres=dict(zip(df_movies['movieId'],df_movies['genres']))
    user_imp=dict()
    for userid,user_ratings in rat_gt_3.groupby(level=0,sort=False):
        genre_count=defaultdict(float)
        tot_mov=0
        for (_,movieid),rat in user_ratings['rating'].items():
            tags=movie_genres.get(movieid)
            if tags is None:
                # Rated movie is missing from df_movies: no genres to credit.
                continue
            tot_mov+=1
            for tag in tags:
                genre_count[tag]+=rat
        if tot_mov==0:
            continue
        genre_imp=[genre_count[tag]/tot_mov for tag in genres]
        user_imp[userid]=csr_matrix(genre_imp,dtype='float64',shape=(1,len(genres)))
    df_user_imp=pd.DataFrame(data=list(user_imp.items()),columns=['userId','genres_imp'])
    return dff,df_user_imp

def cal_genre_score(df_movies,df_user_imp,dff,genres,userId):
    """Build the per-movie feature rows for the movies one user has rated.

    Each feature row is the movie's genre TF-IDF vector multiplied element-wise by the
    user's genre weights.

    Parameters
    ----------
    df_movies : DataFrame with 'movieId' and 'genres_tf_idf' (1 x len(genres) sparse rows).
    df_user_imp : DataFrame with 'userId' and 'genres_imp' (1 x len(genres) sparse rows).
    dff : ratings DataFrame indexed by ('userId', 'movieId') with a 'rating' column.
    genres : ordered list of genre names.
    userId : the user to build features for.

    Returns
    -------
    (user_rated_movies, user_genre)
        user_rated_movies : the rows of `df_movies` the user rated, re-indexed from 0,
            with added 'rating' and 'cal_genres_score' columns.
        user_genre : dense (1, len(genres)) array of the user's genre weights.

    Raises
    ------
    IndexError : if `userId` has no row in `df_user_imp`.
    """
    user=df_user_imp[df_user_imp['userId']==userId]
    if user.empty:
        raise IndexError('no genre weights available for userId {!r}'.format(userId))
    user_genre=user['genres_imp'].iloc[0].toarray()
    # Ratings keyed by movieId, so each one is attached to the right movie regardless of
    # the row order of the ratings frame.
    rating_by_movie=dff.loc[userId]['rating'].to_dict()
    # .copy() makes the filtered frame independent of df_movies before columns are added.
    user_rated_movies=df_movies[df_movies['movieId'].isin(list(rating_by_movie))].copy()
    user_rated_movies['rating']=user_rated_movies['movieId'].map(rating_by_movie)
    user_rated_movies.reset_index(inplace=True,drop=True)
    cal_genre=[]
    for genre_imp in user_rated_movies['genres_tf_idf'].values:
        cal_score=genre_imp.toarray()*user_genre
        cal_genre.append(csr_matrix(cal_score,shape=(1,len(genres))))
    user_rated_movies['cal_genres_score']=cal_genre
    return user_rated_movies,user_genre

def random_forest_model(user_rated_movies):
    """Fit a random-forest regressor that predicts one user's rating from genre scores.

    The features are the rows of 'cal_genres_score' and the target is 'rating'. The data
    is split 80/20 with a fixed seed; the model is fitted on the training part and scored
    on the held-out part.

    Parameters
    ----------
    user_rated_movies : DataFrame with a 'cal_genres_score' column of 1 x G sparse rows
        and a numeric 'rating' column.

    Returns
    -------
    (rfr, mse_error) : the fitted RandomForestRegressor and its mean squared error on
        the held-out 20%.
    """
    # ravel() keeps every feature row 1-D even when there is only a single genre column.
    X=np.array([i.toarray().ravel() for i in user_rated_movies['cal_genres_score']])
    Y=np.asarray(user_rated_movies['rating'].values).ravel()
    # Split the data passed in; the train/test arrays must not come from notebook globals,
    # which belong to whichever user was analysed earlier in the session.
    X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)
    rfr=RandomForestRegressor(n_estimators=100,max_features='sqrt',max_depth=10,random_state=0,bootstrap=True)
    rfr.fit(X_train,Y_train)
    mse_error=mse(Y_test,rfr.predict(X_test))
    return rfr,mse_error

def calculate_recommendations(df_movies,dff,rfr,userId,user_genre):
    """Predict the user's rating for every movie the user has not rated.

    Parameters
    ----------
    df_movies : DataFrame with 'movieId', 'title', 'genres' and 'genres_tf_idf'
        (1 x G sparse rows).
    dff : ratings DataFrame indexed by ('userId', 'movieId').
    rfr : fitted model exposing predict(X) for a 2-D array X of shape (n_movies, G).
    userId : the user to score movies for.
    user_genre : the user's genre weights, shape (1, G) or (G,).

    Returns
    -------
    DataFrame with columns 'title', 'genres', 'genres_tf_idf' and 'Rating' (the
    predicted rating), one row per unrated movie, keeping the row labels of `df_movies`.
    Empty when the user has rated every movie.
    """
    seen=list(set(dff.loc[userId].index.values))
    # .copy() makes the slice independent of df_movies before 'Rating' is added.
    to_be=df_movies[~df_movies['movieId'].isin(seen)][['title','genres','genres_tf_idf']].copy()
    if to_be.empty:
        to_be['Rating']=np.array([],dtype='float64')
        return to_be
    user_vec=np.asarray(user_genre,dtype='float64').reshape(1,-1)
    # vstack always yields a 2-D (n_movies, G) matrix; squeezing a stacked array would
    # collapse it to 1-D when exactly one movie is left.
    X_to_be=np.vstack([i.toarray()*user_vec for i in to_be['genres_tf_idf']])
    to_be['Rating']=rfr.predict(X_to_be)
    return to_be

def recommender_model(df_movies,df_ratings,userId,top):
    """Recommend the `top` unseen movies with the highest predicted rating for one user.

    Parameters
    ----------
    df_movies : DataFrame with 'movieId', 'title' and 'genres'; 'genres' is either a
        pipe-delimited string or an already split list.
    df_ratings : DataFrame with 'userId', 'movieId' and 'rating'.
    userId : the user to recommend for.
    top : number of recommendations to return.

    Returns
    -------
    DataFrame with columns 'title', 'genres' and 'Rating', sorted by predicted rating in
    descending order, with at most `top` rows.
    """
    # Work on a copy so the caller's frame keeps its raw genre strings and the function
    # can be called again with the same input.
    df_movies=df_movies.copy()
    df_movies['genres']=df_movies['genres'].apply(lambda x:x.split('|') if isinstance(x,str) else x)
    genres=sorted({j for i in df_movies.genres for j in i})
    df_movies=user_profile(df_movies,genres)
    dff,df_user_imp=item_profile(df_ratings,df_movies,genres)
    user_rated_movies,user_genre=cal_genre_score(df_movies,df_user_imp,dff,genres,userId)
    # The hold-out error returned alongside the model is not used here.
    rfr,_=random_forest_model(user_rated_movies)
    to_be=calculate_recommendations(df_movies,dff,rfr,userId,user_genre)
    # iloc[0:top] yields exactly `top` rows; an end bound of top+1 returns one too many.
    top_movies=to_be.sort_values(by=['Rating'],ascending=False).iloc[0:top][['title','genres','Rating']]
    return top_movies

# Entry point: load the movie and rating CSVs, read a user id and the number of
# recommendations from standard input, then render the recommended titles with their
# predicted ratings as a table.
if __name__=='__main__':
    df_movies=pd.read_csv('movies.csv')
    df_ratings=pd.read_csv('ratings.csv')
    print('UserId:')
    # int() raises ValueError if the entered text is not an integer.
    userId=int(input())
    print('Top Movies?')
    top=int(input())
    # Only the title and predicted rating are shown; the genres column is dropped here.
    top_movies=recommender_model(df_movies,df_ratings,userId,top)[['title','Rating']]
    ff.create_table(top_movies,colorscale=[[0,'#e80000'],[0.5,'#fff86e'],[1,'#a8f5ff']]).iplot()

UserId:
600
Top Movies?
10
