### Drive Mounter

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Preprocessing

In [3]:
fldr="drive/My Drive/movie recommendation system/Movielens"

In [3]:
import os
os.listdir(fldr)

['genome_scores.csv',
 'rating.csv',
 'genome_tags.csv',
 'movie.csv',
 'link.csv',
 'tag.csv']

In [4]:
import pandas as pd
df_ratings=pd.read_csv(fldr+'/rating.csv')

In [5]:
print(df_ratings.head())

   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40


In [6]:
df_ratings_2=df_ratings.drop(['timestamp'],axis=1)

In [7]:
print(df_ratings_2.head())

   userId  movieId  rating
0       1        2     3.5
1       1       29     3.5
2       1       32     3.5
3       1       47     3.5
4       1       50     3.5


In [8]:
print(len(df_ratings_2['userId'].unique()))

138493


In [9]:
print(len(df_ratings_2['movieId'].unique()))

26744


In [10]:
df_ratings_2['userId'].value_counts(ascending=True)

58028       20
80291       20
34668       20
23558       20
59390       20
          ... 
125794    5491
121535    5520
82418     5646
8405      7515
118205    9254
Name: userId, Length: 138493, dtype: int64

In [11]:
df_movie=pd.read_csv(fldr+'/movie.csv')

In [12]:
print(df_movie.head())

   movieId  ...                                       genres
0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
1        2  ...                   Adventure|Children|Fantasy
2        3  ...                               Comedy|Romance
3        4  ...                         Comedy|Drama|Romance
4        5  ...                                       Comedy

[5 rows x 3 columns]


In [13]:
print(df_movie.columns)

Index(['movieId', 'title', 'genres'], dtype='object')


### Content Based Filtering

In [14]:
merged=df_movie.merge(df_ratings_2,on='movieId')

In [15]:
len(merged)

20000263

In [16]:
merged.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating'], dtype='object')

In [17]:
print(merged.head())

   movieId             title  ... userId  rating
0        1  Toy Story (1995)  ...      3     4.0
1        1  Toy Story (1995)  ...      6     5.0
2        1  Toy Story (1995)  ...      8     4.0
3        1  Toy Story (1995)  ...     10     4.0
4        1  Toy Story (1995)  ...     11     4.5

[5 rows x 5 columns]


In [18]:
ratings_sum=merged.groupby(['movieId']).sum()['rating']

In [19]:
len(ratings_sum)

26744

In [20]:
rating_d=ratings_sum.to_frame()

In [21]:
rating_n=rating_d.reset_index()

In [22]:
print(rating_n.head())

   movieId    rating
0        1  194866.0
1        2   71444.0
2        3   40128.5
3        4    7886.0
4        5   37268.5


In [23]:
df_movie.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [24]:
len(df_movie)

27278

In [25]:
final=df_movie.merge(rating_n,on='movieId')

In [26]:
print(final.head())

   movieId  ...    rating
0        1  ...  194866.0
1        2  ...   71444.0
2        3  ...   40128.5
3        4  ...    7886.0
4        5  ...   37268.5

[5 rows x 4 columns]


In [27]:
final.columns

Index(['movieId', 'title', 'genres', 'rating'], dtype='object')

In [28]:
print(final['rating'].min())

0.5


In [29]:
print(final['rating'].max())

281788.0


In [30]:
print(final['rating'].isnull().sum())

0


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1)
gen = vectorizer.fit_transform(final['genres'])

In [41]:
names=[]
i=0
while i<len(final):
   names.append(final.iloc[i]['title'].split(' (')[0])
   i+=1

final['title_mod']=names


In [42]:
vectorizer.get_feature_names()[0:10]

['action',
 'action adventure',
 'action animation',
 'action children',
 'action comedy',
 'action crime',
 'action documentary',
 'action drama',
 'action fantasy',
 'action film']

Linear kernel means cosine Similarity

In [33]:
from sklearn.metrics.pairwise import linear_kernel
model = linear_kernel(gen, gen)

In [48]:
final.columns

Index(['movieId', 'title', 'genres', 'rating', 'title_mod'], dtype='object')

In [58]:
def get_recommendations(title):
    #print("a")
    indices = pd.Series(final.index, index=final['title_mod']).drop_duplicates()
    #print("b")
    idx = indices[title]
    scores = list(enumerate(model[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    #print(scores)
    scores = scores[1:11]
    movie_indices = [i[0] for i in scores]
    return final['title_mod'].iloc[movie_indices]

In [59]:
get_recommendations('Toy Story')

2209                                                 Antz
3027                                          Toy Story 2
3663              Adventures of Rocky and Bullwinkle, The
3922                            Emperor's New Groove, The
4790                                       Monsters, Inc.
10106    DuckTales: The Movie - Treasure of the Lost Lamp
10978                                           Wild, The
11861                                     Shrek the Third
13325                             Tale of Despereaux, The
18230                             Asterix and the Vikings
Name: title_mod, dtype: object

In [60]:
get_recommendations('Before and After')

111             Before and After
809                Kaspar Hauser
906                 Citizen Kane
1108            Jean de Florette
1324          Breaking the Waves
1635                     Amistad
1829      Picnic at Hanging Rock
1866    In the Heat of the Night
1880                       Klute
1940                Rapture, The
Name: title_mod, dtype: object

We can create content based filtering to filter based on cast and director. We just need to append all of them in a single feature.

So, [action,comedy,director name, actor name] and then use count vectorizer not tfidf because here we want to focus if a particular actor or director appeared in multple movies. Then use cosine_similarity to predict