In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# importing the required libraries
import pandas as pd
import numpy as np


In [3]:
# Importing the dataset
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
# This file contains the data of the ratings that a user has given to a movie
ratings = pd.read_csv('drive/My Drive/mini_project_mov/u.data', sep='\t', names=r_cols,encoding='latin-1') #  Rating file contains different users with the ratings they have given to various movies

i_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# This file contains the Genre of all the movies in the ratings dataset
items = pd.read_csv('drive/My Drive/mini_project_mov/u.item', sep='|', names=i_cols,encoding='latin-1') # items file contains movie details,  like their title, genre, etc  


## PRE-PROCESING OF THE DATA

# sorting in ascending order in accordance with movie_id
ratings = ratings.sort_values('movie_id') 
# removing the coloumns with non-usefull feature
keep_col = ['movie_id','movie title','Action','Adventure','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']

# counts number of raters for each movie (used for Popularity based(Trending Movies))
count = ratings.groupby("movie_id", as_index=False).count()
# Finding the average rating for each movie by finding the mean 
mean = ratings.groupby("movie_id", as_index=False).mean() 



In [4]:
## CLEANING OF THE DATA
rating_new = items[keep_col]
# Inserting a column for average rating for all the movies in the Dataframe
rating_new= pd.merge(rating_new,mean)

rating_new['avg_rating']=rating_new['rating']
keep_col_3 = ['movie_id','movie title','Action','Adventure','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','avg_rating']
rating_new = rating_new[keep_col_3]
rating_new['rating_count'] = count['rating']

count_2 =count
count_2['avg_rating'] = rating_new['avg_rating']

#rating_new = rating_new[rating_new.rating_count >= 10] # taking only those movies which has been rated by more than 10 users

# Keeping only those movies that are rated by 10 or more users
count_2 = count[count.user_id >= 10]
count_2 = count_2.drop(['user_id', 'rating','unix_timestamp'], axis='columns')
count_2 = pd.merge(count_2,items)
count_2 = count_2.drop(['release date', 'video release date','IMDb URL','unknown'], axis='columns')
count_3 =count_2.iloc[:,:]
count_3=count_3[['movie_id','movie title','Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','avg_rating']]



In [5]:
## To check the Genre
check = 'Copycat (1995)'
for i in range(1152):
  if (count_3['movie title'][i] == check):
    print(count_3.loc[i])
    break;
  if (i==1151):
    if (count_3['movie title'][i] != check):
      print('Invalid Movie Name')

movie_id                    5
movie title    Copycat (1995)
Action                      0
Adventure                   0
Animation                   0
Children's                  0
Comedy                      0
Crime                       1
Documentary                 0
Drama                       1
Fantasy                     0
Film-Noir                   0
Horror                      0
Musical                     0
Mystery                     0
Romance                     0
Sci-Fi                      0
Thriller                    1
War                         0
Western                     0
avg_rating            3.30233
Name: 4, dtype: object


In [6]:
## Content Based using Euclidean Distance inbuild library

R_euc = np.zeros([1152, 19])
for j in range(1152):
    R_euc[j][:]= count_3.iloc[j,2:21]

R_euc = pd.DataFrame(R_euc,index=count_3.index,columns = ['Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','avg_rating'])
R_euc['movie_title']=count_3['movie title']
X_euc = R_euc.iloc[:, 0:18].values
Y_euc = np.zeros([1152, 1])
from sklearn.metrics.pairwise import euclidean_distances

# Function for Content-based Filtering
def content_based_euc(movie_input,R_euc):
    def get_similar_euc(movie_input):
        for  i in range(1152):
            aa_euc = X_euc[i].reshape(1,18)
            ba_euc = movie_input.reshape(1,18)
            Y_euc[i]=euclidean_distances(aa_euc,ba_euc)             
            
        return Y_euc
    
    R_euc['euclidean distance']=get_similar_euc(movie_input)
    recom_movie_euc = R_euc[['movie_title','euclidean distance']]
           
    return recom_movie_euc

In [7]:
 
## ENTER A MOVIE that YOU HAVE WATCHED
watched_movie_euc='Toy Story (1995)'  
movie_input_euc=[]
dummy_euc = R_euc.iloc[:,:].values
for i in range(len(dummy_euc)):
    if dummy_euc[i][19] == watched_movie_euc:
        for j in range(18):
            movie_input_euc.append(dummy_euc[i][j])

watched_movie_euc=np.array(movie_input_euc)
movie_for_you_euc = content_based_euc(watched_movie_euc,R_euc)

Top_movie_for_you_euc=movie_for_you_euc.sort_values(ascending=True, by='euclidean distance')
Top_movie_for_you_euc =  Top_movie_for_you_euc.movie_title

print("Recommendations On the basis of your history(already watched movie) by using Euclidean Distance: ")
print("        "+Top_movie_for_you_euc.head(5))



Recommendations On the basis of your history(already watched movie) by using Euclidean Distance: 
0                                    Toy Story (1995)
410            Aladdin and the King of Thieves (1996)
235                              Jungle2Jungle (1997)
232            Beavis and Butt-head Do America (1996)
217                             101 Dalmatians (1996)
Name: movie_title, dtype: object


In [8]:

## CONTENT BASED FILTERING using other method
R = np.zeros([1152, 19]) # creating an zero valued array with the mentioned size

# About this loop: Wherever there is 1 in the genre field, replace it with the desired movie's average rating

for j in range(1152):
    R[j][:]= count_3.iloc[j,2:21]

R = pd.DataFrame(R,index=count_3.index,columns = ['Action','Adventure','Animation','Children\'s','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','avg_rating'])

X = R.iloc[:, 0:18].values
Y = np.zeros([1152, 1])
# Finding the Euclidean distance vector wise for a given item vector with all the vectors in the R
#from sklearn.metrics.pairwise import euclidean_distances

def content_based(movie_input,R,count_3):
    def get_similar(movie_input):
        for  i in range(1152):
            aa = X[i].reshape(1,18)
            ba = movie_input.reshape(1,18)
            for j in range(16):
                if ba[0][j]==1:
                    if aa[0][j]==1:
                        Y[i] = 1             
            
        return Y
    
    R['similar_movie_id']=get_similar(movie_input)
    recom_movie = R[['similar_movie_id','avg_rating']]
    m = recom_movie.iloc[:,:].values
    n =  count_3.iloc[:,:].values
    contenRecomm=[]
    contentRecommRating=[]
    #contentRecommRating=[]
    for i in range(1152):
        if m[i][0]==1:
            contenRecomm.append(n[i][1])
            contentRecommRating.append(n[i][20])
           
    return contenRecomm, contentRecommRating     


In [9]:

    ## ENTER A MOVIE YOU HAVE WATCHED
watched_movie='Toy Story (1995)'  
movie_input=[]
dummy = count_3.iloc[:,:].values
for i in range(len(dummy)-1):
    if dummy[i][1] == watched_movie:
        for j in range(18):
            movie_input.append(dummy[i][j+2])
            
movie_input=np.array(movie_input)   # input a vector which has movies's genre details with their average rating
movie_for_you,movie_for_you_rating = content_based(movie_input,R,count_3)

Top_Movie_For_You_1 =[[]]*len(movie_for_you)
for i in range(len(movie_for_you)):
    Top_Movie_For_You_1[i]=(movie_for_you[i])


Top_Movie_For_You_2 =[[]]*len(movie_for_you)
for i in range(len(movie_for_you)):
    Top_Movie_For_You_2[i]=(movie_for_you_rating[i])
    
Top_movie_for_you=pd.DataFrame()    
Top_movie_for_you['movie']=Top_Movie_For_You_1
Top_movie_for_you['rating']=Top_Movie_For_You_2

Top_movie_for_you=Top_movie_for_you.sort_values(ascending=False, by='rating')
Top_movie_for_you =  Top_movie_for_you.movie

print("Recommendations On the basis of your history(already watched movie): ")
for i in range(5):
    print("        "+Top_movie_for_you[i])



Recommendations On the basis of your history(already watched movie): 
        Toy Story (1995)
        Get Shorty (1995)
        Babe (1995)
        Mighty Aphrodite (1995)
        French Twist (Gazon maudit) (1995)


In [10]:

 
## COLLABORATIVE FILTERING ( item-item)

ratings_new2 = pd.merge(ratings,items)
keep_col1 = ['movie_id','movie title','user_id','rating']
ratings_new2=ratings_new2[keep_col1]

userRatings = ratings_new2.pivot_table(index=['user_id'],columns=['movie title'],values='rating')
userRatings = userRatings.dropna(thresh=10, axis=1).fillna(0,axis=1)

def standardize(row): # standardizing to find the coorelation btw the movie's ratings
    new_row = (row - row.mean())/(row.max()-row.mean())
    return new_row

userRating_std = userRatings.apply(standardize) # applying this standardize function to the userrating dataframe

from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(userRating_std.T) # using cosine similarity to find the similarity between the movies on the basis of their ratings

item_similarity_df = pd.DataFrame(item_similarity,index=userRatings.columns,columns = userRatings.columns)

# This function will find the similarity between the given movie and the Dataframe which we created above and show thenm in descending order
def get_similar(movie_name,rating):
    similar_ratings = item_similarity_df[movie_name]*(rating-2.5)
    similar_ratings = similar_ratings.sort_values(ascending=False)
    return similar_ratings



In [11]:
# Enter a movie
enter_movie="Toy Story (1995)"
enter_rating=4
sim_try = get_similar(enter_movie,enter_rating)
sim_try=sim_try.index
print("Recommendations for the movie", enter_movie ," when the user give the rating to this movie as", enter_rating, ":     ")
for i in range(5):
    print("        "+sim_try[i])


Recommendations for the movie Toy Story (1995)  when the user give the rating to this movie as 4 :     
        Toy Story (1995)
        Star Wars (1977)
        Independence Day (ID4) (1996)
        Rock, The (1996)
        Willy Wonka and the Chocolate Factory (1971)


In [12]:

# entering a user who has rated more then 1 movie and checking the recommendation
dummy_user = [("GoldenEye (1995)",5),("Toy Story (1995)",4),("Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)",2)]
similar_movies = pd.DataFrame()

for movie,rating in dummy_user:
    similar_movies = similar_movies.append(get_similar(movie,rating),ignore_index = True)

#similar_movies.head()

similar_movies = similar_movies.sum().sort_values(ascending=False).head(10)
similar_movies =  similar_movies.index
print("User History: ", dummy_user)
print("Recommendations for the user On the basis  of his/her older ratings: ")
for i in range(5):
    print("        "+similar_movies[i])

  

User History:  [('GoldenEye (1995)', 5), ('Toy Story (1995)', 4), ('Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 2)]
Recommendations for the user On the basis  of his/her older ratings: 
        GoldenEye (1995)
        Toy Story (1995)
        Under Siege (1992)
        Top Gun (1986)
        Batman (1989)


In [13]:
  ## Popularity Based
    # checking the most rated movie (On the based of number of Raters)
count_1 = pd.merge(count,items)
count_1['No_of_raters']=count_1['user_id']
keep_col4= ['movie title','No_of_raters']
count_1=count_1[keep_col4]
count_1['movie_title']=count_1['movie title']
count_1=count_1.drop('movie title',axis='columns')

pop_movies = count_1.sort_values(ascending=False, by='No_of_raters')
pop_movies=pop_movies.movie_title
# The movie which is rated by most of the users shows that it's a trending movie
print("Trending Movies: ")
for i in range(5):
    print("        "+pop_movies[i])


Trending Movies: 
        Toy Story (1995)
        GoldenEye (1995)
        Four Rooms (1995)
        Get Shorty (1995)
        Copycat (1995)


In [14]:
print('Thank You!')

Thank You!
