<a href="https://colab.research.google.com/github/aloysiusw/movierecsys-sim-matrix/blob/main/MovieRecSys_DataMining_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

Import the libraries used throughout the notebook.

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics import jaccard_score 


from itertools import chain

# Part 1 - Pre-Processing and Data Preparation

# 1.1 - Data Import and Check

In [None]:
#Read the csv
movieOriginalDF = pd.read_csv('/content/mymoviedb.csv', lineterminator='\n') #not using lineterminator='\n' outputs an error

In [None]:
#Check the upper part of the data
movieOriginalDF.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [None]:
movieOriginalDF.shape

(9827, 9)

In [None]:
#Sort data by title in alphabetical order
movieOriginalDF  = movieOriginalDF.sort_values(by='Title')

In [None]:
#Remove duplicates based on the plot summary
movieOriginalDF = movieOriginalDF.drop_duplicates(subset=['Overview'], keep='first')

In [None]:
#Resets index in sorted dataframe to prevent future issues
movieOriginalDF.reset_index(drop=True, inplace=True)

In [None]:
#Check number of unique values
movieOriginalDF.Genre.nunique()

2337

# 1.2 - Processing the genres

Each movie's genres are stored in a single column as one ", "-separated string, so they need to be split into individual genres and encoded as separate 0/1 columns before the data is workable.

In [None]:
#Splits all the genres in the dataset into unique values into a new dataframe
movieGenreDF = movieOriginalDF.Genre.str.split(r", ", expand=True) #expand=True makes it into a dataframe instead of a series/list

In [None]:
#Change the column names into something we can read
movieGenreDF.columns=['A','B','C','D','E','F','G','H']

In [None]:
#Check the upper 5 values of that new genre dataframe
movieGenreDF.head()

Unnamed: 0,A,B,C,D,E,F,G,H
0,Action,Horror,Thriller,,,,,
1,Documentary,Drama,History,,,,,
2,Comedy,,,,,,,
3,Comedy,Drama,Romance,,,,,
4,Science Fiction,Comedy,Family,Fantasy,,,,


In [None]:
#Attach "Title" (reference column) and "Overview" (additional key, because some titles are duplicated).
#The genre frame kept the index of movieOriginalDF, so an index-aligned join keeps the rows in the correct order.
movieGenreDF = movieGenreDF.join(movieOriginalDF[['Title', 'Overview']])

In [None]:
#Check that the joining was performed correctly
movieGenreDF.head()

Unnamed: 0,A,B,C,D,E,F,G,H,Title,Overview
0,Action,Horror,Thriller,,,,,,#Alive,"As a grisly virus rampages a city, a lone man ..."
1,Documentary,Drama,History,,,,,,#AnneFrank. Parallel Stories,One single Anne Frank moves us more than the c...
2,Comedy,,,,,,,,#realityhigh,When nerdy high schooler Dani finally attracts...
3,Comedy,Drama,Romance,,,,,,(500) Days of Summer,"Tom, greeting-card writer and hopeless romanti..."
4,Science Fiction,Comedy,Family,Fantasy,,,,,*batteries not included,In a soon to be demolished block of apartments...


In [None]:
#Gather the unique genres of every split column (A..H) into one flat array
uniqueGenresPreConv = np.hstack(
    [movieGenreDF[slot].unique() for slot in 'ABCDEFGH']
)
#A genre can show up in several columns, so this list still contains duplicates
print(uniqueGenresPreConv)

['Action' 'Documentary' 'Comedy' 'Science Fiction' 'Crime' 'Thriller'
 'Adventure' 'Drama' 'Animation' 'Family' 'Horror' 'History' 'War'
 'Fantasy' 'Romance' 'Western' 'Music' 'TV Movie' 'Mystery' 'Horror'
 'Drama' None 'Comedy' 'Science Fiction' 'Action' 'Romance' 'Adventure'
 'Family' 'Thriller' 'History' 'Fantasy' 'Crime' 'Mystery' 'Music'
 'Animation' 'TV Movie' 'Western' 'War' 'Documentary' 'Thriller' 'History'
 None 'Romance' 'Family' 'Drama' 'Action' 'Fantasy' 'Comedy' 'Crime' 'War'
 'Horror' 'Adventure' 'Science Fiction' 'Mystery' 'Animation' 'TV Movie'
 'Music' 'Western' 'Documentary' None 'Fantasy' 'Horror' 'Mystery'
 'Animation' 'Crime' 'History' 'Thriller' 'Science Fiction' 'Romance'
 'Comedy' 'Adventure' 'Family' 'Music' 'Action' 'Drama' 'TV Movie'
 'Western' 'War' 'Documentary' None 'Drama' 'War' 'Science Fiction'
 'Mystery' 'Thriller' 'Adventure' 'Family' 'Fantasy' 'Comedy' 'Romance'
 'Horror' 'History' 'Animation' 'Music' 'Action' 'Crime' 'TV Movie'
 'Western' None 'Thr

In [None]:
#Convert to set and to remove duplicates and then back to list
uniqueGenres = list(set(uniqueGenresPreConv))
print(uniqueGenres)

['Fantasy', 'Western', 'Mystery', 'Adventure', 'TV Movie', 'Comedy', 'Family', 'Drama', None, 'Horror', 'War', 'Thriller', 'Music', 'Science Fiction', 'Romance', 'Crime', 'Documentary', 'Animation', 'Action', 'History']


In [None]:
#Remove "None" value
uniqueGenresNoNA = list(filter(None, uniqueGenres))

In [None]:
#Sort the list
uniqueGenresNoNA.sort()

In [None]:
#List of all genres, without duplicates
print(uniqueGenresNoNA)

['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']


In [None]:
#Create columns with the list, and fill it with "0"
movieGenreDF[uniqueGenresNoNA] = "0"

In [None]:
#Checking and checking and checking
movieGenreDF.head()

Unnamed: 0,A,B,C,D,E,F,G,H,Title,Overview,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Action,Horror,Thriller,,,,,,#Alive,"As a grisly virus rampages a city, a lone man ...",...,0,0,0,0,0,0,0,0,0,0
1,Documentary,Drama,History,,,,,,#AnneFrank. Parallel Stories,One single Anne Frank moves us more than the c...,...,0,0,0,0,0,0,0,0,0,0
2,Comedy,,,,,,,,#realityhigh,When nerdy high schooler Dani finally attracts...,...,0,0,0,0,0,0,0,0,0,0
3,Comedy,Drama,Romance,,,,,,(500) Days of Summer,"Tom, greeting-card writer and hopeless romanti...",...,0,0,0,0,0,0,0,0,0,0
4,Science Fiction,Comedy,Family,Fantasy,,,,,*batteries not included,In a soon to be demolished block of apartments...,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#Fill the one-hot genre columns: '1' when the genre appears in any of the split columns A..H, otherwise '0'.
#This is done one genre at a time on whole columns instead of visiting every cell with iterrows()/.at,
#which needed rows x genres x 8 scalar look-ups. The values stay the strings '0'/'1', as before.
genreSlots = movieGenreDF[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']]
for genre in uniqueGenresNoNA:
  #eq() treats empty slots (None) as "not equal", so movies with fewer than 8 genres simply yield False there
  movieGenreDF[genre] = np.where(genreSlots.eq(genre).any(axis=1), '1', '0')

In [None]:
movieGenreDF.head()

Unnamed: 0,A,B,C,D,E,F,G,H,Title,Overview,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Action,Horror,Thriller,,,,,,#Alive,"As a grisly virus rampages a city, a lone man ...",...,0,1,0,0,0,0,0,1,0,0
1,Documentary,Drama,History,,,,,,#AnneFrank. Parallel Stories,One single Anne Frank moves us more than the c...,...,1,0,0,0,0,0,0,0,0,0
2,Comedy,,,,,,,,#realityhigh,When nerdy high schooler Dani finally attracts...,...,0,0,0,0,0,0,0,0,0,0
3,Comedy,Drama,Romance,,,,,,(500) Days of Summer,"Tom, greeting-card writer and hopeless romanti...",...,0,0,0,0,1,0,0,0,0,0
4,Science Fiction,Comedy,Family,Fantasy,,,,,*batteries not included,In a soon to be demolished block of apartments...,...,0,0,0,0,0,1,0,0,0,0


In [None]:
#Delete the now unused columns, including Title because the overview will be used as the key
movieGenreDF = movieGenreDF.drop(columns=['Title','A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])

# 1.3 Recombining the dataframe

In [None]:
#Recombine the two dataframes
movieCombinedDF = movieOriginalDF.merge(right=movieGenreDF, left_on='Overview', right_on='Overview', how='inner', copy=False)

In [None]:
#Remove unused columns, genre is now redundant and poster_url is unused
movieCombinedDF = movieCombinedDF.drop(columns=['Genre','Poster_Url'])

In [None]:
#Always check the data to see nothing is wrong
movieCombinedDF.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,2020-06-24,#Alive,"As a grisly virus rampages a city, a lone man ...",28.395,1210,7.3,ko,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,2019-10-18,#AnneFrank. Parallel Stories,One single Anne Frank moves us more than the c...,14.116,35,7.0,en,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2017-07-17,#realityhigh,When nerdy high schooler Dani finally attracts...,24.12,943,6.4,en,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2009-07-17,(500) Days of Summer,"Tom, greeting-card writer and hopeless romanti...",49.96,8198,7.3,en,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1987-12-18,*batteries not included,In a soon to be demolished block of apartments...,18.534,534,6.7,en,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
#Drop any duplicate entries that may have occurred in the merge process
movieFinalDF = movieCombinedDF.drop_duplicates(subset='Overview', keep='first', inplace=False, ignore_index=True)

In [None]:
#Checking as usual
movieFinalDF.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,2020-06-24,#Alive,"As a grisly virus rampages a city, a lone man ...",28.395,1210,7.3,ko,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,2019-10-18,#AnneFrank. Parallel Stories,One single Anne Frank moves us more than the c...,14.116,35,7.0,en,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2017-07-17,#realityhigh,When nerdy high schooler Dani finally attracts...,24.12,943,6.4,en,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2009-07-17,(500) Days of Summer,"Tom, greeting-card writer and hopeless romanti...",49.96,8198,7.3,en,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1987-12-18,*batteries not included,In a soon to be demolished block of apartments...,18.534,534,6.7,en,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
#Save to a new .csv so we don't have to start over when the runtime restarts.
#movieFinalDF is the de-duplicated frame; writing movieCombinedDF here would silently discard that step.
movieFinalDF.to_csv('mymoviedb-preprocessed.csv', encoding='utf-8', index=False)

# Part 2 - Implementing Algorithms

# 2.1 Algorithm 1

In [None]:
#Read the csv
movieDF = pd.read_csv('/content/mymoviedb-preprocessed.csv', lineterminator='\n') #not using lineterminator='\n' outputs an error

In [None]:
#Build the genre matrix used by the Jaccard functions: one row per genre, one column per movie
#(the transpose of the 0/1 genre columns of movieDF, so both frames are aligned by construction).
#It used to be read from '/content/mymoviedb-encoded.csv', but the only cell that writes that file
#is commented out, so a fresh runtime had nothing to load.
genreStart = movieDF.columns.get_loc('Original_Language') + 1 #genre columns follow the metadata columns
genreColumns = movieDF.columns[genreStart:].drop('Text', errors='ignore') #'Text' only exists if section 2.2 already ran
movieEncoded = movieDF[genreColumns].transpose().reset_index(drop=True)

In [None]:
movieEncoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9812,9813,9814,9815,9816,9817,9818,9819,9820,9821
0,1,0,0,0,0,0,0,1,0,1,...,0,0,1,1,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,1,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,1,1,1,1,0
4,0,0,0,0,0,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,1,0,1,1,0,1,1,...,1,1,0,0,0,1,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#movieEncoded.to_csv('mymoviedb-encoded.csv', encoding='utf-8', index=False)

In [None]:
"""
def GetJaccardSimilarity(title, titleSecond):
  print(title)
  index = movieDF.index[movieDF['Title']==title]
  A = movieEncoded.iloc[:,index].values.tolist()
  A = list(chain.from_iterable(A))
  A = [int(x) for x in A]
  print(A)

  print(titleSecond)
  indexSecond = movieDF.index[movieDF['Title']==titleSecond]
  B = movieEncoded.iloc[:,indexSecond].values.tolist()
  B = list(chain.from_iterable(B))
  B = [int(x) for x in B]
  print(B)

  jacScore = jaccard_score(A,B)

  return jaccard_score(A,B)
"""

"\ndef GetJaccardSimilarity(title, titleSecond):\n  print(title)\n  index = movieDF.index[movieDF['Title']==title]\n  A = movieEncoded.iloc[:,index].values.tolist()\n  A = list(chain.from_iterable(A))\n  A = [int(x) for x in A]\n  print(A)\n\n  print(titleSecond)\n  indexSecond = movieDF.index[movieDF['Title']==titleSecond]\n  B = movieEncoded.iloc[:,indexSecond].values.tolist()\n  B = list(chain.from_iterable(B))\n  B = [int(x) for x in B]\n  print(B)\n\n  jacScore = jaccard_score(A,B)\n\n  return jaccard_score(A,B)\n"

In [None]:
def GetJaccardSimilarity(title, indexSecond):
  """Return the Jaccard similarity between the genre vectors of two movies.

  title: title of the query movie, looked up in movieDF['Title'].
  indexSecond: integer column position in movieEncoded of the movie to compare against.

  Returns the result of jaccard_score for the two 0/1 genre vectors.
  Raises ValueError when title does not occur in movieDF.
  """
  positions = np.flatnonzero((movieDF['Title'] == title).to_numpy())
  if len(positions) == 0:
    raise ValueError('Title not found in movieDF: ' + repr(title))

  #Titles are not unique in the dataset. Selecting every match at once would
  #concatenate several genre vectors into one over-long list and make
  #jaccard_score fail on the length mismatch, so only the first match is used.
  queryVector = [int(x) for x in movieEncoded.iloc[:, positions[0]].tolist()]
  otherVector = [int(y) for y in movieEncoded.iloc[:, indexSecond].tolist()]

  return jaccard_score(queryVector, otherVector)

In [None]:
print(movieEncoded.shape)
print(movieDF.shape)

(19, 9822)
(9822, 26)


In [None]:
GetJaccardSimilarity('The Batman',5)

0.5

In [None]:
def GetRecommendationsJaccard(title, simTolerance):
  """Recommend movies whose genre set is similar to that of the queried title.

  Every movie in movieDF other than the queried title is scored with
  GetJaccardSimilarity; the ones scoring at least simTolerance are returned.

  title: title of the query movie (must occur in movieDF['Title']).
  simTolerance: minimum Jaccard score (inclusive) for a movie to be recommended.

  Returns a DataFrame with the columns Recommendation, Summary, Similarity,
  Popularity and Average Rating, sorted by Popularity in descending order.
  The frame is empty when no movie reaches the threshold.
  Raises ValueError when title does not occur in movieDF.
  """
  if not (movieDF['Title'] == title).any():
    raise ValueError('Title not found in movieDF: ' + repr(title))

  print('Queried key:',title)
  print('Similarity threshold:',simTolerance)

  resultColumns = ['Recommendation','Summary','Similarity','Popularity','Average Rating']
  records = []
  #Top recommendation so far as (score, popularity, position, title); None until one qualifies
  best = None

  #GetJaccardSimilarity addresses movies by position, so count rows explicitly
  for position, (_, row) in enumerate(movieDF.iterrows()):
    candidateTitle = row['Title']
    if candidateTitle == title:
      print('New key (' + candidateTitle + ') is identical to original key (' + title + ')\n')
      continue

    jaccardScore = GetJaccardSimilarity(title, position)
    if jaccardScore < simTolerance:
      continue

    #Record the score of this movie itself. Storing the running maximum here
    #produced rows that showed 0.0 even though they had passed the threshold.
    records.append([candidateTitle, row['Overview'], jaccardScore, row['Popularity'], row['Vote_Average']])

    #Rank by similarity first and use popularity only to break ties
    if best is None or (jaccardScore, row['Popularity']) > (best[0], best[1]):
      if best is not None:
        print('Old key:',best[3])
        print('New key:',candidateTitle,'\n')
      best = (jaccardScore, row['Popularity'], position, candidateTitle)

  if best is None:
    print("No title reached the similarity threshold")
  else:
    print("Max Jaccard score is",best[0])
    print("Most recommended title:",best[3])
    print("Index at",best[2])

  #Build the frame once from the collected rows instead of inserting row by row
  recommendedMovies = pd.DataFrame(records, columns=resultColumns)
  recommendedMovies = recommendedMovies.sort_values(['Popularity'], ascending=False)
  recommendedMovies.reset_index(drop=True, inplace=True)
  return recommendedMovies

In [None]:
GetRecommendationsJaccard('Spider-Man: No Way Home', 0.9)

Queried key: Spider-Man: No Way Home
Similarity threshold: 0.9
Old key: #Alive
New key: 2012 

Old key: 2012
New key: Avengers: Age of Ultron 

Old key: Avengers: Age of Ultron
New key: Avengers: Endgame 

Old key: Avengers: Endgame
New key: Avengers: Infinity War 

New key (Spider-Man: No Way Home) is identical to original key (Spider-Man: No Way Home)

Old key: Avengers: Infinity War
New key: The Matrix Resurrections 

Old key: The Matrix Resurrections
New key: Venom: Let There Be Carnage 

Max Jaccard score is 1.0
Most recommended title: Venom: Let There Be Carnage
Index at 9405


Unnamed: 0,Recommendation,Summary,Similarity,Popularity,Average Rating
0,Venom: Let There Be Carnage,After finding a host body in investigative rep...,1.0,1053.615,7.1
1,The Matrix Resurrections,"Plagued by strange memories, Neo's life takes ...",1.0,941.024,6.8
2,Avengers: Infinity War,As the Avengers and their allies have continue...,1.0,338.402,8.3
3,Black Widow,"Natasha Romanoff, also known as Black Widow, c...",1.0,337.651,7.5
4,Moonfall,A mysterious force knocks the moon from its or...,1.0,328.678,5.9
...,...,...,...,...,...
102,"Ghidorah, the Three-Headed Monster",A meteor lands in Kurobe Valley as detective S...,1.0,14.739,7.2
103,The Last Starfighter,A video game expert Alex Rogan finds himself t...,1.0,14.529,6.6
104,Jurassic Galaxy,"In the near future, a ship of space explorers ...",1.0,14.482,5.2
105,Godzilla Raids Again,Two fishing scout pilots make a horrifying dis...,1.0,14.369,6.1


# 2.2 Algorithm 2

In [None]:
#Read the csv
movieDF = pd.read_csv('/content/mymoviedb-preprocessed.csv', lineterminator='\n') #not using lineterminator='\n' outputs an error

In [None]:
#Check the dataframe
movieDF.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,2020-06-24,#Alive,"As a grisly virus rampages a city, a lone man ...",28.395,1210,7.3,ko,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,2019-10-18,#AnneFrank. Parallel Stories,One single Anne Frank moves us more than the c...,14.116,35,7.0,en,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2017-07-17,#realityhigh,When nerdy high schooler Dani finally attracts...,24.12,943,6.4,en,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2009-07-17,(500) Days of Summer,"Tom, greeting-card writer and hopeless romanti...",49.96,8198,7.3,en,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1987-12-18,*batteries not included,In a soon to be demolished block of apartments...,18.534,534,6.7,en,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
#Create a text column that combines the title and the overview.
#A space separates the two parts; without it the last word of the title and the first word of the
#overview fuse into a single token. Missing values become '' so that no row ends up as NaN.
movieDF['Text'] = movieDF['Title'].fillna('') + ' ' + movieDF['Overview'].fillna('')

In [None]:
#Calculate TF-IDF of the title + plot summary of the movie (unigrams and bigrams, English stop words removed).
#min_df=1 keeps every term that occurs in at least one document, which is all min_df=0 could ever keep;
#an integer 0 is rejected by the parameter validation of recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=1,stop_words='english')

tfidf_matrix = tf.fit_transform(movieDF['Text'])

In [None]:
#Pairwise similarity of every movie against every other movie (square matrix, one row/column per movie).
#NOTE(review): a plain dot product only equals the cosine similarity for unit-length rows; no norm is
#passed to TfidfVectorizer in this notebook, so this presumably relies on its default normalisation - confirm.
cosineSim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [None]:
#Sample watch history, flattened into one (User, Titles_Watched) row per watched movie
sampleUserDF = pd.DataFrame(
    [
        (userName, watchedTitle)
        for userName, watchedTitles in {
            'aloysius_w': ['The Batman', 'Toy Story', 'Parasite', 'Spider-Man: No Way Home'],
            'DevTeam.999': ['Cars', 'Baby Driver', 'Matrix'],
            'Speedwagon': ['The Pink Panther', 'Toy Story 2', 'Frozen', 'Frozen 2'],
        }.items()
        for watchedTitle in watchedTitles
    ],
    columns=['User', 'Titles_Watched'],
)

sampleUserDF

Unnamed: 0,User,Titles_Watched
0,aloysius_w,The Batman
1,aloysius_w,Toy Story
2,aloysius_w,Parasite
3,aloysius_w,Spider-Man: No Way Home
4,DevTeam.999,Cars
5,DevTeam.999,Baby Driver
6,DevTeam.999,Matrix
7,Speedwagon,The Pink Panther
8,Speedwagon,Toy Story 2
9,Speedwagon,Frozen


In [None]:
#Just in case you want to use it with a proper dataframe
userDF = sampleUserDF.copy()

In [None]:
#The title column on its own
titles = movieDF.loc[:, 'Title']
#Lookup table from movie title to its row index in movieDF
indices = pd.Series(data=movieDF.index, index=titles)

In [None]:
#Function to grab the movies that have been watched
def GetUserData(user):
  """Return the titles the given user has watched as a list, in userDF row order."""
  #Boolean mask of the rows that belong to this user
  belongsToUser = userDF['User'] == user
  return userDF.loc[belongsToUser, 'Titles_Watched'].tolist()

In [None]:
#Script to plot the recommendation using the similarity matrix
def GetRecommendationsCosSim(user,cosineSim=cosineSim,topN=10):
  """Recommend movies that are textually similar to what a user has watched.

  user: user name, looked up through GetUserData.
  cosineSim: square similarity matrix whose rows and columns follow the row order of movieDF.
  topN: number of recommendations produced per watched title (default 10).

  Returns a DataFrame with the columns Recommendation, Based On and Summary,
  holding up to topN rows per watched title, in descending similarity.
  Watched titles that do not occur in movieDF are reported and skipped.
  """
  resultColumns = ['Recommendation','Based On','Summary']
  records = []

  #Make a recommendation based on each movie that has been watched
  for watchedTitle in GetUserData(user):
    positions = np.flatnonzero((movieDF['Title'] == watchedTitle).to_numpy())
    if len(positions) == 0:
      #A title missing from the dataset should not abort the other recommendations
      print('Skipping "' + str(watchedTitle) + '": title not found in the movie dataset')
      continue
    #Titles are not unique in the dataset; the first match is used as the reference movie
    idx = positions[0]

    simScores = np.asarray(cosineSim[idx]).ravel()
    #Stable sort of the negated scores = descending similarity, ties kept in row order
    ranked = np.argsort(-simScores, kind='stable')
    #Remove the watched movie explicitly instead of assuming that it sorts first
    topPositions = ranked[ranked != idx][:topN]

    #Read title and overview by position, so duplicated titles cannot mix up the summaries
    for moviePosition in topPositions:
      records.append([
          movieDF['Title'].iloc[moviePosition],
          watchedTitle,
          movieDF['Overview'].iloc[moviePosition],
      ])

  #Build the frame once from the collected rows instead of inserting row by row
  return pd.DataFrame(records, columns=resultColumns)

In [None]:
GetRecommendationsCosSim('aloysius_w')

Unnamed: 0,Recommendation,Based On,Summary
0,Batman: Gotham by Gaslight,The Batman,"In an alternative Victorian Age Gotham City, B..."
1,"Batman: The Long Halloween, Part Two",The Batman,"As Gotham City's young vigilante, the Batman, ..."
2,"Batman: The Long Halloween, Part One",The Batman,Following a brutal series of murders taking pl...
3,Batman Beyond: The Movie,The Batman,"Fuelled by remorse and vengeance, a high schoo..."
4,LEGO DC Comics Super Heroes: Justice League - ...,The Batman,The caped crusader reluctantly agrees to let B...
5,Batman: Return of the Caped Crusaders,The Batman,Adam West and Burt Ward returns to their iconi...
6,Batman Begins,The Batman,"Driven by tragedy, billionaire Bruce Wayne ded..."
7,The Zodiac,The Batman,An elusive serial killer known as the Zodiac t...
8,Lego DC Batman: Family Matters,The Batman,"Suspicion is on high after Batman, Batgirl, Ro..."
9,Batman: Mystery of the Batwoman,The Batman,As if the Penguin wasn't enough to contend wit...


# Evaluation

In [None]:
cosSimDF = GetRecommendationsCosSim('aloysius_w')
jacSimDF = GetRecommendationsJaccard('The Batman',0.9)

Queried key: The Batman
Similarity threshold: 0.9
Old key: #Alive
New key: Beckett 

New key (The Batman) is identical to original key (The Batman)

Max Jaccard score is 1.0
Most recommended title: Beckett
Index at 1044


In [None]:
filteredCosSimDF = cosSimDF.loc[cosSimDF['Based On']=='The Batman']

In [None]:
filteredCosSimDF.head(10)

Unnamed: 0,Recommendation,Based On,Summary
0,Batman: Gotham by Gaslight,The Batman,"In an alternative Victorian Age Gotham City, B..."
1,"Batman: The Long Halloween, Part Two",The Batman,"As Gotham City's young vigilante, the Batman, ..."
2,"Batman: The Long Halloween, Part One",The Batman,Following a brutal series of murders taking pl...
3,Batman Beyond: The Movie,The Batman,"Fuelled by remorse and vengeance, a high schoo..."
4,LEGO DC Comics Super Heroes: Justice League - ...,The Batman,The caped crusader reluctantly agrees to let B...
5,Batman: Return of the Caped Crusaders,The Batman,Adam West and Burt Ward returns to their iconi...
6,Batman Begins,The Batman,"Driven by tragedy, billionaire Bruce Wayne ded..."
7,The Zodiac,The Batman,An elusive serial killer known as the Zodiac t...
8,Lego DC Batman: Family Matters,The Batman,"Suspicion is on high after Batman, Batgirl, Ro..."
9,Batman: Mystery of the Batwoman,The Batman,As if the Penguin wasn't enough to contend wit...


In [None]:
jacSimDF.head(10)

Unnamed: 0,Recommendation,Summary,Similarity,Popularity,Average Rating
0,Beckett,An American tourist in Greece finds himself on...,1.0,98.796,6.4
1,Se7en,Two homicide detectives are on a desperate hun...,1.0,46.685,8.3
2,The Raven,A fictionalized account of the last days of Ed...,1.0,34.724,6.3
3,Big Driver,"Based on a novella from Stephen King, A famous...",1.0,32.971,5.7
4,The Girl with the Dragon Tattoo,This English-language adaptation of the Swedis...,1.0,27.279,7.4
5,Basic Instinct 2,Novelist Catherine Tramell is once again in tr...,0.0,26.682,5.0
6,Solace,"A psychic doctor, John Clancy, works with an F...",1.0,24.405,6.3
7,Mindhunters,Trainees in the FBI's psychological profiling ...,1.0,24.121,6.5
8,The Dry,Aaron Falk returns to his drought-stricken hom...,1.0,21.678,6.8
9,Phone Booth,A slick New York publicist who picks up a ring...,1.0,20.635,6.8
