In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the TMDB movies CSV on GitHub; loading it requires network access.
url = "https://raw.githubusercontent.com/yinghaoz1/tmdb-movie-dataset-analysis/refs/heads/master/tmdb-movies.csv"
# Keep the raw import under its own name; later cells derive the working frame `df` from it.
importedDf = pd.read_csv(url)

In [3]:
# Show the first row as a Series: one example value per column.
importedDf.iloc[0]

id                                                                 135397
imdb_id                                                         tt0369610
popularity                                                      32.985763
budget                                                          150000000
revenue                                                        1513528810
original_title                                             Jurassic World
cast                    Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...
homepage                                    http://www.jurassicworld.com/
director                                                  Colin Trevorrow
tagline                                                 The park is open.
keywords                monster|dna|tyrannosaurus rex|velociraptor|island
overview                Twenty-two years after the events of Jurassic ...
runtime                                                               124
genres                          Action

In [4]:
# Column dtypes and non-null counts, to see which columns contain missing values.
importedDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10866 non-null  int64  
 1   imdb_id               10856 non-null  object 
 2   popularity            10866 non-null  float64
 3   budget                10866 non-null  int64  
 4   revenue               10866 non-null  int64  
 5   original_title        10866 non-null  object 
 6   cast                  10790 non-null  object 
 7   homepage              2936 non-null   object 
 8   director              10822 non-null  object 
 9   tagline               8042 non-null   object 
 10  keywords              9373 non-null   object 
 11  overview              10862 non-null  object 
 12  runtime               10866 non-null  int64  
 13  genres                10843 non-null  object 
 14  production_companies  9836 non-null   object 
 15  release_date       

In [5]:
# Working frame: the identifier, the title, the text columns that later feed `tags`,
# plus rating and release year. All rows are kept at this point.
df = importedDf.loc[
    :,
    [
        'imdb_id',
        'original_title',
        'cast',
        'director',
        'keywords',
        'overview',
        'genres',
        'vote_average',
        'release_year',
    ],
]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   imdb_id         10856 non-null  object 
 1   original_title  10866 non-null  object 
 2   cast            10790 non-null  object 
 3   director        10822 non-null  object 
 4   keywords        9373 non-null   object 
 5   overview        10862 non-null  object 
 6   genres          10843 non-null  object 
 7   vote_average    10866 non-null  float64
 8   release_year    10866 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 764.1+ KB


In [6]:
# Drop every row with a missing value in any selected column, then renumber the rows
# 0..n-1 so index labels equal row positions (the recommendation helpers further down
# use the index to address rows of the similarity matrix).
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   imdb_id         9303 non-null   object 
 1   original_title  9303 non-null   object 
 2   cast            9303 non-null   object 
 3   director        9303 non-null   object 
 4   keywords        9303 non-null   object 
 5   overview        9303 non-null   object 
 6   genres          9303 non-null   object 
 7   vote_average    9303 non-null   float64
 8   release_year    9303 non-null   int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 654.2+ KB


In [7]:
# Turn each pipe-separated cast list into space-separated tokens, one token per actor:
# spaces inside a name become underscores first, then the '|' separators become spaces.
# regex=False makes both replacements literal. '|' is a regex metacharacter and the
# default of `regex` differs between pandas 1.x (which also warns here) and pandas 2.x.
# NOTE: not idempotent - a second run would also turn the new separator spaces into
# underscores, so run this cell exactly once per fresh `df`.
df['cast'] = (
    df['cast']
    .str.replace(' ', '_', regex=False)
    .str.replace('|', ' ', regex=False)
)

In [8]:
# Spot-check one transformed row: names are underscore-joined, actors are space-separated.
df.iloc[1].cast

'Tom_Hardy Charlize_Theron Hugh_Keays-Byrne Nicholas_Hoult Josh_Helman'

In [9]:
# Collapse each director name into a single underscore-joined token.
# NOTE(review): unlike cast/keywords/genres, '|' is not replaced here - confirm whether
# this column can hold several pipe-separated directors.
df['director'] = df['director'].str.replace(' ', '_')
df['director']

0          Colin_Trevorrow
1            George_Miller
2         Robert_Schwentke
3              J.J._Abrams
4                James_Wan
               ...        
9298           Bruce_Brown
9299    John_Frankenheimer
9300        Eldar_Ryazanov
9301           Woody_Allen
9302      Harold_P._Warren
Name: director, Length: 9303, dtype: object

In [10]:
# Same tokenisation as for the cast: multi-word keywords are underscore-joined, then the
# '|' separators become spaces. regex=False keeps both replacements literal regardless
# of the pandas version ('|' is a regex metacharacter and the `regex` default changed
# between pandas 1.x and 2.x).
# NOTE: not idempotent - re-running would merge separate keywords with underscores.
df['keywords'] = (
    df['keywords']
    .str.replace(' ', '_', regex=False)
    .str.replace('|', ' ', regex=False)
)

df['keywords']

0       monster dna tyrannosaurus_rex velociraptor island
1        future chase post-apocalyptic dystopia australia
2       based_on_novel revolution dystopia sequel dyst...
3                   android spaceship jedi space_opera 3d
4                     car_race speed revenge suspense car
                              ...                        
9298                             surfer surfboard surfing
9299                            car_race racing formula_1
9300                             car trolley stealing_car
9301                                                spoof
9302                  fire gun drive sacrifice flashlight
Name: keywords, Length: 9303, dtype: object

In [11]:
# Same tokenisation for genres: 'Science Fiction' -> 'Science_Fiction', then the '|'
# separators become spaces. regex=False keeps both replacements literal regardless of
# the pandas version ('|' is a regex metacharacter and the `regex` default changed
# between pandas 1.x and 2.x).
# NOTE: not idempotent - re-running would merge separate genres with underscores.
df['genres'] = (
    df['genres']
    .str.replace(' ', '_', regex=False)
    .str.replace('|', ' ', regex=False)
)

df['genres']

0       Action Adventure Science_Fiction Thriller
1       Action Adventure Science_Fiction Thriller
2              Adventure Science_Fiction Thriller
3        Action Adventure Science_Fiction Fantasy
4                           Action Crime Thriller
                          ...                    
9298                                  Documentary
9299                       Action Adventure Drama
9300                               Mystery Comedy
9301                                Action Comedy
9302                                       Horror
Name: genres, Length: 9303, dtype: object

In [12]:
# Bucket each release year into its decade (e.g. 2015 -> 2010) via integer floor division.
# NOTE(review): the decade is not added to `tags` and the column is dropped further
# down - confirm whether it was meant to be a feature.
df['release_year'] = df['release_year'] // 10 * 10
df['release_year']

0       2010
1       2010
2       2010
3       2010
4       2010
        ... 
9298    1960
9299    1960
9300    1960
9301    1960
9302    1960
Name: release_year, Length: 9303, dtype: int64

In [13]:
# One lower-cased text blob per movie: cast, director, keywords, genres and overview,
# joined in that order with single spaces.
df['tags'] = (
    df['cast']
    .str.cat(df[['director', 'keywords', 'genres', 'overview']], sep=' ')
    .str.lower()
)
df.loc[0, 'tags']

"chris_pratt bryce_dallas_howard irrfan_khan vincent_d'onofrio nick_robinson colin_trevorrow monster dna tyrannosaurus_rex velociraptor island action adventure science_fiction thriller twenty-two years after the events of jurassic park, isla nublar now features a fully functioning dinosaur theme park, jurassic world, as originally envisioned by john hammond."

In [14]:
# Safety net after building `tags`: drop any row with a missing value and renumber the
# rows so labels stay equal to positions. Rebinding `df` instead of using inplace=True
# matches the earlier dropna/reset_index cell and keeps the step a single chained call.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9303 entries, 0 to 9302
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   imdb_id         9303 non-null   object 
 1   original_title  9303 non-null   object 
 2   cast            9303 non-null   object 
 3   director        9303 non-null   object 
 4   keywords        9303 non-null   object 
 5   overview        9303 non-null   object 
 6   genres          9303 non-null   object 
 7   vote_average    9303 non-null   float64
 8   release_year    9303 non-null   int64  
 9   tags            9303 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 726.9+ KB


In [15]:
# The per-field text now lives in `tags`, so drop the source columns and release_year.
# errors='ignore' keeps the cell re-runnable: without it, a second run raises KeyError
# because the columns are already gone.
df = df.drop(
    columns=['cast', 'director', 'keywords', 'overview', 'genres', 'release_year'],
    errors='ignore',
)

# Using Count Vectorizer

In [16]:
# (Disabled) Porter-stem every token in `tags`; the count vectorizer in the next cell then converts the text data into a matrix of token counts
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()

# def stem(text):
#   lis = []
#   for i in text.split():
#     lis.append(ps.stem(i))

#   return " ".join(lis)

# df['tags'] = df['tags'].apply(stem)

In [17]:
# CV = CountVectorizer(max_features = 5000, stop_words = 'english')
# tagsVector = CV.fit_transform(df.tags).toarray()

In [18]:
# print(CV.get_feature_names_out()[:100])

# Using TF-IDF

In [19]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# tagsVector = tfidf.fit_transform(df.tags).toarray()

In [20]:
# print(tagsVector[0],tagsVector.shape)
# print(df.info())

# Using Transformer

In [21]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding model, loaded by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every movie's `tags` text. convert_to_tensor=True yields a torch tensor with
# one row per movie (see the shape printed in the next cell).
tagsVector = model.encode(
    df['tags'].tolist(),
    show_progress_bar=True,
    convert_to_tensor=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 291/291 [03:09<00:00,  1.54it/s]


In [22]:
# Sanity check: expect (number of movies, embedding width).
tagsVector.shape

torch.Size([9303, 384])

In [23]:
# Pairwise cosine similarity between all movie embeddings (square, movies x movies).
# scikit-learn needs input it can convert to a NumPy array, and a torch tensor that
# lives on a GPU cannot be converted directly. Move tensors to host memory first so the
# cell works whichever device produced the embeddings; arrays pass through untouched.
similarityVector = cosine_similarity(
    tagsVector.detach().cpu().numpy() if hasattr(tagsVector, 'detach') else tagsVector
)


In [24]:
# Print the last and then the first similarity row, each followed by the matrix shape.
for _row in (-1, 0):
  print(similarityVector[_row], similarityVector.shape)


[0.29246935 0.39189014 0.44278872 ... 0.31385303 0.19538535 0.99999994] (9303, 9303)
[1.0000002  0.13748765 0.2334899  ... 0.1317269  0.20603037 0.29246935] (9303, 9303)


In [25]:
def _rankSimilar(column, value, limit):
  """Return the `limit` best (row position, score) pairs for one movie.

  The movie is the first row of the global `df` where ``df[column] == value``.
  Scores come from that movie's row of the global `similarityVector`.
  Raises IndexError when no row matches.
  """
  # Use row *positions*, not index labels: both similarityVector and df.iloc are
  # positional, so this stays correct even if df's index is not 0..n-1.
  hits = (df[column] == value).to_numpy().nonzero()[0]
  if len(hits) == 0:
    raise IndexError(f"no movie with {column} == {value!r}")
  scores = similarityVector[hits[0]]
  # sorted(..., reverse=True) is stable, so equal scores keep their original row order.
  ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
  return ranked[:limit]


def recommend(movieId, limit=11):
  """Rank movies by similarity to the movie whose `imdb_id` equals `movieId`.

  Args:
    movieId: value to look up in df['imdb_id'].
    limit: number of (position, score) pairs to return. The queried movie is part of
      the ranking (it normally comes first), so the default of 11 leaves room for
      ten other movies.

  Returns:
    List of (row position, similarity score) tuples, highest score first.

  Raises:
    IndexError: if no movie has that id.
  """
  return _rankSimilar('imdb_id', movieId, limit)


def recommendFromTitle(title, limit=11):
  """Rank movies by similarity to the first movie whose `original_title` equals `title`.

  Args:
    title: exact title to look up in df['original_title']; if several movies share
      it, the first one in `df` is used.
    limit: number of (position, score) pairs to return (the queried movie included).

  Returns:
    List of (row position, similarity score) tuples, highest score first.

  Raises:
    IndexError: if no movie has that title.
  """
  return _rankSimilar('original_title', title, limit)

In [26]:
def getMovieNameList(lis):
  """Map (row position, score) pairs to (original_title, score) pairs, keeping order.

  Positions are resolved against the global `df` with `iloc`.
  """
  return [(df.iloc[pos].original_title, score) for pos, score in lis]

def getMoviewIdList(lis):
  """Map (row position, score) pairs to the matching `imdb_id` values, keeping order.

  Positions are resolved against the global `df` with `iloc`; the scores are ignored.
  """
  return [df.iloc[pos].imdb_id for pos, _score in lis]

In [27]:
# Compute the ranking once and reuse it: first the raw (position, score) pairs, then
# the same pairs with positions resolved to titles.
ironMan2Matches = recommendFromTitle('Iron Man 2')
print(ironMan2Matches)
print(getMovieNameList(ironMan2Matches))

[(1516, 1.0), (2319, 0.7734235), (4425, 0.6816374), (14, 0.6581156), (4361, 0.61483735), (3564, 0.60183525), (2979, 0.5821423), (870, 0.5751971), (578, 0.5636698), (4535, 0.55285704), (3890, 0.55246115)]
[('Iron Man 2', 1.0), ('Iron Man', 0.7734235), ('Iron Man 3', 0.6816374), ('Avengers: Age of Ultron', 0.6581156), ('Steel', 0.61483735), ('The Avengers', 0.60183525), ('Marvel One-Shot: The Consultant', 0.5821423), ('Iron Man & Captain America: Heroes United', 0.5751971), ('Marvel One-Shot: All Hail the King', 0.5636698), ('Machete Kills', 0.55285704), ('Superman vs. The Elite', 0.55246115)]


In [28]:
# Same kind of query by IMDb id instead of title; the printed ranking matches the
# title-based 'Iron Man 2' lookup in the previous cell.
print(getMovieNameList(  recommend('tt1228705')  ) )

[('Iron Man 2', 1.0), ('Iron Man', 0.7734235), ('Iron Man 3', 0.6816374), ('Avengers: Age of Ultron', 0.6581156), ('Steel', 0.61483735), ('The Avengers', 0.60183525), ('Marvel One-Shot: The Consultant', 0.5821423), ('Iron Man & Captain America: Heroes United', 0.5751971), ('Marvel One-Shot: All Hail the King', 0.5636698), ('Machete Kills', 0.55285704), ('Superman vs. The Elite', 0.55246115)]


In [29]:
import pickle
# pickle.dump(df, open('movies.pkl', 'wb'))

In [30]:
# pickle.dump(similarityVector, open('similarityVector.pkl', 'wb'))
# pickle.dump(similarityVector, open('similarityVectorTFIDF.pkl', 'wb'))
# Persist the transformer-based similarity matrix. The context manager guarantees the
# file handle is flushed and closed even if pickling fails part-way.
with open('similarityVectorTransformer.pkl', 'wb') as similarityFile:
  pickle.dump(similarityVector, similarityFile)