## Colabrative Filter : Alternating Least Squares

[Dataset: https://grouplens.org/datasets/movielens/]

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re

In [2]:
movies = pd.read_csv("movies.csv")
tags = pd.read_csv("tags.csv")
links = pd.read_csv("links.csv")

In [3]:
print(movies.columns, "\n", tags.columns, "\n", links.columns)

Index(['movieId', 'title', 'genres'], dtype='object') 
 Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object') 
 Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')


In [4]:
movie_links = movies.merge(links, on='movieId')

movies['imdb_url'] = movie_links['imdbId'].apply(lambda x: f"https://www.imdb.com/title/tt{int(x):07d}/")
movies['tmdb_url'] = movie_links['tmdbId'].apply(lambda x: f"https://www.themoviedb.org/movie/{int(x)}" if not pd.isna(x) else None)

In [5]:
movies

Unnamed: 0,movieId,title,genres,imdb_url,tmdb_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://www.imdb.com/title/tt0114709/,https://www.themoviedb.org/movie/862
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://www.imdb.com/title/tt0113497/,https://www.themoviedb.org/movie/8844
2,3,Grumpier Old Men (1995),Comedy|Romance,https://www.imdb.com/title/tt0113228/,https://www.themoviedb.org/movie/15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://www.imdb.com/title/tt0114885/,https://www.themoviedb.org/movie/31357
4,5,Father of the Bride Part II (1995),Comedy,https://www.imdb.com/title/tt0113041/,https://www.themoviedb.org/movie/11862
...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,https://www.imdb.com/title/tt26812510/,https://www.themoviedb.org/movie/1032473
87581,292737,Shelter in Solitude (2023),Comedy|Drama,https://www.imdb.com/title/tt14907358/,https://www.themoviedb.org/movie/986674
87582,292753,Orca (2023),Drama,https://www.imdb.com/title/tt12388280/,https://www.themoviedb.org/movie/948139
87583,292755,The Angry Breed (1968),Drama,https://www.imdb.com/title/tt0064027/,https://www.themoviedb.org/movie/182776


In [6]:
movies.isnull().sum()

movieId       0
title         0
genres        0
imdb_url      0
tmdb_url    124
dtype: int64

In [7]:
tags.isnull().sum()

userId        0
movieId       0
tag          17
timestamp     0
dtype: int64

In [8]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


In [9]:
movies.head()

Unnamed: 0,movieId,title,genres,imdb_url,tmdb_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://www.imdb.com/title/tt0114709/,https://www.themoviedb.org/movie/862
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://www.imdb.com/title/tt0113497/,https://www.themoviedb.org/movie/8844
2,3,Grumpier Old Men (1995),Comedy|Romance,https://www.imdb.com/title/tt0113228/,https://www.themoviedb.org/movie/15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://www.imdb.com/title/tt0114885/,https://www.themoviedb.org/movie/31357
4,5,Father of the Bride Part II (1995),Comedy,https://www.imdb.com/title/tt0113041/,https://www.themoviedb.org/movie/11862


In [10]:
# tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: " ".join(x)).reset_index()
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: " ".join(str(tag) for tag in x if pd.notnull(tag))).reset_index()

In [11]:
movies = movies.merge(tags_grouped, on='movieId', how='left')

In [12]:
movies['feature'] = (movies['genres'].fillna('') + ' ' +movies['tag'].fillna(''))

In [13]:
movies["feature"] = movies["feature"].str.replace("|", " ")

In [14]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special chars
    return text.lower().strip()

In [15]:
movies['feature'] = movies['feature'].apply(clean_text)

In [16]:
movies['feature'].shape

(87585,)

In [17]:
movies.head()

Unnamed: 0,movieId,title,genres,imdb_url,tmdb_url,tag,feature
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://www.imdb.com/title/tt0114709/,https://www.themoviedb.org/movie/862,children Disney animation children Disney Disn...,adventure animation children comedy fantasy ch...
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://www.imdb.com/title/tt0113497/,https://www.themoviedb.org/movie/8844,Robin Williams fantasy Robin Williams time tra...,adventure children fantasy robin williams fant...
2,3,Grumpier Old Men (1995),Comedy|Romance,https://www.imdb.com/title/tt0113228/,https://www.themoviedb.org/movie/15602,comedinha de velhinhos engraÃƒÂ§ada comedinha ...,comedy romance comedinha de velhinhos engraada...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://www.imdb.com/title/tt0114885/,https://www.themoviedb.org/movie/31357,characters slurs based on novel or book chick ...,comedy drama romance characters slurs based on...
4,5,Father of the Bride Part II (1995),Comedy,https://www.imdb.com/title/tt0113041/,https://www.themoviedb.org/movie/11862,Fantasy pregnancy remake family Steve Martin s...,comedy fantasy pregnancy remake family steve m...


In [18]:
vector = TfidfVectorizer(stop_words="english")
feature_vector = vector.fit_transform(movies["feature"])

In [19]:
print(feature_vector)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1492038 stored elements and shape (87585, 48958)>
  Coords	Values
  (0, 1345)	0.07942440479539474
  (0, 2327)	0.31380816996360966
  (0, 8445)	0.08180408016462808
  (0, 9464)	0.04258731256471066
  (0, 15502)	0.02644779528931117
  (0, 12653)	0.22125929895685947
  (0, 33225)	0.7015465878279963
  (0, 17163)	0.1605788892893002
  (0, 45230)	0.007781033095408701
  (0, 46571)	0.004073570262882824
  (0, 8898)	0.0827916133861187
  (0, 16983)	0.1190244510307698
  (0, 9656)	0.1504959751685042
  (0, 44607)	0.18629497573174067
  (0, 2326)	0.07939019071541825
  (0, 8990)	0.12624328550668593
  (0, 15445)	0.10015442115042575
  (0, 44312)	0.20190104135840717
  (0, 19352)	0.2623147867892257
  (0, 665)	0.021664708431963397
  (0, 20939)	0.09840395565772318
  (0, 44117)	0.04061626009997883
  (0, 44823)	0.036002687174594566
  (0, 1058)	0.007024880868148313
  (0, 35528)	0.004101300026856605
  :	:
  (87570, 12883)	1.0
  (87571, 12883)	1.0
  (87572, 

In [20]:
# similarity = cosine_similarity(feature_vector)

In [21]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50)
knn.fit(feature_vector)

In [22]:
distance, indices = knn.kneighbors(feature_vector)

In [23]:
def recommend_movies(movie_index, top=50):
    print("Movies suggested for you \n")
    similar_indices = indices[movie_index][1:]
    for i, idx in enumerate(similar_indices, start=1):
        title = movies.loc[idx, 'title']
        url = movies.loc[idx, 'tmdb_url']
        print(f"{i}. {title} {url}")

In [24]:
# movie_name = input('Enter your favourite movie name: ').strip().lower()
# all_titles = movies['title'].tolist()

# # Case-insensitive substring match
# matches = [title for title in all_titles if movie_name in title.lower()]

# print(matches)

In [25]:
movie_name = input('Enter movie name: ').strip().lower()

# Find matches and extract year
matches = movies[movies['title'].str.lower().str.contains(movie_name)][['title', 'genres']]
matches['year'] = matches['title'].str.extract(r'\((\d{4})\)').astype(float)
matches = matches.dropna(subset=['year'])

# Automatically select the newest (most recent) match
if not matches.empty:
    close_match = matches.sort_values('year', ascending=False).iloc[0]['title']
    movieid = movies[movies.title == close_match].index[0]
    match_url = movies.loc[movieid, 'tmdb_url']
    print(f"Selected: {close_match} {match_url}")
else:
    print("No matching movies found.")
    close_match = None

Selected: Avengers: Infinity War - Part II (2019) https://www.themoviedb.org/movie/299534


In [26]:
movie_index = movies[movies.title == close_match].index[0]
print(movie_index)

25102


In [27]:
recommend_movies(movie_index)

Movies suggested for you 

1. Avengers: Age of Ultron (2015) https://www.themoviedb.org/movie/99861
2. Avengers: Infinity War - Part I (2018) https://www.themoviedb.org/movie/299536
3. Avengers, The (2012) https://www.themoviedb.org/movie/24428
4. Captain America: Civil War (2016) https://www.themoviedb.org/movie/271110
5. Thor: The Dark World (2013) https://www.themoviedb.org/movie/76338
6. Iron Man 2 (2010) https://www.themoviedb.org/movie/10138
7. Captain America: The Winter Soldier (2014) https://www.themoviedb.org/movie/100402
8. Iron Man (2008) https://www.themoviedb.org/movie/1726
9. Captain America: The First Avenger (2011) https://www.themoviedb.org/movie/1771
10. Iron Man 3 (2013) https://www.themoviedb.org/movie/68721
11. Thor (2011) https://www.themoviedb.org/movie/10195
12. Ant-Man (2015) https://www.themoviedb.org/movie/102899
13. Marvel Studios: Assembling a Universe (2014) https://www.themoviedb.org/movie/259910
14. Thor: Ragnarok (2017) https://www.themoviedb.org/movie

In [29]:
import pickle

# Save model
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn, f)

In [30]:
with open("tfidf_matrix.pkl", "wb") as f:
    pickle.dump(feature_vector, f)

In [31]:
movies.to_csv("movies_processed.csv", index=False)