In [1]:
import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv('u.data', sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'unix_timestamp'],encoding='latin-1')
movies = pd.read_csv('u.item', sep='|', header=None, names=['movie_id', 'title', 'release_date', 'url'], encoding='latin-1',usecols=range(4))
users = pd.read_csv('u.user', sep='|', header=None, names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],parse_dates=True,encoding='latin-1')

data = pd.merge(ratings, movies)
data = pd.merge(data, users)
data.to_csv('data.csv', index=False)

In [3]:
df = pd.read_csv('data.csv')
df = df[['user_id', 'movie_id', 'rating','title']]

movie_id_to_title = df.drop_duplicates(subset=['movie_id'])[['movie_id', 'title']].set_index('movie_id')['title'].to_dict()
title_to_movie_id = {v: k for k, v in movie_id_to_title.items()}

print(df.head(2))

   user_id  movie_id  rating                     title
0      196       242       3              Kolya (1996)
1      186       302       3  L.A. Confidential (1997)


In [4]:
df.dropna(inplace=True)

In [5]:
user_movie_matrix = df.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)

print(user_movie_matrix)

movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5          4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   
940        0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   
941        5.0   0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0  ...   
942        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
943        0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   3.0   0.0  ...   

In [6]:
movie_user_matrix = user_movie_matrix.T.values

dot = np.dot(movie_user_matrix, movie_user_matrix.T)
norms = np.linalg.norm(movie_user_matrix, axis=1)
cos_sim = dot / np.outer(norms, norms)

movie_ids = user_movie_matrix.columns.tolist()
movie_index_to_id = {i: movie_id for i, movie_id in enumerate(movie_ids)}
movie_id_to_index = {mid: i for i, mid in enumerate(movie_ids)}

In [9]:
def get_similar_movies(input_title, top_n=5):
    print("title_to_movie_id",title_to_movie_id)
    if input_title not in title_to_movie_id:
        print(f"Movie '{input_title}' not found in the dataset.")
        return []
    
    input_id = title_to_movie_id[input_title]
    input_idx = movie_id_to_index[input_id]
    
    scores = list(enumerate(cos_sim[input_idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    
    top_ids = [movie_index_to_id[i] for i, _ in scores[1:top_n+1]]
    
    return [movie_id_to_title[mid] for mid in top_ids]

In [12]:
if __name__ == '__main__':
    movie_input  = 'Men in Black (1997)' #input("Enter a movie title: ")
    
    recommendation = get_similar_movies(movie_input)
    
    if recommendation:
        print("\nYou may also like:")
        for movie in recommendation:
            print(f"- {movie}")
    else:
        print("No recommendations found.")

title_to_movie_id {'Kolya (1996)': 242, 'L.A. Confidential (1997)': 302, 'Heavyweights (1994)': 377, 'Legends of the Fall (1994)': 51, 'Jackie Brown (1997)': 346, 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)': 474, 'Hunt for Red October, The (1990)': 265, 'Jungle Book, The (1994)': 465, 'Grease (1978)': 451, 'Remains of the Day, The (1993)': 86, 'Men in Black (1997)': 257, "Romy and Michele's High School Reunion (1997)": 1014, 'Star Trek: First Contact (1996)': 222, 'To Wong Foo, Thanks for Everything! Julie Newmar (1995)': 40, 'Batman Forever (1995)': 29, 'Only You (1994)': 785, 'Age of Innocence, The (1993)': 387, 'Sabrina (1995)': 274, 'Just Cause (1995)': 1042, 'Endless Summer 2, The (1994)': 1184, 'Man Without a Face, The (1993)': 392, 'Sabrina (1954)': 486, 'Die Hard (1988)': 144, 'Twister (1996)': 118, 'Toy Story (1995)': 1, 'Broken Arrow (1996)': 546, 'Aladdin (1992)': 95, 'Casper (1995)': 768, 'Restoration (1995)': 277, 'Jaws (1975)': 234, 'Chas