In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import io
import re

In [4]:
file_path = r"movies.dat"
delimiter = '::'  # The delimiter used in the .dat file

# Positional column index -> descriptive column name (the file has no header row).
new_column_names = {
    0: 'movieId',
    1: 'title',
    2: 'genres'
}

# Let pandas open and decode the file itself instead of reading the whole
# file into a string first. The multi-character separator requires the
# python parsing engine. rename() returns a new frame, so re-running this
# cell always rebuilds `movies` from the file.
movies = pd.read_csv(
    file_path,
    sep=delimiter,
    header=None,
    engine='python',
    encoding='latin1',
).rename(columns=new_column_names)

movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [7]:
# Split "Title (YYYY)" into a separate Year column and a title without the tag.
year_pattern = r'\((\d{4})\)'

# str.extract yields NaN (instead of raising) for titles that carry no
# "(YYYY)" tag. That also covers a second run of this cell, when the tag
# has already been stripped from every title.
extracted_year = movies['title'].str.extract(year_pattern, expand=False)

# Fall back to the year extracted by a previous run, or to an empty string,
# so Year stays a string column that can be concatenated later.
previous_year = movies['Year'] if 'Year' in movies.columns else ''
movies['Year'] = extracted_year.fillna(previous_year)

# Remove every "(YYYY)" tag together with the whitespace in front of it.
movies['title'] = movies['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)

In [10]:
# Work on an independent copy so that columns added to `new_movies` do not
# silently appear on `movies` as well.
new_movies = movies.copy()

# A bare expression is only rendered when it is the last statement of a cell.
new_movies.head()

In [12]:
# Number of times the genre string is repeated inside Text_features.
# Presumably this makes genres count for more than title/year once the text
# is vectorized -- tune here rather than by copy-pasting lines.
GENRE_WEIGHT = 8

# " <genres>" repeated GENRE_WEIGHT times (each copy keeps its leading space).
weighted_genres = (' ' + new_movies['genres']).str.repeat(GENRE_WEIGHT)

# Layout: "<title> <Year> <genres> ... <genres> <Year>"
new_movies['Text_features'] = (
    new_movies['title']
    + ' ' + new_movies['Year']
    + weighted_genres
    + ' ' + new_movies['Year']
)

In [13]:
# Preview the first rows, including the newly built Text_features column.
new_movies.head()

Unnamed: 0,movieId,title,genres,Year,Text_features
0,1,Toy Story,Animation|Children's|Comedy,1995,Toy Story 1995 Animation|Children's|Comedy Ani...
1,2,Jumanji,Adventure|Children's|Fantasy,1995,Jumanji 1995 Adventure|Children's|Fantasy Adve...
2,3,Grumpier Old Men,Comedy|Romance,1995,Grumpier Old Men 1995 Comedy|Romance Comedy|Ro...
3,4,Waiting to Exhale,Comedy|Drama,1995,Waiting to Exhale 1995 Comedy|Drama Comedy|Dra...
4,5,Father of the Bride Part II,Comedy,1995,Father of the Bride Part II 1995 Comedy Comedy...


In [14]:
# Vectorizer with default settings; kept at module level because
# recommendation() reuses it to transform the seed movies' text.
tfidf_vectorizer = TfidfVectorizer()


# Fit on every movie's Text_features. recommendation() compares seed movies
# against this matrix and uses the resulting positions to index the frame.
tfidf_matrix = tfidf_vectorizer.fit_transform(new_movies['Text_features'])

In [21]:
def recommendation(IDS, df, top_n=20):
    """Recommend movies similar to a set of seed movies.

    The seed movies' ``Text_features`` are vectorized with the module-level
    ``tfidf_vectorizer`` and compared (cosine similarity) against every row
    of the module-level ``tfidf_matrix``. Similarities are averaged over the
    seeds and the highest-scoring movies are returned.

    Parameters
    ----------
    IDS : iterable
        ``movieId`` values of the seed movies. IDs that are not present in
        ``df`` are ignored.
    df : pandas.DataFrame
        Frame with ``movieId`` and ``Text_features`` columns. It must be the
        frame ``tfidf_matrix`` was fitted on (same rows, same order), because
        matrix row positions are used to index ``df``.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, most similar first. The seed movies
        themselves are never returned. Empty if no ID matches.

    Raises
    ------
    ValueError
        If ``df`` does not have as many rows as ``tfidf_matrix``.
    """
    # Positions from the similarity ranking are applied to df with iloc, so a
    # frame of a different length would give wrong or out-of-range rows.
    if len(df) != tfidf_matrix.shape[0]:
        raise ValueError(
            "df must be the frame tfidf_matrix was built from "
            f"({len(df)} rows vs {tfidf_matrix.shape[0]} matrix rows)"
        )

    seed_mask = df['movieId'].isin(IDS).to_numpy()

    # Nothing to compare against: return an empty frame with df's columns
    # instead of averaging over zero similarity rows.
    if not seed_mask.any():
        return df.iloc[0:0]

    seed_texts = df.loc[seed_mask, 'Text_features']
    seed_vectors = tfidf_vectorizer.transform(seed_texts)

    # One row per seed movie, one column per movie in the catalogue.
    similarities = cosine_similarity(seed_vectors, tfidf_matrix)
    average_similarity = np.mean(similarities, axis=0)

    # Highest average similarity first.
    ranked_positions = average_similarity.argsort()[::-1]

    # A seed is trivially similar to itself; do not recommend the inputs back.
    ranked_positions = ranked_positions[~seed_mask[ranked_positions]]

    recommended_movies = df.iloc[ranked_positions[:max(top_n, 0)]]

    return recommended_movies

In [22]:
# Rank the catalogue against the seed movies with IDs 1, 5, 20 and 10.
recommended=recommendation([1,5,20,10],new_movies)

In [23]:
# Show the first rows of the ranked recommendations.
recommended.head()

Unnamed: 0,movieId,title,genres,Year,Text_features
1563,1604,Money Talks,Action|Comedy,1997,Money Talks 1997 Action|Comedy Action|Comedy A...
2651,2720,Inspector Gadget,Action|Adventure|Children's|Comedy,1999,Inspector Gadget 1999 Action|Adventure|Childre...
1972,2041,Condorman,Action|Adventure|Children's|Comedy,1981,Condorman 1981 Action|Adventure|Children's|Com...
2654,2723,Mystery Men,Action|Adventure|Comedy,1999,Mystery Men 1999 Action|Adventure|Comedy Actio...
356,360,I Love Trouble,Action|Comedy,1994,I Love Trouble 1994 Action|Comedy Action|Comed...
