In [102]:
import pandas as pd
import numpy as np

In [103]:
movie = pd.read_csv(r"C:\Users\yasha\Desktop\movie_recommender\Backend\ml\data\processed\movies_clean.csv")
movies = movie.copy()

In [104]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208429 entries, 0 to 208428
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    208429 non-null  int64  
 1   title                 208429 non-null  object 
 2   vote_average          208429 non-null  float64
 3   vote_count            208429 non-null  int64  
 4   runtime               208429 non-null  int64  
 5   backdrop_path         208429 non-null  object 
 6   imdb_id               208429 non-null  object 
 7   popularity            208429 non-null  float64
 8   poster_path           208429 non-null  object 
 9   genres                193748 non-null  object 
 10  production_countries  178954 non-null  object 
 11  combined_text         208429 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 19.1+ MB


In [105]:
# Convert all object/string columns to lowercase
for col in movies.select_dtypes(include='object').columns:
    movies[col] = movies[col].str.lower()


In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [107]:
vector = TfidfVectorizer(stop_words='english', max_features=20000)
model = vector.fit_transform(movie['combined_text'])

In [108]:
from sklearn.decomposition import TruncatedSVD

In [109]:
svd = TruncatedSVD(n_components=200, random_state=42)
vector_reduced = svd.fit_transform(model)

In [110]:
from sklearn.preprocessing import StandardScaler

In [111]:
num_features = ["vote_average", "vote_count", "popularity"]
scaler = StandardScaler()
num_scaled = scaler.fit_transform(movies[num_features])

combined_vectors = np.hstack([vector_reduced, num_scaled])


In [112]:
import pickle

In [113]:
pickle.dump(vector, open(r"C:\Users\yasha\Desktop\movie_recommender\Backend\ml\models\vector.pkl", "wb"))
pickle.dump(model, open(r"C:\Users\yasha\Desktop\movie_recommender\Backend\ml\models\model.pkl", "wb"))
pickle.dump(vector_reduced, open(r"C:\Users\yasha\Desktop\movie_recommender\Backend\ml\models\vector_reduced.pkl", "wb"))


In [114]:
from sklearn.metrics.pairwise import cosine_similarity

In [115]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(query, top_n=10):

    movies_filtered = movie[(movie["vote_average"] > 2) & (movie["vote_count"] > 40)]
    query_vec = vector.transform([query.lower()])
    query_vec_reduced = svd.transform(query_vec)
    similarity = cosine_similarity(query_vec_reduced, vector_reduced[movies_filtered.index]).flatten()
    top_indices = similarity.argsort()[::-1][2:top_n+1]
    recommended = movies_filtered.iloc[top_indices][['id','title','vote_average','vote_count','genres','backdrop_path','poster_path']].copy()
    recommended['similarity'] = similarity[top_indices]
    
    return recommended


query_result = recommend_movies("Comedy  Drama  Romance Raj is a rich  carefree  happy-go-lucky second generation NRI. Simran is the daughter of Chaudhary Baldev Singh  who in spite of being an NRI is very strict about adherence to Indian values. Simran has left for India to be married to her childhood fiancé. Raj leaves for India with a mission at his hands  to claim his lady love under the noses of her whole family. Thus begins a saga. family's daily life  family history  love-hate relationship  family drama  love affair Hindi hi Come Fall In love  All Over Again.. India Yash Raj Films")
query_result.head(10)


Unnamed: 0,id,title,vote_average,vote_count,genres,backdrop_path,poster_path,similarity
30349,25869,Salaam-e-Ishq,4.893,42,"Comedy, Drama, Romance",https://image.tmdb.org/t/p/original/viP4tHMj0e...,https://image.tmdb.org/t/p/original/85XY4iBEZr...,0.749182
21984,15864,Bachna Ae Haseeno,6.081,73,"Drama, Comedy, Romance",https://image.tmdb.org/t/p/original/uoqDaSsJp5...,https://image.tmdb.org/t/p/original/HqUJdDNYJx...,0.731947
19327,4253,Hum Tum,6.422,90,"Romance, Family, Comedy, Drama",https://image.tmdb.org/t/p/original/fGxaC8xYB8...,https://image.tmdb.org/t/p/original/yYwxJ6Qgr2...,0.723308
29809,20929,Taal,6.4,44,"Drama, Romance",https://image.tmdb.org/t/p/original/amHXl0gpzM...,https://image.tmdb.org/t/p/original/n2VsCd4wPI...,0.723132
29121,237305,Gori Tere Pyaar Mein,5.9,45,"Comedy, Romance",https://image.tmdb.org/t/p/original/ewI2f0gaav...,https://image.tmdb.org/t/p/original/icfBtvHEYI...,0.708286
27530,399624,Befikre,5.2,50,"Romance, Comedy, Drama",https://image.tmdb.org/t/p/original/aZvsSlyZMh...,https://image.tmdb.org/t/p/original/qLmBAaylw1...,0.706045
16523,577328,Kabir Singh,6.569,116,"Drama, Romance",https://image.tmdb.org/t/p/original/yFwn006ETs...,https://image.tmdb.org/t/p/original/q2jY2IKv1h...,0.701952
27449,27624,Maine Pyar Kiya,6.2,50,"Romance, Drama",https://image.tmdb.org/t/p/original/3fmJiICJY0...,https://image.tmdb.org/t/p/original/aqZ9CEbXVL...,0.691031
10304,14072,Rab Ne Bana Di Jodi,7.108,244,"Comedy, Drama, Romance",https://image.tmdb.org/t/p/original/AcLsfw3TVZ...,https://image.tmdb.org/t/p/original/m8x6I2qf3R...,0.684637
