In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the movies CSV into a DataFrame and display its (rows, columns) shape.
FILE_PATH = "/kaggle/input/movies-dataset/movie.csv"
movies = pd.read_csv(filepath_or_buffer=FILE_PATH)
movies.shape

In [None]:
# Preview the dataset: display two randomly chosen rows.
movies.sample(n=2)

In [None]:
# Step 1: Data Preprocessing
# Build one free-text 'description' per movie from its overview, genres and
# average rating. The fields are joined with a single space so that the last
# word of one field is not glued to the first word of the next: with an empty
# separator the final overview word fuses with the first genre, and the final
# genre fuses with the rating, producing junk tokens for the vectorizer.
# Note: a missing 'overview' or 'genres' value makes the whole description NaN.
movies['description'] = (
    movies['overview'] + ' ' + movies['genres'] + ' ' + movies['vote_average'].astype(str)
)

In [None]:
# These three columns were already combined into 'description'; remove them in place.
movies.drop(['genres', 'overview', 'vote_average'], axis='columns', inplace=True)


In [None]:
# Inspect the combined description of the row labelled 0.
movies.loc[0, 'description']

Lowercasing The Description Column

In [None]:
# Look at two random rows before changing the text case.
movies.sample(n=2)

In [None]:
# Lowercase every description so that words differing only in case become
# the same token, then display the first two rows to check the result.
movies['description'] = movies['description'].str.lower()
movies.head(n=2)

In [None]:
# Inspect the lowercased description of the row labelled 0.
movies.loc[0, 'description']

# Remove punctuation


In [None]:
import string
# Display the punctuation characters provided by the standard library.
string.punctuation

In [None]:
# Module-level alias; remove_punctuations() reads this name.
punctuation = string.punctuation

In [None]:
def remove_punctuations(text):
    """Return ``text`` with every character listed in ``punctuation`` deleted.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without any punctuation characters; all other
        characters, including whitespace, are left untouched.
    """
    # A translation table whose third argument lists characters to delete
    # removes all of them in one pass over the string.
    deletion_table = str.maketrans('', '', punctuation)
    return text.translate(deletion_table)

In [None]:
# Count the missing values in each column.
movies.isna().sum()

In [None]:
# Discard every row that has a missing value, then renumber the index 0..n-1.
# The reset matters: the similarity matrix built later is addressed by row
# position, while lookups such as `movies[...].index[0]` and
# `movies['description'][0]` use index labels. Without the reset, labels and
# positions drift apart as soon as a single row is dropped.
movies.dropna(inplace=True)
movies.reset_index(drop=True, inplace=True)
movies.shape

In [None]:
# Strip punctuation from every description, one element at a time.
movies['description'] = movies['description'].map(remove_punctuations)

In [None]:
# Inspect the first remaining description. Positional access is used because
# rows were removed with dropna(), so the label 0 is not guaranteed to exist.
movies['description'].iloc[0]

# Remove stopwords

In [None]:
from nltk.corpus import stopwords
# English stop words from NLTK; remove_stopwords() reads this module-level name.
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available in this environment - confirm.
stop_words = stopwords.words('english')

In [None]:
def remove_stopwords(text, stopwords_to_remove=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every token whose lowercase form is a
    stop word is discarded, and the surviving tokens are re-joined with single
    spaces (so any run of whitespace collapses to one space).

    Parameters
    ----------
    text : str
        Text to filter.
    stopwords_to_remove : iterable of str, optional
        Lowercase words to discard. When omitted, the module-level
        ``stop_words`` collection is used.

    Returns
    -------
    str
        The filtered text; an empty string if no token survives.
    """
    if stopwords_to_remove is None:
        stopwords_to_remove = stop_words
    # Hash-based membership test instead of scanning the whole list once per token.
    blocked = frozenset(stopwords_to_remove)
    kept_tokens = [word for word in text.split() if word.lower() not in blocked]
    return ' '.join(kept_tokens)

In [None]:
# Filter the stop words out of every description, then print the first rows as a check.
movies['description'] = movies['description'].map(remove_stopwords)
print(movies['description'].head(5))

In [None]:
# Inspect the first remaining description after stop-word removal. Positional
# access is used because rows were removed with dropna(), so the label 0 is
# not guaranteed to exist.
movies['description'].iloc[0]

Model Building

In [None]:
import sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each description into a TF-IDF weighted bag of words, with the
# vocabulary capped at 5000 terms and scikit-learn's English stop words removed.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# Keep the result as the SciPy sparse matrix that fit_transform returns.
# Calling .toarray() would allocate a dense rows x 5000 float matrix that is
# almost entirely zeros, and TruncatedSVD accepts sparse input directly.
vector = tfidf.fit_transform(movies['description'])

print("Vector Shape:", vector.shape)  # Expect (9745, 5000)


In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF vectors onto 300 latent components.
# random_state is pinned because TruncatedSVD's default 'randomized' solver is
# stochastic: without a seed, every run yields a slightly different embedding,
# and therefore different similarity scores and saved artefacts.
svd = TruncatedSVD(n_components=300, random_state=42)
reduced_vector = svd.fit_transform(vector)

print("Reduced Vector Shape:", reduced_vector.shape)  # Expect (9745, 300)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all movies in the reduced feature space.
similarity = cosine_similarity(reduced_vector)
print("Similarity Matrix Shape:", similarity.shape)  # Expect (9745, 9745)
# Show the first ten scores of the first row as a sanity check.
print("Example Similarity for First Movie:", similarity[0, :10])

In [None]:
# Display the shape of the pairwise similarity matrix.
similarity.shape

Normalize the similarity scores so that all values lie in the range between 0 and 1

In [None]:
# Min-max rescale: the smallest score maps to 0 and the largest to 1.
similarity_floor = similarity.min()
similarity_span = similarity.max() - similarity_floor
similarity = (similarity - similarity_floor) / similarity_span
print("Normalized Similarity Matrix Example:", similarity[0, :10])


Model Evaluation

In [None]:
# Row position of "The Godfather". The similarity matrix is addressed by row
# position, and index labels stop matching positions once rows have been
# dropped from `movies`, so report the position rather than the index label.
# Raises IndexError if the title is not present.
np.flatnonzero((movies['title'] == "The Godfather").to_numpy())[0]

In [None]:
# Rank every movie by its similarity to "The Godfather", best match first.
# The row is looked up by title instead of being hard-coded, and by position
# rather than index label, because `similarity` is a positional matrix.
godfather_pos = np.flatnonzero((movies['title'] == "The Godfather").to_numpy())[0]
# Each entry is a (row position, similarity score) pair.
distance = sorted(enumerate(similarity[godfather_pos]), key=lambda pair: pair[1], reverse=True)
distance[:10]

In [None]:
# Print the titles of the ten highest-ranked rows; each entry of `distance`
# is a (row position, score) pair.
for row_pos, _score in distance[:10]:
    print(movies.iloc[row_pos].title)

Recommendation

In [None]:
def recommendation(movie_name, no_of_recommendation):
    """Print the titles of the movies most similar to ``movie_name``.

    Reads the module-level ``movies`` DataFrame (column ``'title'``) and the
    module-level ``similarity`` matrix, whose row/column ``i`` corresponds to
    the ``i``-th row of ``movies``.

    Parameters
    ----------
    movie_name : str
        Exact title to look up. If several rows share the title, the first
        one is used.
    no_of_recommendation : int
        Number of titles to print. Values of zero or less print nothing.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Use the row *position*, not the index label: `similarity` is a plain
    # positional matrix, and labels stop matching positions once rows have
    # been dropped from `movies` without resetting the index.
    matches = np.flatnonzero((movies['title'] == movie_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie_name!r} in the dataset")
    idx = int(matches[0])

    ranked = sorted(enumerate(similarity[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly rather than assuming it sorts first;
    # a movie with an identical description would tie with it at the top.
    neighbours = [pos for pos, _score in ranked if pos != idx]
    # Slice to exactly the requested count (the old `[1:n]` slice printed n - 1).
    for pos in neighbours[:max(no_of_recommendation, 0)]:
        print(movies['title'].iloc[pos])

In [None]:
# Example: print recommendations for the movie titled 'Life'.
recommendation(movie_name='Life', no_of_recommendation=10)

Model Saving

In [None]:
import pickle
# Persist the processed movie table and the similarity matrix.
# The context managers guarantee that each file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle open.
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)