In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('wordnet')    
import re
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# NOTE(review): machine-specific absolute path; point DATA_PATH at your own copy of the CSV.
DATA_PATH = r"C:\Users\Dell\Desktop\MovieRecommend\Data\clean_movies_data.csv"
df = pd.read_csv(DATA_PATH)

In [6]:
df.head(2)

Unnamed: 0,movie_id,title,original_language,overview,release_year,runtime,popularity,vote_average,vote_count,genres,top_cast,director,writers,production_companies,spoken_languages
0,100,"Lock, Stock and Two Smoking Barrels",en,A card shark and his unwillingly-enlisted frie...,1998,105,2.1519,8.105,6944,Comedy|Crime,Vinnie Jones|Jason Flemyng|Dexter Fletcher|Nic...,Guy Ritchie,Guy Ritchie,The Steve Tisch Company|SKA Films|Handmade Films,en
1,10000,The Strategy of the Snail,es,A group of tenants facing eviction due to a ci...,1993,116,3.0857,7.4,84,Comedy|Drama,Fausto Cabrera|Frank Ramírez|Delfina Guido|Vic...,Sergio Cabrera,Frank Ramírez|Humberto Dorado|Jorge Goldenberg...,Ministère de la Culture et de la Francophonie|...,es


In [7]:
len(df.columns)

15

In [8]:
df.shape

(30865, 15)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30865 entries, 0 to 30864
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              30865 non-null  int64  
 1   title                 30865 non-null  object 
 2   original_language     30865 non-null  object 
 3   overview              30865 non-null  object 
 4   release_year          30865 non-null  int64  
 5   runtime               30865 non-null  int64  
 6   popularity            30865 non-null  float64
 7   vote_average          30865 non-null  float64
 8   vote_count            30865 non-null  int64  
 9   genres                30865 non-null  object 
 10  top_cast              30865 non-null  object 
 11  director              30865 non-null  object 
 12  writers               30865 non-null  object 
 13  production_companies  30865 non-null  object 
 14  spoken_languages      30865 non-null  object 
dtypes: float64(2), int6

In [10]:
df.describe()

Unnamed: 0,movie_id,release_year,runtime,popularity,vote_average,vote_count
count,30865.0,30865.0,30865.0,30865.0,30865.0,30865.0
mean,260003.1,2004.244192,101.13802,4.2373,6.327077,755.729013
std,319221.8,17.630665,25.29709,5.959043,0.877854,2075.345955
min,2.0,1950.0,0.0,0.0054,1.518,50.0
25%,20458.0,1996.0,90.0,2.4338,5.8,81.0
50%,72349.0,2010.0,99.0,3.6912,6.4,161.0
75%,440424.0,2017.0,111.0,5.2021,6.939,474.0
max,1571470.0,2025.0,960.0,588.0777,9.853,38316.0


In [11]:
df[df["runtime"]<=60].shape

(1039, 15)

In [12]:
df[(df["runtime"]>60) & (df["runtime"]<=150)].shape

(28947, 15)

In [13]:
df[(df["runtime"]>150)].shape

(879, 15)

In [14]:
df["release_year"].unique()

array([1998, 1993, 2024, 2023, 1988, 1986, 1997, 2006, 2014, 2005, 2012,
       2003, 2022, 1985, 2025, 2001, 1987, 1991, 2009, 2011, 1951, 2007,
       2000, 1957, 2004, 1999, 2002, 2013, 1955, 1964, 1965, 1996, 2008,
       1984, 1992, 1982, 1994, 1973, 1970, 1975, 1989, 2010, 2018, 1995,
       1981, 1990, 1953, 2016, 1954, 1952, 1967, 1978, 1972, 1962, 1983,
       2015, 1976, 1974, 1979, 1968, 1956, 1960, 1963, 1969, 1971, 1959,
       1966, 1958, 1977, 1980, 1961, 1950, 2017, 2019, 2021, 2020],
      dtype=int64)

In [15]:
df.columns

Index(['movie_id', 'title', 'original_language', 'overview', 'release_year',
       'runtime', 'popularity', 'vote_average', 'vote_count', 'genres',
       'top_cast', 'director', 'writers', 'production_companies',
       'spoken_languages'],
      dtype='object')

In [16]:
# Rebind instead of mutating in place, and tolerate an already-dropped column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=["movie_id"], errors="ignore")

In [17]:
def runtime_category(runtime, short_max=60, medium_max=150):
    """Bucket a runtime into a coarse length token.

    Parameters
    ----------
    runtime : int or float
        Runtime value to classify (same unit as the thresholds).
    short_max : int or float, default 60
        Largest runtime still labelled "_Short_".
    medium_max : int or float, default 150
        Largest runtime still labelled "_Medium_".

    Returns
    -------
    str
        "_Short_" if ``runtime <= short_max``, "_Medium_" if
        ``short_max < runtime <= medium_max``, otherwise "_Long_".
        NaN fails both comparisons and therefore falls through to "_Long_".
    """
    if runtime <= short_max:
        return "_Short_"
    # Reaching this point already implies runtime > short_max (or NaN), so
    # only the upper bound needs checking.
    if runtime <= medium_max:
        return "_Medium_"
    return "_Long_"
    
# Label each movie with its runtime bucket token.
df['runtime_category'] = df['runtime'].map(runtime_category)

def join_all(df):
    """Concatenate one movie's descriptive fields into a single string.

    Parameters
    ----------
    df : pandas.Series or mapping
        A single row (despite the name), as passed by
        ``DataFrame.apply(..., axis=1)``; it must be indexable by every
        column name listed in ``fields`` below.

    Returns
    -------
    str
        The field values joined with single spaces, in a fixed order.

    Raises
    ------
    KeyError
        If the row lacks one of the expected columns.
    """
    fields = ('title', 'original_language', 'overview', 'release_year',
              'runtime_category', 'genres', 'top_cast', 'director', 'writers',
              'production_companies', 'spoken_languages')
    # str() every field (not only release_year) so a numeric value cannot make
    # " ".join raise TypeError; missing values are skipped rather than being
    # embedded as the literal text "nan".
    return " ".join(str(df[name]) for name in fields if not pd.isna(df[name]))

# Build one text blob per movie by running join_all across each row.
df["all_combined"] = df.apply(join_all, axis="columns")

In [18]:
df.head(2)

Unnamed: 0,title,original_language,overview,release_year,runtime,popularity,vote_average,vote_count,genres,top_cast,director,writers,production_companies,spoken_languages,runtime_category,all_combined
0,"Lock, Stock and Two Smoking Barrels",en,A card shark and his unwillingly-enlisted frie...,1998,105,2.1519,8.105,6944,Comedy|Crime,Vinnie Jones|Jason Flemyng|Dexter Fletcher|Nic...,Guy Ritchie,Guy Ritchie,The Steve Tisch Company|SKA Films|Handmade Films,en,_Medium_,"Lock, Stock and Two Smoking Barrels en A card ..."
1,The Strategy of the Snail,es,A group of tenants facing eviction due to a ci...,1993,116,3.0857,7.4,84,Comedy|Drama,Fausto Cabrera|Frank Ramírez|Delfina Guido|Vic...,Sergio Cabrera,Frank Ramírez|Humberto Dorado|Jorge Goldenberg...,Ministère de la Culture et de la Francophonie|...,es,_Medium_,The Strategy of the Snail es A group of tenant...


In [19]:
# Single WordNet lemmatizer instance; clean_text looks it up by this exact name.
lemitizer = WordNetLemmatizer()

In [20]:
def clean_text(text):
    """Normalise raw text into lowercase, punctuation-free, lemmatized words.

    Parameters
    ----------
    text : object
        Text to clean; non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    # Coerce before any string method is used: previously text.replace() ran
    # first, so a non-str input raised AttributeError before the str()
    # conversion was ever reached.
    # "|" is swapped for a space up front because it is itself punctuation and
    # would otherwise be deleted below, gluing neighbouring list items together.
    lowered = str(text).replace("|", " ").lower()

    # Punctuation is deleted, not replaced by a space, so hyphenated words
    # fuse together ("mind-bending" -> "mindbending").
    no_punctuation = re.sub(f"[{re.escape(string.punctuation)}]", "", lowered)

    # Collapse whitespace runs so words are separated by exactly one space.
    collapsed = re.sub(r'\s+', ' ', no_punctuation).strip()
    # Relies on the module-level `lemitizer` instance defined in an earlier cell.
    lemmas = [lemitizer.lemmatize(word) for word in collapsed.split(" ")]
    return " ".join(lemmas)



In [21]:
# Clean every combined movie description ahead of vectorizing.
dataset = df["all_combined"].map(clean_text)

In [22]:
# TF-IDF over the cleaned movie texts, using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
# x: document-term matrix with one row per entry of `dataset`, in the same order.
x = vectorizer.fit_transform(dataset)

In [23]:
# Sample free-text query used to try out the pipeline.
t = "Christopher Nolan directed a mind-bending thriller with stunning visuals and a gripping plot."
t = clean_text(t)
# Vectorize with the already-fitted vocabulary (transform, not fit_transform).
y = vectorizer.transform([t])

In [25]:
t

'christopher nolan directed a mindbending thriller with stunning visuals and a gripping plot'

In [26]:
def recommand_movies(text, top_n=10):
    """Return the titles of the movies most similar to a free-text query.

    Parameters
    ----------
    text : str
        Query text; it is passed through ``clean_text`` before vectorizing.
    top_n : int, default 10
        Maximum number of titles to return (expected to be non-negative).

    Returns
    -------
    pandas.Series
        Entries of ``df['title']`` ordered from most to least similar.
    """
    query_vector = vectorizer.transform([clean_text(text)])
    # One cosine score per row of the fitted matrix `x`.
    scores = cosine_similarity(x, query_vector).ravel()
    # A stable sort on the negated scores gives descending order with ties kept
    # in row order -- the same ranking as sorted(..., reverse=True), without
    # building a Python list of (index, score) tuples.
    top_positions = np.argsort(-scores, kind="stable")[:top_n]
    # Rows of `x` are in the same order as rows of `df`, hence positional iloc.
    return df['title'].iloc[top_positions]
recommand_movies(t)

1876              The Prestige
2445                 Following
5559              Interstellar
27087                  Memento
5437           The Dark Knight
23185                    Tenet
28873              Oppenheimer
20355    The Dark Knight Rises
10915            Batman Begins
15238                  Dunkirk
Name: title, dtype: object

In [27]:
import pickle
# Persist the fitted TF-IDF vectorizer (the file is named "model.pkl", but the
# object written is `vectorizer`). Only unpickle this file from a trusted source.
with open("model.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [29]:
# Persist the TF-IDF matrix `x` so it can be reloaded without refitting.
with open("movies_vectors.pkl", "wb") as f:
    pickle.dump(x, f)