In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Memory
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [6]:
movies = pd.read_csv(r'C:\Users\Lenovo\Documents\Movie Recommendation System\movies.csv',encoding = 'unicode_escape')


In [7]:
ratings = pd.read_csv("ratings.csv")

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
import re

def clean_title(title):
    # Using the re.sub() function to substitute non-alphanumeric characters with an empty string
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title


In [12]:
movies["clean_title"] = movies["title"].apply(clean_title)


In [13]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [14]:

nltk.download('punkt')
nltk.download('stopwords')

# Load the stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Preprocessing function
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

# Apply preprocessing to the movie titles
movies["clean_title"] = movies["title"].apply(preprocess_text)

# Create the TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.8, max_features=5000)

# Fit and transform the cleaned movie titles using the vectorizer
tfidf_matrix = vectorizer.fit_transform(movies["clean_title"])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
def search(title):
    cleaned_title = clean_title(title)
    query_vec = vectorizer.transform([cleaned_title])
    similarity = cosine_similarity(query_vec, tfidf)
    
    # Get the indices of top similar movies using argsort and slicing
    top_indices = np.argsort(similarity[0])[::-1][:5]
    
    # Retrieve the top similar movies
    results = movies.iloc[top_indices]
    
    
    return results


In [16]:
import ipywidgets as widgets
from IPython.display import display, clear_output

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(change):
    title = change['new']
    with movie_list:
        clear_output(wait=True)
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [17]:
def find_similar_movies(movie_id, tfidf_matrix, num_similar=5):
    # Get the movie based on the provided movie_id
    movie = movies[movies["movieId"] == movie_id]

    if not movie.empty:
        # Calculate cosine similarity between the selected movie and all movies
        movie_tfidf = tfidf_matrix[movie.index]
        similarity_scores = cosine_similarity(movie_tfidf, tfidf_matrix)

        # Get the indices of the most similar movies
        similar_movie_indices = np.argsort(similarity_scores[0])[::-1][1:num_similar + 1]

        # Get the details of similar movies
        similar_movies = movies.iloc[similar_movie_indices]

        return similar_movies
    else:
        return None

# Call the function with a specific movie_id
movie_id = 209157	
similar_movies = find_similar_movies(movie_id, tfidf_matrix)

if similar_movies is not None:
    print(similar_movies)
else:
    print("Movie not found.")


       movieId                  title                genres  \
61164   205132      I, Dolours (2018)           Documentary   
52020   184401          Selfie (2018)    (no genres listed)   
54038   188675          Dogman (2018)           Crime|Drama   
61228   205321            Awe! (2018)  Action|Drama|Romance   
54040   188679  Fahrenheit 451 (2018)          Drama|Sci-Fi   

                   clean_title  
61164        , dolour ( 2018 )  
52020           selfi ( 2018 )  
54038          dogman ( 2018 )  
61228           awe ! ( 2018 )  
54040  fahrenheit 451 ( 2018 )  


In [18]:
ratings = pd.read_csv(r'C:\Users\Lenovo\Documents\Movie Recommendation System/ratings.csv',encoding = 'unicode_escape')

In [19]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [20]:
rating_threshold = 4  # Change this value to our desired rating threshold
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > rating_threshold)]["userId"].unique()


In [21]:
rating_threshold = 4

similar_user_recs = ratings.loc[
    (ratings["userId"].isin(similar_users)) & (ratings["rating"] > rating_threshold),
    "movieId"
]



In [22]:

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]


In [23]:
similar_user_recs

Series([], Name: count, dtype: float64)

In [24]:


all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [25]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())


In [26]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [27]:
rec_percentages.head()

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1


In [28]:
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])


In [29]:
rec_percentages = rec_percentages.sort_values("score", ascending=False, kind="quicksort")


In [30]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")



Unnamed: 0,similar,all,score,movieId,title,genres,clean_title


In [31]:
print(rec_percentages.info())
print(movies.info())


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   similar  0 non-null      float64
 1   all      0 non-null      float64
 2   score    0 non-null      float64
dtypes: float64(3)
memory usage: 0.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      62423 non-null  int64 
 1   title        62423 non-null  object
 2   genres       62423 non-null  object
 3   clean_title  62423 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.9+ MB
None


In [32]:
print(rec_percentages.head(10))
print(movies.head(10))


Empty DataFrame
Columns: [similar, all, score]
Index: []
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6                         Heat (1995)   
6        7                      Sabrina (1995)   
7        8                 Tom and Huck (1995)   
8        9                 Sudden Death (1995)   
9       10                    GoldenEye (1995)   

                                        genres                    clean_title  
0  Adventure|Animation|Children|Comedy|Fantasy             toy stori ( 1995 )  
1                   Adventure|Children|Fantasy               jumanji ( 1995 )  
2                               Comedy|Romance      grumpier old men ( 1995 )  
3                         Comedy|Drama|Romance            wait exhal ( 1

In [33]:
def find_similar_movies(movie_id):
    def calculate_scores(df):
        similar_users = df[(df["movieId"] == movie_id) & (df["rating"] > 4)]["userId"].unique()
        similar_user_recs = df[(df["userId"].isin(similar_users)) & (df["rating"] > 4)]["movieId"]
        similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
        similar_user_recs = similar_user_recs[similar_user_recs > .10]
        
        all_users = df[(df["movieId"].isin(similar_user_recs.index)) & (df["rating"] > 4)]
        all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
        
        rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
        rec_percentages.columns = ["similar", "all"]
        rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
        rec_percentages = rec_percentages.sort_values("score", ascending=False)
        return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    
    return ratings.pipe(calculate_scores)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movies dataset
movies = pd.read_csv("movies.csv")

# Define a function to clean movie titles
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

# Clean the movie titles
movies["clean_title"] = movies["title"].apply(clean_title)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

# Define a function to search for movie recommendations
def search_engine(query, num_recommendations=5):
    query = clean_title(query)
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = similarity.argsort()[-num_recommendations-1:-1][::-1]  # Exclude the query movie
    recommended_movies = movies.iloc[indices]
    return recommended_movies[["title", "genres"]]

# Test the search engine
query = input("Enter a movie title: ")
recommendations = search_engine(query)

# Display the recommended movies
if not recommendations.empty:
    print("Recommended Movies:")
    print(recommendations)
else:
    print("No recommendations found for the entered movie.")
