# Movies Recommender System
#### By Mutholib Yusira

## Background:
Netnaija.com is a website for downloading movies. This project scrapes movie data (title, synopsis, genre and language) from the first 100 listing pages of the website, and uses a content-based recommender system built on the title, plot, genre and language to recommend the top 5 movies related to the movie the user watches/inputs.

### Importing Packages

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

### Web Scraping

In [None]:
# Column-wise accumulators: entry k of every list describes the same movie.
movie_titles = []
movie_synopses = []
movie_genres = []
movie_languages = []

# A single session reuses the connection for the listing and detail requests.
with requests.Session() as session:
    for page in range(1, 101):
        listing = session.get(
            "https://www.thenetnaija.net/videos/movies/page/{}".format(page),
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of parsing an error page.
        listing.raise_for_status()
        # Name the parser explicitly so the parse does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(listing.content, "html.parser")

        # Each "info" div on a listing page holds the title and the link to
        # the movie's detail page.
        for card in soup.find_all("div", attrs={'class': 'info'}):
            title = card.a.text
            movie_link = card.find("a").attrs['href']

            detail = session.get(movie_link, timeout=30)
            detail.raise_for_status()
            sub_soup = BeautifulSoup(detail.content, "html.parser")

            # The synopsis is the first two paragraphs of the post body.
            paragraphs = sub_soup.find("article", attrs={"class": "post-body"}).find_all('p')
            text = " ".join([paragraphs[0].text, paragraphs[1].text])

            # Genre and language are the 2nd and 6th paragraphs of the quote
            # block; the slices drop a fixed-width leading label
            # (presumably "Genre: " and "Language: " -- confirm against the site).
            facts = sub_soup.find("blockquote", attrs={"class": "quote-content"}).find_all('p')
            genre = facts[1].text[7:]
            lang = facts[5].text[10:]

            # Append all four fields together, only after the page parsed
            # successfully, so the lists can never get out of step.
            movie_titles.append(title)
            movie_synopses.append(text)
            movie_genres.append(genre)
            movie_languages.append(lang)

### Creating Database

In [None]:
# Assemble the four scraped lists into one table, one row per movie.
df = pd.DataFrame({"Title":movie_titles, "Synopsis":movie_synopses, "Genre":movie_genres, "Language":movie_languages})

In [None]:
df.head()

Unnamed: 0,Title,Synopsis,Genre,Language
0,Detective Knight: Independence (2023),Detective James Knight 's last-minute assignme...,"Action, Crime, Thriller",English
1,Alkhallat+ (2023) [Arabic],These four tales explore trickery and deceptio...,"Comedy, Mystery","Arabic, English (Dual Audio)"
2,Sorry About the Demon (2023),A young man struggling with a broken heart lea...,"Comedy, Horror",English
3,The Price We Pay (2023),Reap what you sow. After a pawn shop robbery g...,"Action, Crime, Horror,Thriller",English
4,Seriously Red (2022),Think big. Dream bigger. Raylene 'Red' Delaney...,"Comedy, Drama, Music",English


In [None]:
df.to_csv('Net-naija movies dataset.csv', index=False)

### Reading the data

In [None]:
df = pd.read_csv('Net-naija movies dataset.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     1800 non-null   object
 1   Synopsis  1800 non-null   object
 2   Genre     1799 non-null   object
 3   Language  1800 non-null   object
dtypes: object(4)
memory usage: 56.4+ KB


In [None]:
df.duplicated().sum()

0

In [None]:
df_copy = df.copy()

In [None]:
# Merge all four columns into a single space-separated text field per movie.
# na_rep="" treats a missing value (the dataset has one null Genre) as an
# empty string; plain `+` concatenation would turn the whole row into NaN and
# lose that movie's title and synopsis as well.
df_copy['Text'] = df_copy['Title'].str.cat(
    df_copy[['Synopsis', 'Genre', 'Language']], sep=" ", na_rep=""
)

In [None]:
df_copy.drop(columns = ['Title', 'Synopsis', 'Genre', 'Language'], inplace=True)

### Preprocessing the words

In [None]:
import nltk

# NLTK data needed by the preprocessing step: the stop-word list, the
# tokenizer models used by word_tokenize, and the WordNet data used by the
# lemmatizer. 'punkt_tab' is the tokenizer resource looked up by newer NLTK
# releases; 'punkt' is kept for older ones.
for _resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yusir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yusir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yusir\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yusir\AppData\Roaming\nltk_data...


True

In [None]:
# Held as a set because clean() tests every token for membership; a set makes
# each test constant-time instead of a scan of the whole stop-word list.
en_stopwords = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean(text):
    """Normalise raw movie text into a space-separated string of lemmas.

    The input is converted to ``str``, stripped of every character that is
    not an ASCII letter, a digit or a space, lower-cased, tokenized, filtered
    against the English stop-word list and lemmatized.

    Parameters
    ----------
    text : object
        Text to clean; non-string values are passed through ``str()`` first.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # The digit range is 0-9 (not 1-9) so that zeros are kept and years such
    # as "2023" survive intact instead of collapsing to "223".
    text = re.sub(r"[^A-Za-z0-9 ]", "", str(text))
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in en_stopwords
    )

In [None]:
df_copy['Text'] = df_copy['Text'].apply(clean)

In [None]:
df_copy.head()

Unnamed: 0,Text
0,detective knight independence 223 detective ja...
1,alkhallat 223 arabic four tale explore tricker...
2,sorry demon 223 young man struggling broken he...
3,price pay 223 reap sow pawn shop robbery go as...
4,seriously red 222 think big dream bigger rayle...


### Recommender System

In [None]:
vectorizer = TfidfVectorizer()

In [None]:
matrix = vectorizer.fit_transform(df_copy['Text'])

In [None]:
# Replace the vectorizer output with its dense array form (one row per movie).
# NOTE(review): the dense copy grows with rows x vocabulary size -- confirm it
# is actually required by the steps below.
matrix = matrix.toarray()

### Using Nearest Neighbors

In [None]:
# Nearest-neighbour model returning 5 neighbours per query by default; the
# distance metric and search algorithm are left at the scikit-learn defaults.
nn = NearestNeighbors(n_neighbors=5)

In [None]:
nn.fit(matrix)

In [None]:
def recommender(text, n_neighbors=5):
    """Find the movies whose text is closest to a free-text description.

    The text is cleaned with ``clean``, projected into the fitted TF-IDF
    space and looked up in the fitted nearest-neighbour model. The query is
    not removed from the results, so when ``text`` is the text of a movie in
    the dataset that movie is normally the first hit.

    Parameters
    ----------
    text : str
        Raw description (title, synopsis, genre, language) of a movie.
    n_neighbors : int, default 5
        Number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_neighbors)`` holding the row positions of the
        closest movies, nearest first.
    """
    query = vectorizer.transform([clean(text)])
    return nn.kneighbors(X=query, n_neighbors=n_neighbors, return_distance=False)

In [None]:
movie_user_likes = "Project Wolf Hunting (2022) [Korean] During transport from the Philippines to South Korea, a group of dangerous criminals unites to stage a coordinated escape attempt. As the jailbreak escalates into a bloody, all-out riot, the fugitives and their allies from the outside exact a brutal terror campaign against the special agents onboard the ship. Project Wolf Hunting (2022) // The Wolf Hunting Action, Thriller, Sci-Fi Korean"

In [None]:
movie_2nd_user_likes = "Glass Onion: A Knives Out Mystery (2022) World-famous detective Benoit Blanc heads to Greece to peel back the layers of a mystery surrounding a tech billionaire and his eclectic crew of friends. Glass Onion: A Knives Out Mystery (2022) Comedy, Crime, Drama, Mystery, Thriller English"

In [None]:
recommender(movie_user_likes)

array([[ 100,  291,  890,  727, 1080]], dtype=int64)

In [None]:
df.iloc[[ 100,  291,  890,  727, 1080]]['Title']

100      Project Wolf Hunting (2022) [Korean]
291                      Hunt (2022) [Korean]
890                               Wolf (2021)
727                  Midnight (2021) [Korean]
1080    Escape from Mogadishu (2021) [Korean]
Name: Title, dtype: object

In [None]:
recommender(movie_2nd_user_likes)

array([[  30,  684,  688,  107, 1548]], dtype=int64)

In [None]:
df.iloc[[  30,  684,  688,  107, 1548]]['Title']

30        Glass Onion: A Knives Out Mystery (2022)
684              Dancing on Glass (2022) [Spanish]
688                      All the Old Knives (2022)
107                     A Christmas Mystery (2022)
1548    Scooby-Doo! The Sword and the Scoob (2021)
Name: Title, dtype: object

### Using Cosine Similarity

In [None]:
# Cosine similarity between every pair of rows of `matrix`; entry [i, j]
# compares movie i with movie j.
cosine_sim = cosine_similarity(matrix)

In [None]:
# Reverse lookup: movie title -> row index label in `df`.
indices = pd.Series(df.index, index=df['Title'])

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title of a movie present in ``df['Title']``.
    cosine_sim : array-like
        Square pairwise-similarity matrix whose row ``i`` holds the
        similarity of movie ``i`` to every movie.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first,
        never including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not in the dataset.
    """
    idx = indices[title]
    # A title that appears more than once makes the lookup return a Series of
    # row indices rather than a scalar; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = cosine_sim[idx]

    # Leave the queried movie out explicitly rather than assuming it sorts
    # first: another movie with an identical score could otherwise take its
    # place and the queried movie would be recommended to itself.
    candidates = [i for i in range(len(scores)) if i != idx]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    return df['Title'].iloc[ranked[:top_n]]

**For 1st User**

In [None]:
get_recommendations("Project Wolf Hunting (2022) [Korean]")

291                      Hunt (2022) [Korean]
890                               Wolf (2021)
727                  Midnight (2021) [Korean]
1080    Escape from Mogadishu (2021) [Korean]
442            The Most Dangerous Game (2022)
Name: Title, dtype: object

**For 2nd User**

In [None]:
get_recommendations("Glass Onion: A Knives Out Mystery (2022)")

684              Dancing on Glass (2022) [Spanish]
688                      All the Old Knives (2022)
107                     A Christmas Mystery (2022)
1548    Scooby-Doo! The Sword and the Scoob (2021)
859                                  Brazen (2022)
Name: Title, dtype: object