## Importing libraries

In [563]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
films = pd.read_csv('films_clean.csv')


## Defining Features for Machine Learning

In [564]:
films.columns

Index(['genres', 'id', 'original_language', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries', 'runtime',
       'spoken_languages', 'title', 'year', 'director', 'director_gender'],
      dtype='object')

In [565]:
#Dropping all columns I won't need and keeping other for feature engineering
films = films.drop(columns= ['original_title', 'original_language', 'production_companies','runtime','spoken_languages', 'director'])

## Dropping null values

In [566]:
#Drop the movies without a description. I kept them so far because they were analyzed in the EDA.
films = films.drop(films[films['overview'] == 'No overview found.'].index)

## Splitting the dataset

In [567]:
# Split the dataset into male directed and female directed movies
female_directed= films[films['director_gender']== 'female']
male_directed= films[films['director_gender']== 'male']

In [568]:
#Dropping the movies with erroneous movies summaries
index_to_drop = female_directed[female_directed['title'] == 'Tomorrow'].index
index_to_drop = female_directed[female_directed['title'] == 'Titanic'].index
index_to_drop = female_directed[female_directed['title'] == 'Lily Sometimes'].index
female_directed.drop(index_to_drop, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  female_directed.drop(index_to_drop, inplace=True)


## Vectorizing Categorical Data

In [569]:
#We will vectorize the 'genres' column
films.dtypes

genres                   object
id                        int64
overview                 object
popularity              float64
production_countries     object
title                    object
year                      int64
director_gender          object
dtype: object

In [570]:
genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(films['genres'])
male_genres = genre_vectorizer.transform(male_directed['genres']).toarray()
female_genres = genre_vectorizer.transform(female_directed['genres']).toarray()

In [571]:
#Checking the vocabulary to see if all genres are there/no duplicates
genre_vectorizer.vocabulary_

{'comedy': 3,
 'romance': 14,
 'horror': 11,
 'action': 0,
 'adventure': 1,
 'drama': 6,
 'crime': 4,
 'thriller': 16,
 'fantasy': 8,
 'sciencefiction': 15,
 'history': 10,
 'war': 18,
 'foreign': 9,
 'mystery': 13,
 'family': 7,
 'documentary': 5,
 'western': 19,
 'music': 12,
 'animation': 2,
 'tvmovie': 17}

In [572]:
male_genres = pd.DataFrame(male_genres, columns=genre_vectorizer.get_feature_names())
female_genres = pd.DataFrame(female_genres, columns=genre_vectorizer.get_feature_names())



## Conducting NLP on Film Summary

In [573]:
#I will use the stemmer and the ENGLISH_STOP_WORDS library to get rid of insignificant words and make my 
#recommender more robust


text = films['overview']
text_male = male_directed['overview']
text_female = female_directed['overview']


In [574]:
stemmer = EnglishStemmer()
default_analyzer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).build_analyzer()


In [575]:
def analyze_with_stemming(text):
    unstemmed_words = default_analyzer(text)
    return (stemmer.stem(word) for word in unstemmed_words)

In [576]:
#Example of text before stemming
text 

0       An ugly duckling having undergone a remarkable...
1       When a lawyer shows up at the vampire's doorst...
2       Morgan Adams and her slave, William Shaw, are ...
3       The life of the gambling paradise – Las Vegas ...
4       Rich Mr. Dashwood dies, leaving his second wif...
                              ...                        
9574    The Sublet is a suspense driven psychological ...
9575    A stranger named Silas flees from a devastatin...
9576    Pretty, popular, and slim high-schooler Aly Sc...
9577    Hyperactive teenager Kelly is enrolled into a ...
9578    Yet another version of the classic epic, with ...
Name: overview, Length: 9539, dtype: object

In [577]:
#Overview of what the stemming has done
list(analyze_with_stemming(text[0]))

['ugli',
 'duckl',
 'have',
 'undergon',
 'remark',
 'chang',
 'harbor',
 'feel',
 'crush',
 'carefre',
 'playboy',
 'busi',
 'focus',
 'brother',
 'say']

In [578]:
stemmer_vectorizer = CountVectorizer(analyzer=analyze_with_stemming)

In [579]:
#I am fitting on the films (including both genders)
#and am transforming them seperately.

vectors = stemmer_vectorizer.fit(text)
male_vectorized = vectors.transform(text_male).todense()
female_vectorized = vectors.transform(text_female).todense()

In [580]:
vocabulary = vectors.get_feature_names_out()

In [581]:
male_vectorized

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [582]:
male_vectorized = pd.DataFrame(male_vectorized, columns=vocabulary)
female_vectorized = pd.DataFrame(female_vectorized, columns=vocabulary)

In [583]:
male_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [584]:
female_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Normalizing Numerical Data: Popularity

In [585]:
m_normalized_popularity = male_directed['popularity']
f_normalized_popularity = female_directed['popularity']

In [586]:
m_normalized_popularity = m_normalized_popularity.values.reshape(-1, 1)

In [587]:
f_normalized_popularity = f_normalized_popularity.values.reshape(-1, 1)

In [588]:
from sklearn.preprocessing import StandardScaler

In [589]:
scaler = StandardScaler()
m_normalized_popularity = scaler.fit_transform(m_normalized_popularity)
f_normalized_popularity = scaler.fit_transform(f_normalized_popularity)

In [590]:
m_normalized_popularity = pd.DataFrame(m_normalized_popularity)
f_normalized_popularity = pd.DataFrame(f_normalized_popularity)

## Normalizing Numerical Data: Year

In [591]:
m_normalized_year = male_directed['year']
f_normalized_year = female_directed['year']

In [592]:
m_normalized_year = m_normalized_year.values.reshape(-1, 1)
f_normalized_year = f_normalized_year.values.reshape(-1, 1)

In [593]:
scaler = StandardScaler()
m_normalized_year = scaler.fit_transform(m_normalized_year)
f_normalized_year = scaler.fit_transform(f_normalized_year)

In [594]:
m_normalized_year = pd.DataFrame(m_normalized_year)
f_normalized_year = pd.DataFrame(f_normalized_year)

In [595]:
f_normalized_year

Unnamed: 0,0
0,-0.668372
1,-0.668372
2,-0.668372
3,-0.743835
4,-0.743835
...,...
652,0.539039
653,0.765429
654,0.388113
655,0.916355


## Normalizing Categorical Data

In [596]:
#The 'l2' norm, also known as the Euclidean norm, refers to the length of a vector in a Euclidean space. 
normalizer = Normalizer(norm='l2')

In [597]:
male_genres_overview = pd.concat([male_genres*3, male_vectorized], axis=1)
female_genres_overview = pd.concat([female_genres*3, female_vectorized], axis=1)

In [598]:
normalizer.fit_transform(male_genres_overview)
normalizer.fit_transform(female_genres_overview)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Concatenate and Create Dataframes Ready for Nearest Neighbors

In [599]:
female_directed.reset_index(drop=True, inplace=True)
male_directed.reset_index(inplace=True, drop=True,)

In [600]:
female_movies = pd.concat([female_genres_overview, f_normalized_year], axis=1)
male_movies = pd.concat([male_genres_overview, m_normalized_year], axis=1)


In [601]:
n_neighbors = 5
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(female_movies)




NearestNeighbors()

In [602]:
distances, indices = model.kneighbors(male_movies[0:1])



In [603]:
distances

array([[4.37565373, 4.47760083, 4.86625048, 4.90445207, 5.00000274]])

In [604]:
indices

array([[314, 651,   1, 284, 175]])

In [605]:
male_directed

Unnamed: 0,genres,id,overview,popularity,production_countries,title,year,director_gender
0,"Comedy, Romance",11860,An ugly duckling having undergone a remarkable...,6.677277,"Germany, United States of America",Sabrina,1995,male
1,"Comedy, Horror",12110,When a lawyer shows up at the vampire's doorst...,5.430331,"France, United States of America",Dracula: Dead and Loving It,1995,male
2,"Action, Adventure",1408,"Morgan Adams and her slave, William Shaw, are ...",7.284477,"Germany, France, Italy, United States of America",Cutthroat Island,1995,male
3,"Drama, Crime",524,The life of the gambling paradise – Las Vegas ...,10.137389,"France, United States of America",Casino,1995,male
4,"Drama, Romance",4584,"Rich Mr. Dashwood dies, leaving his second wif...",10.673167,"United Kingdom, United States of America",Sense and Sensibility,1995,male
...,...,...,...,...,...,...,...,...
8876,"Drama, Thriller, Mystery, Horror",365432,The Sublet is a suspense driven psychological ...,1.339355,Canada,The Sublet,2015,male
8877,"Action, Thriller, Mystery, Horror",45527,A stranger named Silas flees from a devastatin...,1.270832,"Germany, Canada",The Final Storm,2010,male
8878,"Drama, Family, TVMovie",39562,"Pretty, popular, and slim high-schooler Aly Sc...",0.767762,Canada,To Be Fat Like Me,2007,male
8879,Comedy,14008,Hyperactive teenager Kelly is enrolled into a ...,4.392389,"United States of America, Canada",Cadet Kelly,2002,male


## User interface

In [609]:
import random

def get_title_and_overview_from_index(index, dataset):
    matching_records = dataset[dataset.index == index]
    return matching_records[['title', 'overview', 'genres', 'year', 'production_countries','popularity']]

def get_index_from_title(title, dataset):
    matching_records = dataset[dataset.title == title]
    return matching_records.index[0] if len(matching_records) > 0 else None

user_title = input("Please enter a movie: ")
user_movie_index = get_index_from_title(user_title, male_directed)

if user_movie_index is None:
    print("Invalid movie. Here are 10 random titles from male_directed:")
    random_titles = male_directed.sample(10)['title'].values
    for title in random_titles:
        print(title)
else:
    user_movie_details = get_title_and_overview_from_index(user_movie_index, male_directed)
    indices = get_index_from_title(user_title, male_directed)

    distances, indices = model.kneighbors(male_movies.iloc[indices:indices+1])
    recommended_records = [get_title_and_overview_from_index(index, female_directed) for index in indices[0]]

    print("Movie Entered by User:")
    print('Title:', user_movie_details['title'].values[0])
    print("Overview:", user_movie_details['overview'].values[0])
    print('Genres:', user_movie_details['genres'].values[0])
    print('Year:', user_movie_details['year'].values[0])
    print('Countri(es):', user_movie_details['production_countries'].values[0])
    print('Popularity:', user_movie_details['popularity'].values[0])
    print()

    for record in recommended_records:
        print("Recommended Movie:")
        print('Title:', record['title'].values[0])
        print("Overview:", record['overview'].values[0])
        print('Genres:', record['genres'].values[0])
        print('Year:', record['year'].values[0])
        print('Countri(es):', record['production_countries'].values[0])
        print('Popularity:', record['popularity'].values[0])
        print()

Movie Entered by User:
Title: The Fifth Element
Overview: In 2257, a taxi driver is unintentionally given the task of saving a young girl who is part of the key that will ensure the survival of humanity.
Genres: Action, Adventure, Thriller, Fantasy, ScienceFiction
Year: 1997
Countri(es): France
Popularity: 24.30526

Recommended Movie:
Title: Hisss
Overview: Based on the Far Eastern myth of the snake woman who is able to take on human form.
Genres: Thriller, Fantasy, Horror
Year: 2010
Countri(es): United States of America, India
Popularity: 0.403939

Recommended Movie:
Title: Riverworld
Overview: A movie for the Sci Fi Channel based on the book series by Philip José Farmer. The location is Riverworld, a mysterious and treacherous land where every human who died between the years 99,000 BC and 2,200 AD has been resurrected on the banks of a huge river.
Genres: Thriller, Fantasy, ScienceFiction
Year: 2003
Countri(es): Canada
Popularity: 2.540875

Recommended Movie:
Title: 88
Overview: A y



In [607]:
def get_title_and_overview_from_index(index, dataset):
    matching_records = dataset[dataset.index == index]
    return matching_records[['title', 'overview', 'genres', 'year', 'production_countries', 'popularity']]

def get_index_from_title(title, dataset):
    matching_records = dataset[dataset.title == title]
    return matching_records.index[0] if len(matching_records) > 0 else None

user_title = input("Please enter a movie: ")
user_movie_index = get_index_from_title(user_title, male_directed)

if user_movie_index is None:
    print("Invalid movie entered. Suggestions:")
    suggested_movies = male_directed.head(10)
    for index, record in suggested_movies.iterrows():
        print('Title:', record['title'])
        print('Overview:', record['overview'])
        print('Genres:', record['genres'])
        print('Year:', record['year'])
        print('Countries:', record['production_countries'])
        print('Popularity:', record['popularity'])
        print()
else:
    user_movie_details = get_title_and_overview_from_index(user_movie_index, male_directed)

    indices = get_index_from_title(user_title, male_directed)

    distances, indices = model.kneighbors(male_movies.iloc[indices:indices+1])
    recommended_records = [get_title_and_overview_from_index(index, female_directed) for index in indices[0]]

    print("Movie Entered by User:")
    print('Title:', user_movie_details['title'].values[0])
    print("Overview:", user_movie_details['overview'].values[0])
    print('Genres:', user_movie_details['genres'].values[0])
    print('Year:', user_movie_details['year'].values[0])
    print('Countries:', user_movie_details['production_countries'].values[0])
    print('Popularity:', user_movie_details['popularity'].values[0])
    print()

    for record in recommended_records:
        print("Recommended Movie:")
        print('Title:', record['title'].values[0])
        print("Overview:", record['overview'].values[0])
        print('Genres:', record['genres'].values[0])
        print('Year:', record['year'].values[0])
        print('Countries:', record['production_countries'].values[0])
        print('Popularity:', record['popularity'].values[0])
        print()

Invalid movie entered. Suggestions:
Title: Sabrina
Overview: An ugly duckling having undergone a remarkable change, still harbors feelings for her crush: a carefree playboy, but not before his business-focused brother has something to say about it.
Genres: Comedy, Romance
Year: 1995
Countries: Germany, United States of America
Popularity: 6.677277

Title: Dracula: Dead and Loving It
Overview: When a lawyer shows up at the vampire's doorstep, he falls prey to his charms and joins him in his search for fresh blood. Enter Dr. van Helsing, who may be the only one able to vanquish the count.
Genres: Comedy, Horror
Year: 1995
Countries: France, United States of America
Popularity: 5.430331

Title: Cutthroat Island
Overview: Morgan Adams and her slave, William Shaw, are on a quest to recover the three portions of a treasure map. Unfortunately, the final portion is held by her murderous uncle, Dawg. Her crew is skeptical of her leadership abilities, so she must complete her quest before they m