## Importing libraries

In [306]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
films = pd.read_csv('films_clean.csv')


## Defining Features for Machine Learning

In [307]:
films.columns

Index(['genres', 'id', 'original_language', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries', 'runtime',
       'spoken_languages', 'title', 'year', 'director', 'director_gender'],
      dtype='object')

In [308]:
#Dropping all columns I won't need and keeping other for feature engineering
films = films.drop(columns= ['original_title', 'original_language', 'production_companies','runtime','spoken_languages', 'director'])

## Dropping null values

In [309]:
#Drop the movies without a description. I kept them so far because they were analyzed in the EDA.
films = films.drop(films[films['overview'] == 'No overview found.'].index)

## Splitting the dataset

In [310]:
# Split the dataset into male directed and female directed movies
female_directed= films[films['director_gender']== 'female']
male_directed= films[films['director_gender']== 'male']

## Vectorizing Categorical Data

In [311]:
#We will vectorize the 'genres' column
films.dtypes

genres                   object
id                        int64
overview                 object
popularity              float64
production_countries     object
title                    object
year                      int64
director_gender          object
dtype: object

In [312]:
genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(films['genres'])
male_genres = genre_vectorizer.transform(male_directed['genres']).toarray()
female_genres = genre_vectorizer.transform(female_directed['genres']).toarray()

In [313]:
#Checking the vocabulary to see if all genres are there/no duplicates
genre_vectorizer.vocabulary_

{'comedy': 3,
 'romance': 14,
 'horror': 11,
 'action': 0,
 'adventure': 1,
 'drama': 6,
 'crime': 4,
 'thriller': 16,
 'fantasy': 8,
 'sciencefiction': 15,
 'history': 10,
 'war': 18,
 'foreign': 9,
 'mystery': 13,
 'family': 7,
 'documentary': 5,
 'western': 19,
 'music': 12,
 'animation': 2,
 'tvmovie': 17}

In [314]:
male_genres = pd.DataFrame(male_genres, columns=genre_vectorizer.get_feature_names())
female_genres = pd.DataFrame(female_genres, columns=genre_vectorizer.get_feature_names())



## Conducting NLP on Film Summary

In [315]:
#I will use the stemmer and the ENGLISH_STOP_WORDS library to get rid of insignificant words and make my 
#recommender more robust


text = films['overview']
text_male = male_directed['overview']
text_female = female_directed['overview']


In [316]:
stemmer = EnglishStemmer()
default_analyzer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).build_analyzer()


In [317]:
def analyze_with_stemming(text):
    unstemmed_words = default_analyzer(text)
    return (stemmer.stem(word) for word in unstemmed_words)

In [318]:
#Example of text before stemming
text 

0       An ugly duckling having undergone a remarkable...
1       When a lawyer shows up at the vampire's doorst...
2       Morgan Adams and her slave, William Shaw, are ...
3       The life of the gambling paradise – Las Vegas ...
4       Rich Mr. Dashwood dies, leaving his second wif...
                              ...                        
9574    The Sublet is a suspense driven psychological ...
9575    A stranger named Silas flees from a devastatin...
9576    Pretty, popular, and slim high-schooler Aly Sc...
9577    Hyperactive teenager Kelly is enrolled into a ...
9578    Yet another version of the classic epic, with ...
Name: overview, Length: 9539, dtype: object

In [319]:
#Overview of what the stemming has done
list(analyze_with_stemming(text[0]))

['ugli',
 'duckl',
 'have',
 'undergon',
 'remark',
 'chang',
 'harbor',
 'feel',
 'crush',
 'carefre',
 'playboy',
 'busi',
 'focus',
 'brother',
 'say']

In [320]:
stemmer_vectorizer = CountVectorizer(analyzer=analyze_with_stemming)

In [321]:
#I am fitting on the films (including both genders)
#and am transforming them seperately.

vectors = stemmer_vectorizer.fit(text)
male_vectorized = vectors.transform(text_male).todense()
female_vectorized = vectors.transform(text_female).todense()

In [322]:
vocabulary = vectors.get_feature_names_out()

In [323]:
male_vectorized

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [324]:
male_vectorized = pd.DataFrame(male_vectorized, columns=vocabulary)
female_vectorized = pd.DataFrame(female_vectorized, columns=vocabulary)

In [325]:
male_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [326]:
female_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
656,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Normalizing Numerical Data: Popularity

In [327]:
m_normalized_popularity = male_directed['popularity']
f_normalized_popularity = female_directed['popularity']

In [328]:
m_normalized_popularity = m_normalized_popularity.values.reshape(-1, 1)

In [329]:
f_normalized_popularity = f_normalized_popularity.values.reshape(-1, 1)

In [330]:
from sklearn.preprocessing import StandardScaler

In [331]:
scaler = StandardScaler()
m_normalized_popularity = scaler.fit_transform(m_normalized_popularity)
f_normalized_popularity = scaler.fit_transform(f_normalized_popularity)

In [332]:
m_normalized_popularity = pd.DataFrame(m_normalized_popularity)
f_normalized_popularity = pd.DataFrame(f_normalized_popularity)

## Normalizing Numerical Data: Year

In [333]:
m_normalized_year = male_directed['year']
f_normalized_year = female_directed['year']

In [334]:
m_normalized_year = m_normalized_year.values.reshape(-1, 1)
f_normalized_year = f_normalized_year.values.reshape(-1, 1)

In [335]:
scaler = StandardScaler()
m_normalized_year = scaler.fit_transform(m_normalized_year)
f_normalized_year = scaler.fit_transform(f_normalized_year)

In [336]:
m_normalized_year = pd.DataFrame(m_normalized_year)
f_normalized_year = pd.DataFrame(f_normalized_year)

In [337]:
f_normalized_year

Unnamed: 0,0
0,-0.669476
1,-0.669476
2,-0.669476
3,-0.744984
4,-0.744984
...,...
653,0.538656
654,0.765181
655,0.387640
656,0.916198


## Normalizing Categorical Data

In [338]:
#The 'l2' norm, also known as the Euclidean norm, refers to the length of a vector in a Euclidean space. 
normalizer = Normalizer(norm='l2')

In [339]:
male_genres_overview = pd.concat([male_genres*5, male_vectorized], axis=1)
female_genres_overview = pd.concat([female_genres*5, female_vectorized], axis=1)

In [None]:
normalizer.fit_transform(male_genres_overview)
normalizer.fit_transform(female_genres_overview)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Concatenate and Create Dataframes Ready for Nearest Neighbors

In [None]:
female_directed.reset_index(drop=True, inplace=True)
male_directed.reset_index(inplace=True, drop=True,)

In [None]:
female_movies = pd.concat([female_genres_overview, f_normalized_year], axis=1)
male_movies = pd.concat([male_genres_overview, m_normalized_year], axis=1)


In [None]:
n_neighbors = 5
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(female_movies)




NearestNeighbors()

In [None]:
distances, indices = model.kneighbors(male_movies[0:1])



In [None]:
distances

array([[4.37562026, 4.47763762, 4.86643777, 4.90442975, 5.00000214]])

In [None]:
indices

array([[314, 652,   1, 284, 175]])

In [None]:
female_directed.iloc[[314, 652,   1, 284, 175]]

Unnamed: 0,genres,id,overview,popularity,production_countries,title,year,director_gender
314,"Comedy, Romance",75802,A romantic comedy about the invention of the v...,14.331454,"Germany, France, United Kingdom, Switzerland, ...",Hysteria,2011,female
652,"Comedy, Romance",72363,"A love triangle between a businessman, his wif...",1.18741,France,I'm Staying,2003,female
1,"Comedy, Romance",4482,"After learning of her husband's infidelities, ...",2.518051,France,French Twist,1995,female
284,"Comedy, Romance",14688,So called friends at a dinner party end up act...,2.878098,France,Change of Plans,2009,female
175,"Comedy, Romance",10111,A mockumentary that follows three couples as t...,1.873214,United Kingdom,Confetti,2006,female


## User interface

In [None]:
def get_title_and_overview_from_index(index, dataset):
    matching_records = dataset[dataset.index == index]
    return matching_records[['title', 'overview', 'genres', 'year', 'production_countries','popularity']]

def get_index_from_title(title, dataset):
    matching_records = dataset[dataset.title == title]
    return matching_records.index[0]

user_title = input("Please enter a movie: ")
user_movie_index = get_index_from_title(user_title, male_directed)
user_movie_details = get_title_and_overview_from_index(user_movie_index, male_directed)

indices = get_index_from_title(user_title, male_directed)

distances, indices = model.kneighbors(male_movies.iloc[indices:indices+1])
recommended_records = [get_title_and_overview_from_index(index, female_directed) for index in indices[0]]

print("Movie Entered by User:")
print('Title:', user_movie_details['title'].values[0])
print("Overview:", user_movie_details['overview'].values[0])
print('Genres:', user_movie_details['genres'].values[0])
print('Year:', user_movie_details['year'].values[0])
print('Countri(es):', user_movie_details['production_countries'].values[0])
print('Popularity:', user_movie_details['popularity'].values[0])
print()

for record in recommended_records:
    print("Recommended Movie:")
    print('Title:', record['title'].values[0])
    print("Overview:", record['overview'].values[0])
    print('Genres:', record['genres'].values[0])
    print('Year:', record['year'].values[0])
    print('Countri(es):', record['production_countries'].values[0])
    print('Popularity:', record['popularity'].values[0])
    print()

Movie Entered by User:
Title: Robin Hood
Overview: When soldier Robin happens upon the dying Robert of Loxley, he promises to return the man's sword to his family in Nottingham. There, he assumes Robert's identity; romances his widow, Marion; and draws the ire of the town's sheriff and King John's henchman, Godfrey.
Genres: Action, Adventure
Year: 2010
Countri(es): United Kingdom, United States of America
Popularity: 10.56812

Recommended Movie:
Title: Spies of Warsaw
Overview: A military attaché at the French embassy is drawn into a world of abduction, betrayal and intrigue in the diplomatic salons and back alleys of Warsaw.
Genres: Action, Drama, Adventure
Year: 2013
Countri(es): United Kingdom, United States of America, Poland
Popularity: 2.224206

Recommended Movie:
Title: Goliath and the Sins of Babylon
Overview: Goliath battles for the freedom of the Babylonian people.
Genres: Adventure
Year: 1963
Countri(es): Italy
Popularity: 0.068376

Recommended Movie:
Title: Black Beauty
Ove

