## Importing libraries

In [563]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
films = pd.read_csv('films_clean.csv')


## Defining Features for Machine Learning

In [564]:
films.columns

Index(['genres', 'id', 'original_language', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries', 'runtime',
       'spoken_languages', 'title', 'year', 'director', 'director_gender'],
      dtype='object')

In [514]:
#Dropping all columns I won't need and keeping other for feature engineering
films = films.drop(columns= ['original_title', 'original_language','production_countries', 'production_companies','runtime','spoken_languages', 'vote_average', 'vote_count', 'director'])

## Dropping null values

In [515]:
#Drop the movies without a description. I kept them so far because they were analyzed in the EDA.
films = films.drop(films[films['overview'] == 'No overview found.'].index)

## Splitting the dataset

In [516]:
# Split the dataset into male directed and female directed movies
female_directed= films[films['director_gender']== 'female']
male_directed= films[films['director_gender']== 'male']

## Vectorizing Categorical Data

In [562]:
#We will vectorize the 'genres' column
films.dtypes

genres              object
id                   int64
overview            object
popularity         float64
title               object
year                 int64
director_gender     object
dtype: object

In [517]:
genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(films['genres'])
male_genres = genre_vectorizer.transform(male_directed['genres']).toarray()
female_genres = genre_vectorizer.transform(female_directed['genres']).toarray()

{'comedy': 3,
 'romance': 14,
 'horror': 11,
 'action': 0,
 'adventure': 1,
 'drama': 6,
 'crime': 4,
 'thriller': 16,
 'fantasy': 8,
 'sciencefiction': 15,
 'history': 10,
 'war': 18,
 'foreign': 9,
 'mystery': 13,
 'family': 7,
 'documentary': 5,
 'western': 19,
 'music': 12,
 'animation': 2,
 'tvmovie': 17}

In [None]:
#Checking the vocabulary to see if all genres are there/no duplicates
genre_vectorizer.vocabulary_

In [518]:
male_genres = pd.DataFrame(male_genres, columns=genre_vectorizer.get_feature_names())
female_genres = pd.DataFrame(female_genres, columns=genre_vectorizer.get_feature_names())



## Conducting NLP on Film Summary

In [519]:
#I will use the stemmer and the ENGLISH_STOP_WORDS library to get rid of insignificant words and make my 
#recommender more robust


text = films['overview']
text_male = male_directed['overview']
text_female = female_directed['overview']


In [520]:
stemmer = EnglishStemmer()
default_analyzer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).build_analyzer()


In [521]:
def analyze_with_stemming(text):
    unstemmed_words = default_analyzer(text)
    return (stemmer.stem(word) for word in unstemmed_words)

In [522]:
#Example of text before stemming
text 

0       An ugly duckling having undergone a remarkable...
1       When a lawyer shows up at the vampire's doorst...
2       Morgan Adams and her slave, William Shaw, are ...
3       The life of the gambling paradise – Las Vegas ...
4       Rich Mr. Dashwood dies, leaving his second wif...
                              ...                        
9574    The Sublet is a suspense driven psychological ...
9575    A stranger named Silas flees from a devastatin...
9576    Pretty, popular, and slim high-schooler Aly Sc...
9577    Hyperactive teenager Kelly is enrolled into a ...
9578    Yet another version of the classic epic, with ...
Name: overview, Length: 9539, dtype: object

In [523]:
#Overview of what the stemming has done
list(analyze_with_stemming(text[0]))

['ugli',
 'duckl',
 'have',
 'undergon',
 'remark',
 'chang',
 'harbor',
 'feel',
 'crush',
 'carefre',
 'playboy',
 'busi',
 'focus',
 'brother',
 'say']

In [524]:
stemmer_vectorizer = CountVectorizer(analyzer=analyze_with_stemming)

In [525]:
#I am fitting on the films (including both genders)
#and am transforming them seperately.

vectors = stemmer_vectorizer.fit(text)
male_vectorized = vectors.transform(text_male).todense()
female_vectorized = vectors.transform(text_female).todense()

In [526]:
vocabulary = vectors.get_feature_names_out()

In [527]:
male_vectorized

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [528]:
male_vectorized = pd.DataFrame(male_vectorized, columns=vocabulary)
female_vectorized = pd.DataFrame(female_vectorized, columns=vocabulary)

In [529]:
male_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [530]:
female_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
656,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Normalizing Numerical Data: Popularity

In [531]:
m_normalized_popularity = male_directed['popularity']
f_normalized_popularity = female_directed['popularity']

In [532]:
m_normalized_popularity = m_normalized_popularity.values.reshape(-1, 1)

In [533]:
f_normalized_popularity = f_normalized_popularity.values.reshape(-1, 1)

In [534]:
from sklearn.preprocessing import StandardScaler

In [535]:
scaler = StandardScaler()
m_normalized_popularity = scaler.fit_transform(m_normalized_popularity)
f_normalized_popularity = scaler.fit_transform(f_normalized_popularity)

In [536]:
m_normalized_popularity = pd.DataFrame(m_normalized_popularity)
f_normalized_popularity = pd.DataFrame(f_normalized_popularity)

## Normalizing Numerical Data: Year

In [589]:
m_normalized_year = male_directed['year']
f_normalized_year = female_directed['year']

In [590]:
m_normalized_year = m_normalized_year.values.reshape(-1, 1)
f_normalized_year = f_normalized_year.values.reshape(-1, 1)

In [591]:
scaler = StandardScaler()
m_normalized_year = scaler.fit_transform(m_normalized_year)
f_normalized_year = scaler.fit_transform(f_normalized_year)

In [540]:
m_normalized_year = pd.DataFrame(m_normalized_year)
f_normalized_year = pd.DataFrame(f_normalized_year)

In [541]:
f_normalized_year

Unnamed: 0,0
0,-0.669476
1,-0.669476
2,-0.669476
3,-0.744984
4,-0.744984
...,...
653,0.538656
654,0.765181
655,0.387640
656,0.916198


## Normalizing Categorical Data

In [542]:
#The 'l2' norm, also known as the Euclidean norm, refers to the length of a vector in a Euclidean space. 
normalizer = Normalizer(norm='l2')

In [543]:
male_genres_overview = pd.concat([male_genres*7, male_vectorized], axis=1)
female_genres_overview = pd.concat([female_genres*7, female_vectorized], axis=1)

In [544]:
normalizer.fit_transform(male_genres_overview)
normalizer.fit_transform(female_genres_overview)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Concatenate and Create Dataframes Ready for Nearest Neighbors

In [545]:
female_directed.reset_index(drop=True, inplace=True)
male_directed.reset_index(inplace=True)

In [546]:
female_movies = pd.concat([female_genres_overview, f_normalized_year], axis=1)
male_movies = pd.concat([male_genres_overview, m_normalized_year], axis=1)


In [547]:
n_neighbors = 5
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(female_movies)




NearestNeighbors()

In [548]:
distances, indices = model.kneighbors(male_movies[0:1])



In [549]:
distances

array([[4.37562026, 4.47763762, 4.86643777, 4.90442975, 5.00000214]])

In [550]:
indices

array([[314, 652,   1, 284, 175]])

In [551]:
male_directed.iloc[[8880]]

Unnamed: 0,index,genres,id,overview,popularity,title,year,director_gender
8880,9578,"Action, Drama, Romance",30840,"Yet another version of the classic epic, with ...",5.683753,Robin Hood,1991,male


In [552]:
female_directed.iloc[[314, 652,   1, 284, 175]]

Unnamed: 0,genres,id,overview,popularity,title,year,director_gender
314,"Comedy, Romance",75802,A romantic comedy about the invention of the v...,14.331454,Hysteria,2011,female
652,"Comedy, Romance",72363,"A love triangle between a businessman, his wif...",1.18741,I'm Staying,2003,female
1,"Comedy, Romance",4482,"After learning of her husband's infidelities, ...",2.518051,French Twist,1995,female
284,"Comedy, Romance",14688,So called friends at a dinner party end up act...,2.878098,Change of Plans,2009,female
175,"Comedy, Romance",10111,A mockumentary that follows three couples as t...,1.873214,Confetti,2006,female


## User interface

In [553]:
male_directed.tail()

Unnamed: 0,index,genres,id,overview,popularity,title,year,director_gender
8876,9574,"Drama, Thriller, Mystery, Horror",365432,The Sublet is a suspense driven psychological ...,1.339355,The Sublet,2015,male
8877,9575,"Action, Thriller, Mystery, Horror",45527,A stranger named Silas flees from a devastatin...,1.270832,The Final Storm,2010,male
8878,9576,"Drama, Family, TVMovie",39562,"Pretty, popular, and slim high-schooler Aly Sc...",0.767762,To Be Fat Like Me,2007,male
8879,9577,Comedy,14008,Hyperactive teenager Kelly is enrolled into a ...,4.392389,Cadet Kelly,2002,male
8880,9578,"Action, Drama, Romance",30840,"Yet another version of the classic epic, with ...",5.683753,Robin Hood,1991,male


In [554]:
#print("Male_directed Movie Titles:")
#for i, row in male_directed.iterrows():
#    print(f"{i}: {row['title']}")

In [593]:
chosen_index = int(input("Enter the index of the movie you want: "))
chosen_movie = male_data[chosen_index]
chosen_distance = distances[0][chosen_index]
chosen_indices = indices[0][:3]

print(f"Chosen Movie: {chosen_movie}")
print(f"Distance: {chosen_distance}")
print(f"Indices: {chosen_indices}")

Chosen Movie: [<1x35153 sparse matrix of type '<class 'numpy.int64'>'
 	with 30 stored elements in Compressed Sparse Row format>
 0.15648791403569726]
Distance: 4.375620262071841
Indices: [314 652   1]


In [556]:
#def get_title_from_index(index):
#    return male_directed[male_directed.index == index]['title'].values[0]
#def get_index_from_title(title):
#    return male_directed[male_directed.title == title]['index'].values[0]
#
#
##input("Enter the title of the movie you want: ")



In [557]:
get_index_from_title('Sabrina')

0

In [558]:
#def get_index_from_title(title):
#    return male_directed[male_directed.title == title]['index'].values[0]
#
#def main():
#    title = input("Enter the title: ")
#    index = get_index_from_title(title)
#    print("Index:", index)

In [559]:
#def get_index_from_title(title):
#    return male_directed[male_directed['title']== title]['index'].values[0]
#
#def main():
#    title = input("Enter the title: ")
#    try:
#        index = get_index_from_title(title)
#        print("Index:", index)
#    except IndexError:
#        print("The movie does not exist in the database.")
#        print("Here are some recommended movies:")
#        # Assuming you have a dataframe of recommended movies called 'recommended_movies'
#        recommended_movies = male_directed.sample(5)  # Change the number to specify how many recommendations you want
#        print(recommended_movies[['title', 'index']])
#
#if __name__ == "__main__":
#    main()




In [561]:
#import streamlit as st
#import pandas as pd
#
#def get_index_from_title(title):
#    return male_directed[male_directed['title'] == title]['index'].values[0]
#
#def main():
#    st.title("Movie Recommendation")
#    title = st.text_input("Enter the movie title:")
#    
#    if st.button("Get Index"):
#        try:
#            index = get_index_from_title(title)
#            st.success(f"The index for the movie '{title}' is {index}.")
#        except IndexError:
#            st.error("The movie does not exist in the database.")
#            st.subheader("Recommended Movies:")
#            # Assuming you have a dataframe of recommended movies called 'recommended_movies'
#            recommended_movies = male_directed.sample(5)  # Change the number to specify how many recommendations you want
#            st.table(recommended_movies[['title', 'index']])
#
#if __name__ == "__main__":
#    main()