In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import ast

### Load dataset

In [137]:
# Read each file in a single pass (low_memory=False) so pandas infers one dtype
# per column instead of guessing chunk by chunk, which is what triggers the
# mixed-types DtypeWarning on the metadata file.
movies_df = pd.read_csv('movies_metadata.csv', low_memory=False)
actors_df = pd.read_csv('credits.csv', low_memory=False)

  movies_df=pd.read_csv('movies_metadata.csv')


Select the columns that seem interesting

In [160]:
# List every column of the metadata table before choosing which ones to keep.
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [138]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so the in-place edits and column assignments in later
# cells do not operate on a slice of the raw table (SettingWithCopyWarning).
movies_df = movies_df[['genres', 'id', 'imdb_id', 'title', 'overview', 'release_date']].copy()

Drop null rows

In [139]:
# Discard every movie missing any of the fields the later cells rely on.
movies_df = movies_df.dropna(
    subset=['title', 'overview', 'release_date', 'imdb_id'],
)

In [140]:
def get_genre(input_list):
    """Extract genre names from a stringified list of genre dicts.

    Parameters
    ----------
    input_list : str or list
        Either a Python-literal string such as
        ``"[{'id': 16, 'name': 'Animation'}]"`` or the already-parsed list
        of dicts. Any other value (e.g. NaN for a missing cell) is treated
        as "no genres".

    Returns
    -------
    list of str
        The ``'name'`` of every entry with spaces removed, so that a
        multi-word genre stays a single token (``'ScienceFiction'``).
        Empty list when there are no genres.

    Raises
    ------
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    # Only strings need parsing; ast.literal_eval rejects other objects.
    if isinstance(input_list, str):
        input_list = ast.literal_eval(input_list)
    # Missing cells (NaN/None) or unexpected scalars carry no genres.
    if not isinstance(input_list, (list, tuple)):
        return []
    return [genre['name'].replace(" ", "") for genre in input_list]
# Replace the raw genre string of each movie with its parsed list of genre names.
movies_df['genres'] = movies_df['genres'].apply(get_genre)

In [141]:
# Add column with actual IMDB URL
# Prefix every IMDB id with the title URL to obtain a clickable link.
movies_df['imdb_link'] = movies_df['imdb_id'].apply(
    lambda imdb_id: 'https://www.imdb.com/title/' + imdb_id
)

In [142]:
# The release date starts with the year (text before the first '-');
# rounding that year down to a multiple of ten gives the decade.
movies_df['decade'] = movies_df['release_date'].apply(
    lambda release_date: int(release_date.split('-')[0]) // 10 * 10
)

In [143]:
# Cast the movie id to int before joining on it
# (presumably the credits `id` column is numeric -- TODO confirm).
movies_df = movies_df.astype({'id': 'int'})
# Left join: every movie is kept; movies without credits get NaN cast/crew.
# NOTE(review): if `id` is not unique in either table this join duplicates rows -- verify.
movies_df = movies_df.merge(actors_df[['cast', 'crew', 'id']], how='left', on='id')

In [144]:
# Movies that found no match in the credits table have NaN cast/crew; remove them.
movies_df = movies_df.dropna(subset=['cast', 'crew'])

In [145]:
def get_actors(input_list, max_actors=3):
    """Return the names of the first few cast members of a movie.

    Parameters
    ----------
    input_list : str or list
        Either a Python-literal string holding a list of cast dicts (each
        with a ``'name'`` key) or the already-parsed list. Any other value
        (e.g. NaN for a missing cell) is treated as "no cast".
    max_actors : int, optional
        Maximum number of names to return, taken in the order they are
        listed. Defaults to 3.

    Returns
    -------
    list of str
        Up to ``max_actors`` names with spaces removed, so that each actor
        stays a single token (``'TomHanks'``). Empty list when there is
        no cast.

    Raises
    ------
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    # Only strings need parsing; ast.literal_eval rejects other objects.
    if isinstance(input_list, str):
        input_list = ast.literal_eval(input_list)
    # Missing cells (NaN/None) or unexpected scalars carry no cast.
    if not isinstance(input_list, (list, tuple)):
        return []
    return [member['name'].replace(" ", "") for member in input_list[:max_actors]]
# Derive the leading-actors list of each movie from its raw cast string.
movies_df['actors'] = movies_df['cast'].apply(get_actors)

In [146]:
def get_director(input_list):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    input_list : str or list
        Either a Python-literal string holding a list of crew dicts (with
        ``'job'`` and ``'name'`` keys) or the already-parsed list. Any other
        value (e.g. NaN for a missing cell) is treated as "no crew".

    Returns
    -------
    str or list
        The director's name with spaces removed (``'JohnLasseter'``).
        When no director is listed, an empty list is returned; this mixed
        return type is kept for compatibility with existing callers.

    Raises
    ------
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    # Only strings need parsing; ast.literal_eval rejects other objects.
    if isinstance(input_list, str):
        input_list = ast.literal_eval(input_list)
    # Missing cells (NaN/None) or unexpected scalars carry no crew.
    if not isinstance(input_list, (list, tuple)):
        return []
    for member in input_list:
        # .get() tolerates crew entries that lack a 'job' key.
        if member.get('job') == 'Director':
            return member['name'].replace(" ", "")
    return []
# Derive the director of each movie from its raw crew string.
movies_df['director'] = movies_df['crew'].apply(get_director)

In [147]:
# The raw cast/crew strings and the bare IMDB id have been turned into
# derived columns above, so they are no longer needed.
movies_df = movies_df.drop(columns=['cast', 'crew', 'imdb_id'])
movies_df.head()

Unnamed: 0,genres,id,title,overview,release_date,imdb_link,decade,actors,director
0,"[Animation, Comedy, Family]",862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,https://www.imdb.com/title/tt0114709,1990,"[TomHanks, TimAllen, DonRickles]",JohnLasseter
1,"[Adventure, Fantasy, Family]",8844,Jumanji,When siblings Judy and Peter discover an encha...,1995-12-15,https://www.imdb.com/title/tt0113497,1990,"[RobinWilliams, JonathanHyde, KirstenDunst]",JoeJohnston
2,"[Romance, Comedy]",15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,1995-12-22,https://www.imdb.com/title/tt0113228,1990,"[WalterMatthau, JackLemmon, Ann-Margret]",HowardDeutch
3,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,https://www.imdb.com/title/tt0114885,1990,"[WhitneyHouston, AngelaBassett, LorettaDevine]",ForestWhitaker
4,[Comedy],11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,1995-02-10,https://www.imdb.com/title/tt0113041,1990,"[SteveMartin, DianeKeaton, MartinShort]",CharlesShyer


### NLP of overview column

In [148]:
import unidecode
from langdetect import detect
import contractions
import spacy
import re
from nltk.stem.snowball import SnowballStemmer

def preprocess_overview(text, nlp, stemmer):
    """Reduce an English overview to a list of unique stemmed nouns.

    Parameters
    ----------
    text : str
        Raw overview text. Non-string values (e.g. NaN, None) are rejected.
    nlp : callable
        Called on the cleaned text; must yield tokens exposing ``text``,
        ``is_stop``, ``pos_`` and ``lemma_`` (a loaded spaCy pipeline).
    stemmer : object
        Must provide ``stem(word)`` (e.g. an NLTK SnowballStemmer).

    Returns
    -------
    list of str or int
        Sorted list of unique stems of the nouns that are not stop words
        and are longer than 3 characters. The integer ``0`` is returned as
        a sentinel when the text is missing, has 10 characters or fewer,
        is not detected as English, or yields no token.
    """
    # Missing values and very short texts carry no usable signal.
    if not isinstance(text, str) or len(text) <= 10:
        return 0

    # Imported locally so this function does not depend on another cell
    # having brought the exception class into scope.
    from langdetect.lang_detect_exception import LangDetectException
    try:
        if detect(text) != 'en':
            return 0
    except LangDetectException:
        # langdetect raises when the text has no usable features
        # (e.g. only digits or punctuation); treat it as non-English.
        return 0

    # Remove accented characters
    text = unidecode.unidecode(text)
    # Set to lowercase
    text = text.lower()
    # Replace every non-word character and every digit with a space
    text = re.sub(r'[^\w]|[\d]', ' ', text)

    unique_tokens = set()
    for token in nlp(text):
        # Keep only informative nouns: no stop words, more than 3 characters.
        if token.is_stop or token.pos_ != 'NOUN' or len(token.text) <= 3:
            continue
        if token.lemma_ == '-PRON-':
            # Pronoun placeholder lemma: keep the surface form unstemmed.
            unique_tokens.add(token.text)
        else:
            unique_tokens.add(stemmer.stem(token.text))

    if not unique_tokens:
        return 0
    # Sorted so the output is reproducible from run to run (set order is not).
    return sorted(unique_tokens)

In [149]:
from langdetect import DetectorFactory

# langdetect is non-deterministic unless seeded; fix the seed before the first
# detection so the set of rows kept as English is the same on every run.
DetectorFactory.seed = 0

nlp = spacy.load("en_core_web_sm")
stemmer = SnowballStemmer(language='english')
# Tokenise every overview; rows that cannot be processed get the sentinel 0.
movies_df['overview_processed'] = movies_df['overview'].apply(
    lambda overview: preprocess_overview(overview, nlp, stemmer)
)

In [150]:
# preprocess_overview marks unusable overviews with the sentinel 0; keep the rest.
has_tokens = movies_df['overview_processed'].ne(0)
movies_df = movies_df.loc[has_tokens]

In [151]:
def tf_idf_transf(df, column_name, max_df, min_df):
    """Fit a TF-IDF vectorizer on a column of token lists.

    Each cell of ``df[column_name]`` is joined with single spaces into one
    document string; ``max_df`` and ``min_df`` are forwarded unchanged to
    ``TfidfVectorizer``. Returns the fitted-and-transformed matrix.
    """
    # The vectorizer expects one string per document, not a list of tokens.
    documents = df[column_name].apply(" ".join)
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
    return vectorizer.fit_transform(documents)
# TF-IDF features of the overview tokens, with max_df=0.1 and min_df=5
# forwarded to the vectorizer.
keywords_processed = tf_idf_transf(
    movies_df,
    'overview_processed',
    max_df=0.1,
    min_df=5,
)

In [162]:
from sklearn.feature_extraction.text import CountVectorizer
def ohe_transf(df, column_name, min_df):
    """Binary (multi-hot) encode a column of labels or lists of labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    column_name : str
        Column whose cells are either a single value (e.g. a decade or a
        director name) or a list of values (e.g. genres, actors).
    min_df : int or float
        Forwarded to ``CountVectorizer``: labels occurring in fewer rows
        than this threshold are dropped from the vocabulary.

    Returns
    -------
    scipy sparse matrix
        One row per row of ``df`` and one binary column per kept label.
        The kept labels are printed as a side effect.
    """
    def _cell_labels(value):
        # Treat each list element (or the scalar itself) as ONE label.
        # Stringifying the whole list and re-tokenising it would split
        # labels on punctuation, e.g. 'Ann-Margret' -> 'ann' + 'margret'.
        items = value if isinstance(value, (list, tuple, set)) else [value]
        labels = (str(item).lower() for item in items)
        return [label for label in labels if label]

    ohe = CountVectorizer(analyzer=_cell_labels, min_df=min_df, binary=True)
    ohe_matrix = ohe.fit_transform(df[column_name])
    print(ohe.get_feature_names_out())
    return ohe_matrix
# Binary-encode the categorical features. Genres and decades keep every value
# (min_df=1); actors and directors use a higher cut-off (min_df=3).
genres_processed = ohe_transf(movies_df, 'genres', min_df=1)
actors_processed = ohe_transf(movies_df, 'actors', min_df=3)
director_processed = ohe_transf(movies_df, 'director', min_df=3)
decade_processed = ohe_transf(movies_df, 'decade', min_df=1)

['action' 'adventure' 'animation' 'comedy' 'crime' 'documentary' 'drama'
 'family' 'fantasy' 'foreign' 'history' 'horror' 'music' 'mystery'
 'romance' 'sciencefiction' 'thriller' 'tvmovie' 'war' 'western']
['50cent' 'aakekalliala' 'aamirkhan' ... '松田龙平' '松隆子' '莫玛']
['aaronkatz' 'aaronnorris' 'aaronwoodley' ... 'óscaraibar' 'özcandeniz'
 'šarūnasbartas']
['1870' '1880' '1890' '1900' '1910' '1920' '1930' '1940' '1950' '1960'
 '1970' '1980' '1990' '2000' '2010']


In [163]:
from scipy import sparse
# Concatenate all feature matrices column-wise into a single sparse matrix
# (one row per movie); the block order fixes the column layout.
feature_blocks = [
    actors_processed,
    decade_processed,
    director_processed,
    genres_processed,
    keywords_processed,
]
final_df = sparse.hstack(feature_blocks)

In [178]:
# Persist the combined feature matrix to disk in .npz format.
sparse.save_npz(file="final_df.npz", matrix=final_df)