In [None]:
# Imports

import ast
import nltk
import sklearn
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#pd.options.mode.chained_assignement = None

## Carregando e Compreendendo os Dados

In [None]:
df_filmes = pd.read_csv('/content/dataset_filmes.csv')

In [None]:
df_filmes.head()

In [None]:
df_elenco = pd.read_csv('/content/dataset_elenco.csv')

In [None]:
df_filmes.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [None]:
df_elenco.shape

(4803, 4)

## Organização dos Dados de Filmes e Elenco

In [None]:
# Join on the movie identifier as well as the title. Titles are not unique, so
# joining on 'title' alone pairs each film with every same-titled credits row
# (4803 credit rows turned into 4809 merged rows).
# NOTE(review): assumes df_filmes['id'] and df_elenco['movie_id'] hold the same
# identifier for a given film — confirm against the datasets.
df_filmes_elenco = df_filmes.merge(df_elenco,
                                   left_on=['id', 'title'],
                                   right_on=['movie_id', 'title'])

In [None]:
df_filmes_elenco.shape

(4809, 23)

In [None]:
df_filmes_elenco.head()

In [None]:
df_filmes_elenco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

Algumas colunas são irrelevantes para o sistema. Sendo assim, serão mantidas somente as variáveis que podem indicar similaridade entre filmes. Obs.: movie_id é mantida apenas para identificação.

In [None]:
# Keep only the identifier and the columns used to compare films.
# .copy() yields an independent frame, so the later in-place edits and column
# assignments on it do not trigger pandas' SettingWithCopyWarning.
df_filmes_elenco = df_filmes_elenco[['movie_id', 'title', 'overview',
                                     'genres', 'keywords', 'cast','crew',
                                     'production_companies']].copy()

In [None]:
df_filmes_elenco.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,production_companies
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""name"": ""Ingenious Film Partners"", ""id"": 289..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]"


## Limpeza dos Dados

In [None]:
# Checagem de valores ausentes
df_filmes_elenco.isnull().sum()

movie_id                0
title                   0
overview                3
genres                  0
keywords                0
cast                    0
crew                    0
production_companies    0
dtype: int64

In [None]:
# Drop rows with missing values and renumber the index 0..n-1.
# The reset matters: the similarity matrix built later is addressed by row
# position, while sistema_recomendacao looks films up by index label. With gaps
# left by dropna the two disagree (labels go up to 4808 for only 4806 rows).
# Reassigning instead of inplace=True also keeps the cell safe to re-run.
df_filmes_elenco = df_filmes_elenco.dropna().reset_index(drop=True)

In [None]:
# Checagem de valores duplicados
df_filmes_elenco.duplicated().sum()

0

## Processamento de Texto

In [None]:
def converter(obj):
  """Parse a string holding a Python-literal list of dicts and return the
  'name' value of each dict, in order."""
  return [item['name'] for item in ast.literal_eval(obj)]

In [None]:
# Sanity check with a string shaped like the values in the movies dataframe.
teste = converter('[{"id":28, "name":"Action"}, {"id":12, "name":"Adventure"}, \
                    {"id":14, "name": "Fantasy"}, {"id":878, "name":"Sciense Fiction"}]')
# Assert the result so the check actually fails if converter regresses.
assert teste == ['Action', 'Adventure', 'Fantasy', 'Sciense Fiction']

In [None]:
# Apply converter to the genres and keywords columns, replacing each
# stringified list of dicts with a plain list of names.
df_filmes_elenco['genres'] = df_filmes_elenco['genres'].apply(converter)
df_filmes_elenco['keywords'] = df_filmes_elenco['keywords'].apply(converter)

Com relação à coluna cast, vamos selecionar no máximo 3 atores por filme, já que existem muitos em cada um. A coluna crew é tratada separadamente mais adiante.

In [None]:
def extrair_nomes(obj, limite=3):
  """Return the 'name' of the first `limite` entries of a stringified list.

  Args:
    obj: String containing a Python-literal list of dicts, each with a 'name'
      key. It is parsed with ast.literal_eval.
    limite: Maximum number of names to return. Defaults to 3, the value that
      used to be hard-coded. None returns every name. Expected to be
      non-negative.

  Returns:
    A list with at most `limite` names, in their original order.
  """
  # Slicing replaces the manual counter; entries past the limit are never read.
  return [item['name'] for item in ast.literal_eval(obj)[:limite]]

In [None]:
df_filmes_elenco['cast'] = df_filmes_elenco['cast'].apply(extrair_nomes)

No caso da coluna crew, o interessante é extrair somente o diretor

In [None]:
def get_director(obj):
  """Return a one-element list with the name of the first entry whose 'job'
  is 'Director', or an empty list when there is none.

  `obj` is a string containing a Python-literal list of dicts.
  """
  for membro in ast.literal_eval(obj):
    if membro['job'] != 'Director':
      continue
    # Only the first director is kept; later entries are not inspected.
    return [membro['name']]
  return []

In [None]:
df_filmes_elenco['crew'] = df_filmes_elenco['crew'].apply(get_director)

Para a coluna 'production_companies', serão escolhidas as 3 primeiras

In [None]:
df_filmes_elenco['production_companies'] = df_filmes_elenco['production_companies'].apply(extrair_nomes)

## Limpeza dos Dados de Texto

In [None]:
# Separamos a string da coluna overview por espaço em branco
df_filmes_elenco['overview'] = df_filmes_elenco['overview'].apply(lambda x:x.split())

In [None]:
df_filmes_elenco.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,production_companies
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[Ingenious Film Partners, Twentieth Century Fo..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[Walt Disney Pictures, Jerry Bruckheimer Films..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[Columbia Pictures, Danjaq, B24]"
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[Legendary Pictures, Warner Bros., DC Entertai..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],[Walt Disney Pictures]


In [None]:
# Remove the spaces inside every entry of the list-valued columns so that a
# multi-word name becomes a single token (e.g. "Science Fiction" -> "ScienceFiction").
for coluna in ['genres', 'keywords', 'cast', 'crew', 'production_companies']:
  df_filmes_elenco[coluna] = df_filmes_elenco[coluna].apply(
      lambda nomes: [nome.replace(" ", "") for nome in nomes])

In [None]:
df_filmes_elenco['production_companies']

0       [IngeniousFilmPartners, TwentiethCenturyFoxFil...
1       [WaltDisneyPictures, JerryBruckheimerFilms, Se...
2                         [ColumbiaPictures, Danjaq, B24]
3       [LegendaryPictures, WarnerBros., DCEntertainment]
4                                    [WaltDisneyPictures]
                              ...                        
4804                                   [ColumbiaPictures]
4805                                                   []
4806    [FrontStreetPictures, MuseEntertainmentEnterpr...
4807                                                   []
4808             [rustybearentertainment, luckycrowfilms]
Name: production_companies, Length: 4806, dtype: object

## Preparando os dados para vetorização em outro espaço vetorial

In [None]:
# Build the 'tags' column: for each row, the token lists from overview, genres,
# keywords, cast, crew and production_companies are concatenated into one list.
# NOTE(review): the original comment called this dimensionality reduction; the
# `+` below only joins the lists, it does not reduce any dimensions.
df_filmes_elenco['tags'] = df_filmes_elenco['overview'] + \
                           df_filmes_elenco['genres'] + \
                           df_filmes_elenco['keywords'] + \
                           df_filmes_elenco['cast'] + \
                           df_filmes_elenco['crew'] + \
                           df_filmes_elenco['production_companies']


In [None]:
# .copy() makes this an independent frame. Assigning to 'tags' on the bare
# column selection is what raises pandas' SettingWithCopyWarning further down.
df_filmes_elenco_final = df_filmes_elenco[['movie_id', 'title', 'tags']].copy()

In [None]:
df_filmes_elenco_final.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [None]:
df_filmes_elenco_final['tags'] = df_filmes_elenco_final['tags'].apply(lambda x: " ".join(x).lower())

In [None]:
df_filmes_elenco_final.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


## Parse e Vetorização

In [None]:
# The Porter stemmer is used to simplify the tokens: inflected forms of a word
# (the original note's example: "correr", "correu", "correria") carry
# essentially the same meaning, so they are reduced to a common stem.
# NOTE(review): PorterStemmer implements English stemming rules — the example
# above is illustrative only.
parser_ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem every whitespace-separated token of `text` with parser_ps and
  return the stems joined by single spaces."""
  return " ".join(parser_ps.stem(palavra) for palavra in text.split())

In [None]:
df_filmes_elenco_final['tags'] = df_filmes_elenco_final['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filmes_elenco_final['tags'] = df_filmes_elenco_final['tags'].apply(stem)


In [None]:
# Cria o vetorizador com no máximo 5000 atributos
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [None]:
vectors = cv.fit_transform(df_filmes_elenco_final['tags']).toarray()

In [None]:
len(cv.get_feature_names_out())

5000

In [None]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Para visualizar todas as colunas do array
np.set_printoptions(threshold = np.inf)

In [None]:
vectors[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Cálculo de Distância dos Vetores

Como medida de similaridade, será utilizada a similaridade do cosseno: quanto maior o valor, menor a distância entre os vetores.

In [None]:
vectors.shape

(4806, 5000)

In [None]:
# não precisa imprimir esse objeto, ele é muito grande e pode travar o ambiente
similaridades = cosine_similarity(vectors)

## Construindo o Sistema de Recomendação

In [None]:
# Função para o sistema de recomendação

def sistema_recomendacao(movie, top_n=5):
  """Print the titles of the films most similar to `movie`.

  Args:
    movie: Exact title to look up in df_filmes_elenco_final['title']. When
      several rows share the title, the first one is used.
    top_n: How many recommendations to print. Defaults to 5, the value that
      used to be hard-coded. Expected to be non-negative.

  Raises:
    IndexError: If no row has the given title.
  """
  titulos = df_filmes_elenco_final['title']

  # Locate the film by row POSITION, not by index label. `similaridades` and
  # `.iloc` are positional, and labels stop matching positions as soon as rows
  # are dropped without resetting the index.
  posicoes = [pos for pos, titulo in enumerate(titulos) if titulo == movie]
  if not posicoes:
    raise IndexError(f"Filme não encontrado: {movie!r}")
  alvo = posicoes[0]

  # Rank every other film from most to least similar. The queried film is
  # excluded explicitly rather than assumed to be the first item after sorting.
  candidatos = [(pos, score)
                for pos, score in enumerate(similaridades[alvo])
                if pos != alvo]
  candidatos.sort(key=lambda par: par[1], reverse=True)

  for pos, _ in candidatos[:top_n]:
    print(titulos.iloc[pos])

## Aplicando o Sistema de Recomendação

In [None]:
sistema_recomendacao('Jurassic World')

Jurassic Park
The Lost World: Jurassic Park
Walking With Dinosaurs
Jurassic Park III
Terminator Genisys


In [None]:
### TÍTULOS PARA TESTE
df_filmes_elenco_final.title

Avengers: Age of Ultron

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4804                                 El Mariachi
4805                                   Newlyweds
4806                   Signed, Sealed, Delivered
4807                            Shanghai Calling
4808                           My Date with Drew
Name: title, Length: 4806, dtype: object