# Importação das bibliotecas

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look up
# the 'punkt_tab' resource instead of the pickled 'punkt' one, so both are
# fetched to keep the notebook runnable on old and new NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [3]:
from scipy.stats import chi2_contingency
import pingouin as pg

In [4]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from yellowbrick.classifier import ConfusionMatrix
import optuna
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Importação do banco de dados

In [5]:
# Load the IMDB dataset from its CSV file and preview the first rows
csv_path = "../Dataset/desafio_indicium_imdb.csv"
db_imdb = pd.read_csv(csv_path)
db_imdb.head(5)

Unnamed: 0.1,Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
1,2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
3,4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
4,5,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905


# Análise Exploratória

**Atributos**

- *Series_Title:* Nome do filme
- *Released_Year:* Ano de lançamento
- *Certificate:* Classificação etária
- *Runtime:* Tempo de duração
- *Genre:* Gênero
- *Overview:* Overview do filme
- *Meta_score:* Média ponderada de todas as críticas
- *Director:* Diretor
- *Star1:* Ator/atriz #1
- *Star2:* Ator/atriz #2
- *Star3:* Ator/atriz #3
- *Star4:* Ator/atriz #4
- *No_of_Votes:* Número de votos
- *Gross:* Faturamento
- *IMDB_Rating:* Nota do IMDB


In [6]:
db_imdb.info() # Per-column summary of the dataset: non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     999 non-null    int64  
 1   Series_Title   999 non-null    object 
 2   Released_Year  999 non-null    object 
 3   Certificate    898 non-null    object 
 4   Runtime        999 non-null    object 
 5   Genre          999 non-null    object 
 6   IMDB_Rating    999 non-null    float64
 7   Overview       999 non-null    object 
 8   Meta_score     842 non-null    float64
 9   Director       999 non-null    object 
 10  Star1          999 non-null    object 
 11  Star2          999 non-null    object 
 12  Star3          999 non-null    object 
 13  Star4          999 non-null    object 
 14  No_of_Votes    999 non-null    int64  
 15  Gross          830 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 125.0+ KB


In [7]:
# Count the missing values in each column
db_imdb.isna().sum()

Unnamed: 0         0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [8]:
db_imdb.duplicated().sum() # Number of rows that are exact duplicates of an earlier row

0

In [9]:
# Remove the leftover index column that was stored in the CSV
db_imdb.drop("Unnamed: 0", axis=1, inplace=True)

# Tratamento dos valores nulos

In [10]:
# Discard every row that has a missing value in any column
db_imdb.dropna(axis=0, how="any", inplace=True)

In [11]:
db_imdb.shape # (rows, columns) left after dropping the rows with missing values

(713, 15)

# Tratamento dos dados

### Eliminando a unidade de tempo da coluna 'Runtime'

In [12]:
# Strip the " min" unit from the runtime strings and store the result as an
# integer column
runtime_minutes = db_imdb['Runtime'].str.replace(" min", "")
db_imdb['Runtime (min)'] = runtime_minutes.astype("int64")

# The original 'object' column (which still carries the unit) is no longer needed
db_imdb.drop(columns=["Runtime"], inplace=True)

db_imdb['Runtime (min)']

0      175
1      152
2      202
3       96
4      201
      ... 
989    157
990    144
991     78
993     87
996    118
Name: Runtime (min), Length: 713, dtype: int64

### Separando as categorias de gênero de filme

In [14]:
# Normalise the genre strings so they are easier to manipulate: remove the
# comma separators, then lowercase everything
db_imdb['Genre'] = (
    db_imdb['Genre']
    .str.replace(',', '')
    .str.lower()
)
db_imdb['Genre']

0                     crime drama
1              action crime drama
2                     crime drama
3                     crime drama
4          action adventure drama
                  ...            
989             drama war western
990          adventure comedy war
991    animation adventure family
993          comedy music musical
996             drama romance war
Name: Genre, Length: 713, dtype: object

In [67]:
# Split each normalised genre string into a list of genre tokens
genre_strings = db_imdb['Genre']
db_imdb['TokenGenres'] = genre_strings.apply(word_tokenize)
db_imdb['TokenGenres']

0                      [crime, drama]
1              [action, crime, drama]
2                      [crime, drama]
3                      [crime, drama]
4          [action, adventure, drama]
                    ...              
989             [drama, war, western]
990          [adventure, comedy, war]
991    [animation, adventure, family]
993          [comedy, music, musical]
996             [drama, romance, war]
Name: TokenGenres, Length: 713, dtype: object

In [80]:
# Train a small Word2Vec embedding on the per-film genre token lists.
#
# Training is stochastic: without a fixed seed the vectors (and every feature
# derived from them) change on each run. gensim only guarantees repeatable
# results when the seed is fixed AND training uses a single worker thread;
# across interpreter launches PYTHONHASHSEED must be fixed as well.
W2V_SEED = 42
w2v = Word2Vec(
    sentences=db_imdb['TokenGenres'].tolist(),
    epochs=50,
    min_count=15,   # genres seen fewer than 15 times are left out of the vocabulary
    window=3,
    vector_size=5,
    seed=W2V_SEED,
    workers=1,
)

In [84]:
def _genre_embedding(tokens):
    """Return the sum of the Word2Vec vectors of the in-vocabulary tokens.

    Tokens missing from the model vocabulary (dropped by ``min_count``) are
    skipped. If none of the tokens is known, a zero vector of the model's
    dimensionality is returned instead of the scalar ``0`` that a bare
    ``sum([])`` would produce, so every row has the same length.
    """
    vocab = w2v.wv.key_to_index  # dict lookup instead of scanning the index_to_key list
    known_vectors = [w2v.wv[tok] for tok in tokens if tok in vocab]
    return sum(known_vectors, np.zeros(w2v.vector_size, dtype=np.float32))


# One row per film, one column per embedding dimension (taken from the model
# rather than hard-coded).
# NOTE(review): the frame gets a fresh 0..n-1 index, while db_imdb keeps its
# original row labels after dropna — align the indexes before joining them.
df_genres = pd.DataFrame(
    [_genre_embedding(tokens) for tokens in db_imdb['TokenGenres']],
    columns=[f'GenreVector{i}' for i in range(w2v.vector_size)],
)
df_genres

Unnamed: 0,GenreVector0,GenreVector1,GenreVector2,GenreVector3,GenreVector4
0,-1.048536,1.554834,1.148773,-0.355833,-1.339354
1,-1.383481,2.371378,1.818522,-0.504652,-1.635880
2,-1.048536,1.554834,1.148773,-0.355833,-1.339354
3,-1.048536,1.554834,1.148773,-0.355833,-1.339354
4,-1.740273,2.747455,2.181310,-0.483633,-1.665131
...,...,...,...,...,...
708,-0.963380,1.813725,1.261853,-0.377699,-0.930437
709,-1.191109,2.171023,1.363393,-0.618753,-1.216369
710,-1.531001,1.907079,1.681570,-0.629168,-1.023418
711,-0.517063,1.145736,0.612920,-0.180484,-0.897002


## Extração de tópicos do overview

In [111]:
# TF-IDF vectorizer restricted to 15 features, with English stop words removed
tfidf = TfidfVectorizer(
    max_features=15,
    stop_words='english',
)

In [112]:
# Fit the vectorizer on the film overviews and transform them in one step
overview_texts = db_imdb['Overview']
tfidf_overview = tfidf.fit_transform(overview_texts)

In [114]:
# Densify the TF-IDF matrix into a DataFrame whose columns are the fitted terms
overview_terms = tfidf.get_feature_names_out()
df_overview = pd.DataFrame(data=tfidf_overview.toarray(), columns=overview_terms)
df_overview.head()

Unnamed: 0,boy,family,father,help,life,love,man,new,old,son,story,war,woman,world,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.504175,0.0,0.0,0.433808,0.0,0.0,0.482876,0.0,0.569604,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Codificação dos atributos categóricos

In [116]:
# Encode each categorical column as integer codes.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`,
# so every mapping can later be inverted (inverse_transform) or reused on new
# data. Refitting a single shared encoder would keep only the last column's
# classes.
categorical_columns = ['Certificate', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']
label_encoders = {}
for c in categorical_columns:
    le = LabelEncoder()  # after the loop, `le` still refers to the last column's encoder
    db_imdb[c] = le.fit_transform(db_imdb[c])
    label_encoders[c] = le
db_imdb[['Director', 'Star1', 'Star2', 'Star3', 'Star4']]

Unnamed: 0,Director,Star1,Star2,Star3,Star4
0,100,305,4,237,147
1,59,89,195,0,451
2,100,5,461,496,147
3,338,184,327,382,297
4,286,132,566,227,493
...,...,...,...,...,...
989,335,381,225,510,416
990,44,95,535,147,97
991,392,357,501,368,81
993,302,242,435,193,540
