In [148]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [149]:
# Load the IMDB dataset from the CSV file (comma-separated, which is pandas' default delimiter).
dataset = pd.read_csv("desafio_indicium_imdb.csv")

In [150]:
# Dimensions of the freshly loaded frame as (rows, columns).
dataset.shape

(999, 16)

In [151]:
# Column names, non-null counts and dtypes of the raw data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     999 non-null    int64  
 1   Series_Title   999 non-null    object 
 2   Released_Year  999 non-null    object 
 3   Certificate    898 non-null    object 
 4   Runtime        999 non-null    object 
 5   Genre          999 non-null    object 
 6   IMDB_Rating    999 non-null    float64
 7   Overview       999 non-null    object 
 8   Meta_score     842 non-null    float64
 9   Director       999 non-null    object 
 10  Star1          999 non-null    object 
 11  Star2          999 non-null    object 
 12  Star3          999 non-null    object 
 13  Star4          999 non-null    object 
 14  No_of_Votes    999 non-null    int64  
 15  Gross          830 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 125.0+ KB


In [152]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0.1,Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
1,2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
3,4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
4,5,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905


In [153]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
dataset.describe()

Unnamed: 0.1,Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes
count,999.0,999.0,842.0,999.0
mean,500.0,7.947948,77.969121,271621.4
std,288.530761,0.27229,12.383257,320912.6
min,1.0,7.6,28.0,25088.0
25%,250.5,7.7,70.0,55471.5
50%,500.0,7.9,79.0,138356.0
75%,749.5,8.1,87.0,373167.5
max,999.0,9.2,100.0,2303232.0


In [154]:
# Number of missing values in each column.
dataset.isnull().sum()

Unnamed: 0         0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [155]:
# Missing-value count per column, ordered from the most to the fewest nulls.
total = (
    dataset
    .isna()
    .sum()
    .sort_values(ascending=False)
)
total.head()

Gross          169
Meta_score     157
Certificate    101
Unnamed: 0       0
Runtime          0
dtype: int64

In [156]:
# What percentage of the rows is missing in each column?
#
# The percentage is derived from `total` (the descending null counts computed in
# the previous cell) instead of counting and sorting the nulls a second time.
# Both columns of the summary therefore share exactly the same index order, so
# the DataFrame constructor never has to re-align the rows -- which could in
# principle happen if two independent sorts ordered the many tied (zero-null)
# columns differently.
percent = total / len(dataset) * 100
percent_df = pd.DataFrame({'Total': total, 'Porcentagem': percent})
percent_df.head()

Unnamed: 0,Total,Porcentagem
Gross,169,16.916917
Meta_score,157,15.715716
Certificate,101,10.11011
Unnamed: 0,0,0.0
Runtime,0,0.0


In [157]:
# Remove every row that contains at least one missing value (this mutates
# `dataset` in place), then verify that no nulls are left in any column.
dataset.dropna(how="any", inplace=True)
dataset.isna().sum()

Unnamed: 0       0
Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

In [158]:
# Dimensions after the rows with missing values were removed.
dataset.shape

(713, 16)

In [159]:
# Count the rows whose title repeats the title of an earlier row.
dataset["Series_Title"].duplicated().sum()

np.int64(0)

In [160]:
# Questions

## 1 - How does gross revenue compare with IMDB rating, age certificate and runtime?

# Remove the commas (presumably thousands separators) from the Gross strings so
# that they can be parsed as numbers in the next step.
dataset["Gross"] = dataset["Gross"].str.replace(",", "", regex=False)
dataset["Gross"]

0      134966411
1      534858444
2       57300000
3        4360000
4      377845905
         ...    
989       696690
990      1378435
991    141843612
993     13780024
996     30500000
Name: Gross, Length: 713, dtype: object

In [161]:
# Parse Gross as numbers; errors='coerce' turns any value that cannot be parsed
# into NaN instead of raising.
dataset['Gross'] = pd.to_numeric(dataset['Gross'], errors='coerce')

In [162]:
# Drop the "min" unit from Runtime and parse what is left as a number; values
# that cannot be parsed become NaN (errors="coerce").
dataset["Runtime"] = pd.to_numeric(
    dataset["Runtime"].str.replace("min", "", regex=False),
    errors="coerce",
)
dataset["Runtime"]

0      175
1      152
2      202
3       96
4      201
      ... 
989    157
990    144
991     78
993     87
996    118
Name: Runtime, Length: 713, dtype: int64

In [163]:
# Distinct certificates in order of first appearance, plus empty containers for
# the per-certificate averages computed in the next cell.
certificate_list = list(dataset["Certificate"].unique())
certificate_imdb, certificate_gross, certificate_time = [], [], []

In [164]:
# Which age certificate earns the most per film?
#
# Average IMDB rating, gross and runtime for every certificate.
# groupby(sort=False) keeps the certificates in order of first appearance, i.e.
# the same order as `Certificate.unique()`, so the row labels (0, 1, 2, ...) of
# `df_certificate` match the ones the previous loop-based version produced.
#
# Why not the manual loop:
#   * it appended to lists created in another cell, so running this cell twice
#     made them longer than `certificate_list` and the DataFrame could no longer
#     be built;
#   * its `else` branch could only trigger for a NaN certificate, which cannot
#     exist once rows with missing values have been dropped;
#   * `.mean()` skips NaN values (e.g. produced by errors='coerce'), whereas
#     sum(...)/len(...) would turn the whole group's average into NaN.
certificate_means = dataset.groupby("Certificate", sort=False)[
    ["IMDB_Rating", "Gross", "Runtime"]
].mean()

df_certificate = (
    certificate_means
    .rename(columns={"Runtime": "Time"})
    .reset_index()
    .loc[:, ["Gross", "Certificate", "IMDB_Rating", "Time"]]
)

# Keep the plain-list variables populated for any later cell that reads them.
certificate_imdb = df_certificate["IMDB_Rating"].tolist()
certificate_gross = df_certificate["Gross"].tolist()
certificate_time = df_certificate["Time"].tolist()

df_certificate.sort_values(by="Gross", ascending=False).head(30)

Unnamed: 0,Gross,Certificate,IMDB_Rating,Time
1,150095600.0,UA,7.929577,127.711268
2,94624830.0,U,7.959016,121.431694
0,66253980.0,A,7.993642,128.959538
4,55087320.0,G,8.022222,115.0
5,37804000.0,PG-13,7.784211,112.210526
3,26538400.0,R,7.866412,121.091603
10,26020960.0,U/A,7.6,102.0
6,20154800.0,PG,7.905263,111.947368
7,8609353.0,Passed,8.044444,124.666667
8,6949014.0,Approved,8.016667,124.666667


In [165]:
## From the table above, the "UA" certificate (read in the original note as the 12+ age rating -- TODO confirm) has the highest average gross per film, so on average it is the best-selling classification.

In [196]:
# One-hot encode the genres.
#
# Genre holds a comma-separated list per film. Spaces are removed first so that
# "Crime, Drama" yields the labels "Crime" and "Drama" (the same labels the
# previous split/explode version produced). Series.str.get_dummies then builds
# one 0/1 integer column per distinct label while keeping the original row
# index, which replaces the manual split -> explode -> get_dummies ->
# groupby-sum sequence and avoids re-binding `df_encoded` to three different
# kinds of object.
genre_dummies = (
    dataset["Genre"]
    .str.replace(" ", "", regex=False)
    .str.get_dummies(sep=",")
)

# Attach the indicator columns, then drop the raw Genre text and the unnamed
# column that came from the CSV (presumably a saved row number).
df_encoded = pd.concat([dataset, genre_dummies], axis=1).drop(
    columns=["Genre", "Unnamed: 0"]
)
df_encoded.info()

# TODO: collect the distinct actors across Star1..Star4 (an earlier attempt was
# left unfinished as commented-out code).


<class 'pandas.core.frame.DataFrame'>
Index: 713 entries, 0 to 996
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series_Title   713 non-null    object 
 1   Released_Year  713 non-null    object 
 2   Certificate    713 non-null    object 
 3   Runtime        713 non-null    int64  
 4   IMDB_Rating    713 non-null    float64
 5   Overview       713 non-null    object 
 6   Meta_score     713 non-null    float64
 7   Director       713 non-null    object 
 8   Star1          713 non-null    object 
 9   Star2          713 non-null    object 
 10  Star3          713 non-null    object 
 11  Star4          713 non-null    object 
 12  No_of_Votes    713 non-null    int64  
 13  Gross          713 non-null    int64  
 14  Action         713 non-null    int64  
 15  Adventure      713 non-null    int64  
 16  Animation      713 non-null    int64  
 17  Biography      713 non-null    int64  
 18  Comedy         