## Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## File

In [2]:
url = 'https://raw.githubusercontent.com/albvieiraa/EDA-Streamings/refs/heads/main/datasets/amazon_prime_titles.csv'

In [3]:
df_amazon = pd.read_csv(url)
df_amazon.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [4]:
df_amazon.tail(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
9666,s9667,TV Show,Maradona: Blessed Dream,,"Esteban Recagno, Ezequiel Stremiz, Luciano Vit...",,,2021,TV-MA,1 Season,"Drama, Sports","The series tells the story of Diego Maradona, ..."
9667,s9668,Movie,Harry Brown,Daniel Barber,"Michael Caine, Emily Mortimer, Joseph Gilgun, ...",,,2010,R,103 min,"Action, Drama, Suspense","Harry Brown, starring two-time Academy Award w..."


In [None]:
df_amazon.shape

(9668, 12)

In [None]:
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7585 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
dtypes: int64(1), object(11)
memory usage: 906.5+ KB


In [None]:
df_amazon.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2083
cast,1233
country,8996
date_added,9513
release_year,0
rating,337
duration,0


In [None]:
# categorias do type
df_amazon['type'].unique()

array(['Movie', 'TV Show'], dtype=object)

## Tratamento dos dados

In [5]:
# Criando uma cópia
df_amazon_tratando = df_amazon.copy()

In [6]:
df_amazon_tratando = df_amazon_tratando.rename(columns={'listed_in': 'gender'})

#### Movies

In [7]:
df_amazon_movies = df_amazon_tratando[df_amazon_tratando['type'] == 'Movie']

In [8]:
df_amazon_movies.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,gender,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...


In [9]:
df_amazon_movies['duration'] = df_amazon_movies['duration'].str.replace(' min', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_movies['duration'] = df_amazon_movies['duration'].str.replace(' min', '')


In [10]:
# Transformar str em float
df_amazon_movies['duration']= df_amazon_movies['duration'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_movies['duration']= df_amazon_movies['duration'].astype(float)


In [11]:
df_amazon_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7814 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       7814 non-null   object 
 1   type          7814 non-null   object 
 2   title         7814 non-null   object 
 3   director      7585 non-null   object 
 4   cast          7050 non-null   object 
 5   country       569 non-null    object 
 6   date_added    16 non-null     object 
 7   release_year  7814 non-null   int64  
 8   rating        7483 non-null   object 
 9   duration      7814 non-null   float64
 10  gender        7814 non-null   object 
 11  description   7814 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 1.0+ MB


#### TV Show

In [12]:
df_amazon_series = df_amazon_tratando[df_amazon_tratando['type'] == 'TV Show']

In [13]:
df_amazon_series.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1854 entries, 17 to 9666
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1854 non-null   object
 1   type          1854 non-null   object
 2   title         1854 non-null   object
 3   director      0 non-null      object
 4   cast          1385 non-null   object
 5   country       103 non-null    object
 6   date_added    139 non-null    object
 7   release_year  1854 non-null   int64 
 8   rating        1848 non-null   object
 9   duration      1854 non-null   object
 10  gender        1854 non-null   object
 11  description   1854 non-null   object
dtypes: int64(1), object(11)
memory usage: 188.3+ KB


In [14]:
df_amazon_series.describe()

Unnamed: 0,release_year
count,1854.0
mean,2014.545307
std,9.440291
min,1932.0
25%,2013.0
50%,2017.0
75%,2020.0
max,2021.0


In [15]:
df_amazon_series['date_added'] = pd.to_datetime(df_amazon_series['date_added'], errors='coerce') #first

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_series['date_added'] = pd.to_datetime(df_amazon_series['date_added'], errors='coerce') #first


In [16]:
df_amazon_series['date_added'] = df_amazon_series['date_added'].dt.date # second

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_series['date_added'] = df_amazon_series['date_added'].dt.date # second


In [17]:
df_amazon_series['duration'] = df_amazon_series['duration'].str.replace(' Seasons', '')
df_amazon_series['duration'] = df_amazon_series['duration'].str.replace(' Season', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_series['duration'] = df_amazon_series['duration'].str.replace(' Seasons', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_series['duration'] = df_amazon_series['duration'].str.replace(' Season', '')


In [18]:
# transformar coluna duration em float
df_amazon_series['duration'] = df_amazon_series['duration'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_series['duration'] = df_amazon_series['duration'].astype(float)


In [19]:
df_amazon_series.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,gender,description
17,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,,NaT,2008,ALL,1.0,"Kids, Special Interest",A heart warming and inspiring series that welc...
18,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,,NaT,2020,18+,1.0,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...
