# Explore Dataset - Homework exercise 1 (Programming task)

In [None]:
import pandas as pd

In [None]:
# Load the Netflix titles CSV into a DataFrame (path is relative to the working directory).
netflix_df = pd.read_csv("./datasets/netflix_titles.csv")

In [None]:
# Preview the first five rows of the dataset.
netflix_df.head(n=5)

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
netflix_df.info()

In [None]:
# (number of rows, number of columns)
netflix_df.shape

In [None]:
import numpy as np

# Summarise only the object (string) columns. The builtin ``object`` is passed
# because the ``np.object`` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24; it was merely another name for the builtin, so the result is the same.
netflix_df.describe(include=object)

### Examine how often specific values occur in release_year column

In [None]:
# Number of titles per release year, most frequent year first.
netflix_df['release_year'].value_counts()

## Querying the dataset

In [None]:
# Keep only the titles released after 2019.
netflix_df.loc[netflix_df['release_year'] > 2019]

### Sorting

In [None]:
# Show the five oldest titles: sort by release year (ascending) and take the top rows.
netflix_df.sort_values('release_year', ascending=True).head(5)

## Transforming Dataset

### Convert durations to integer minutes: "x min" becomes x, and "x Season(s)" becomes an estimated number of minutes

In [None]:
def convert_duration(d, episodes_per_season=12, minutes_per_episode=45):
    """Convert a ``duration`` value such as "90 min" or "2 Seasons" to minutes.

    Args:
        d: Text of the form "<number> <unit>". Non-string values (for example
            NaN/None for a missing duration, or a number produced by an
            earlier run of this conversion) are returned unchanged.
        episodes_per_season: Assumed average number of episodes in a season.
        minutes_per_episode: Assumed average length of one episode in minutes.

    Returns:
        The number as an int when the unit is "min"; otherwise the number
        multiplied by ``episodes_per_season * minutes_per_episode`` as a rough
        estimate of the total running time in minutes.

    Raises:
        IndexError: If the text does not contain both a number and a unit.
        ValueError: If the leading token is not an integer.
    """
    if not isinstance(d, str):
        # Nothing to parse: keep missing values missing (a later dropna() can
        # remove them) and make re-running the conversion harmless.
        return d

    # split() without an argument tolerates repeated or surrounding whitespace.
    parts = d.split()
    amount, unit = parts[0], parts[1]
    if unit == 'min':
        return int(amount)

    # Any other unit is treated as seasons; estimate the season duration.
    return int(amount) * episodes_per_season * minutes_per_episode
        
# Replace every textual duration with its value in minutes, then preview the result.
netflix_df['duration'] = netflix_df['duration'].map(convert_duration)
netflix_df.head(n=5)

In [None]:
# Count the missing values in each column.
netflix_df.isna().sum()

In [None]:
# Remove, in place, every row that has a missing value in any column.
netflix_df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-count the missing values per column to confirm none remain.
netflix_df.isna().sum()

## Convert String Date to Datetime

### Conversion Times

In [None]:
# Timing comparison of pd.to_datetime with and without datetime-format inference.
# The %timeit commands sit inside a string literal, so this cell does not execute them.

'''
%timeit pd.to_datetime(netflix_df['date_added'], infer_datetime_format=True)

%timeit pd.to_datetime(netflix_df['date_added'], infer_datetime_format=False)
'''

![Conversion Times](images/convertion_times.png)

### Convert date to datetime

In [None]:
# Parse the 'date_added' strings into datetime values. The infer_datetime_format
# argument is omitted on purpose: False was already its default, and the argument
# is deprecated since pandas 2.0 (passing it only triggers a warning there).
netflix_df['date_added'] = pd.to_datetime(netflix_df['date_added'])

In [None]:
# Drop, in place, any row that still holds a missing value after the date conversion.
netflix_df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-inspect columns, non-null counts and dtypes after the conversions above.
netflix_df.info()

## Visualizing

In [None]:
# Bar chart of the ten release years with the most titles.
netflix_df["release_year"].value_counts().head(10).plot.bar()

### Year Histogram

In [None]:
# Histogram of release years using 20 bins.
netflix_df.hist(column='release_year', bins=20)

### Duration Histogram

In [None]:
# Histogram of the converted durations (in minutes) using 50 bins.
netflix_df.hist(column='duration', bins=50)

## Top-rated movies on Netflix

In [None]:
# Load only the needed columns from the two IMDb CSV files.
imdb_ratings = pd.read_csv(
    'datasets/IMDb ratings.csv',
    usecols=['weighted_average_vote'],
)
imdb_titles = pd.read_csv(
    'datasets/IMDb movies.csv',
    usecols=['title', 'year', 'genre'],
    dtype=str,
)

# Assemble each movie's title, release year, rating and genre in one frame.
# NOTE(review): the two files are combined through their default row index, which
# presumably relies on both CSVs listing the same movies in the same order - confirm.
ratings = pd.DataFrame({
    'Title': imdb_titles['title'],
    'Release Year': imdb_titles['year'],
    'Rating': imdb_ratings['weighted_average_vote'],
    'Genre': imdb_titles['genre'],
})

# Drop, in place, rows that repeat the same title, release year and rating.
ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'], inplace=True)
ratings.shape

In [None]:
# Summary of the IMDb titles frame (columns, non-null counts, dtypes).
imdb_titles.info()

In [None]:
# Summary of the IMDb ratings frame (columns, non-null counts, dtypes).
imdb_ratings.info()

In [None]:
# Discard, in place, IMDb rows that have any missing value.
ratings.dropna(inplace=True)

# Inner-join the IMDb ratings with the Netflix titles on the movie title,
# then order the result from the highest rating to the lowest.
joint_data = pd.merge(
    ratings,
    netflix_df,
    how='inner',
    left_on='Title',
    right_on='title',
).sort_values(by='Rating', ascending=False)

In [None]:
# Summary of the merged IMDb/Netflix frame (columns, non-null counts, dtypes).
joint_data.info()

In [None]:
# Plotly Express provides the interactive charts used from here on.
import plotly.express as px

# joint_data is sorted by rating (descending), so its first ten rows are the
# ten best-rated titles; the result is a DataFrame.
top_rated = joint_data.head(10)

# Sunburst chart: title on the inner ring, country on the outer ring,
# with both segment size and colour driven by the rating.
fig = px.sunburst(
    data_frame=top_rated,
    path=['title', 'country'],
    values='Rating',
    color='Rating',
)
fig.show()

## Top Movie Makers

In [None]:
# Count how many merged titles carry each 'country' value. value_counts() already
# returns the counts in descending order, so no additional sort is needed.
country_count = joint_data['country'].value_counts()

# Store the counts in a column named 'country' with an unnamed index on every
# pandas version. pandas >= 2.0 names the value_counts() result 'count' (and the
# index 'country'), which would make the later topcountries['country'] lookups
# raise KeyError.
country_count = country_count.rename('country').rename_axis(None).to_frame()

# Keep the 11 most frequent entries.
topcountries = country_count.head(11)
topcountries

In [None]:
# Country labels of the top entries, as the keys view of an {index: count} dict.
topcountries['country'].to_dict().keys()

In [None]:
# Title counts of the top entries as a plain Python list.
topcountries['country'].tolist()

In [None]:
# Gather the counts and their country labels in the column-oriented mapping
# that the funnel chart reads from.
data = {
    'number': topcountries['country'].values.tolist(),
    'country': topcountries.to_dict()['country'].keys(),
}

# Funnel chart: one stage per country, sized by its number of titles.
fig = px.funnel(data_frame=data, x='number', y='country')
fig.show()