# Data Analysis with Python - Netflix Dataset
(dataset from Kaggle)

In [None]:
# Load the Netflix dataset from a CSV file into a DataFrame.
import pandas as pd  # pandas supplies read_csv and the DataFrame type
data = pd.read_csv(filepath_or_buffer=r"filepathname")  # placeholder path: point it at the CSV file
data  # last expression of the cell, so the notebook renders a preview

In [None]:
# Quick structural overview of the dataset.
# NOTE: when these run together as one notebook cell, only info() (which prints) and the
# last bare expression produce output; run the lines one at a time to see each result.
data.head(5)  # first 5 records
data.tail(5)  # last 5 records
data.shape  # (number of rows, number of columns)
data.size  # total number of elements
data.columns  # column labels
data.dtypes  # data type of each column
data.info()  # prints index, columns, dtypes and memory usage in one report


In [None]:
# Task 1: Are there any duplicate records in the dataset? If yes, remove them.
data.loc[data.duplicated(keep='first')]  # rows that repeat an earlier row
data.drop_duplicates(keep='first', inplace=True)  # drop them in place, keeping the first occurrence
data.loc[data.duplicated(keep='first')]  # re-check: should now be empty


In [None]:
# Task 2: Is there any null value present in any column? Show with a heat-map.
data.head(5)  # look at the data
data.isna()  # boolean frame: True marks a null value
data.isna().sum()  # number of null values in each column

In [None]:
import seaborn as sns  # plotting library used for the heat-map
sns.heatmap(data=data.isna())  # heat-map of the null mask: shows where values are missing


In [None]:
# Q.1. For "House of Cards", what is the show id and who is the director of this show?
data.loc[data['Title'].isin(['House of Cards'])]  # exact match: rows whose Title is in the given list


In [None]:
data.loc[data['Title'].str.contains('House of Cards')]  # substring match: rows whose Title contains the string

In [None]:
# Q.2. In which year was the highest number of TV shows & movies released? Show with a bar graph.
data.dtypes
data['Date_N'] = pd.to_datetime(data['Release_Date'])  # parsed version of Release_Date
data.head(5)
data.dtypes  # confirm Date_N is now a datetime column
data['Date_N'].dt.year.value_counts()  # number of occurrences of each individual year
data['Date_N'].dt.year.value_counts().plot.bar()  # the same counts drawn as a bar chart


In [None]:
# Q.3. How many movies & TV shows are in the dataset? Show with a bar graph.
data.head(2)  # first 2 rows
data.groupby('Category').Category.count()  # number of records per category
# Name the column with x= instead of passing the Series positionally: recent seaborn
# releases interpret the first positional argument of countplot as `data`, not as the
# variable to count.
sns.countplot(x='Category', data=data)  # bar chart of the record count for each category

In [None]:
# Q.4. Show all the movies that were released in 2000 & 2020.
# First derive a Year column from the parsed date.
data.head(n=2)
data['Year'] = data.Date_N.dt.year  # keep only the year component of Date_N
data.head(n=2)

In [None]:
# Filter on two conditions at once: Category is 'Movie' and Year matches.
data.loc[data['Category'].eq('Movie') & data['Year'].eq(2000)]
data.loc[data['Category'].eq('Movie') & data['Year'].eq(2020)]

In [None]:
# Q.5. Show only the titles of all TV shows that were released in India only.
data.head(n=2)
data.loc[data['Category'].eq('TV Show') & data['Country'].eq('India'), 'Title']  # keep just the Title column of the matching rows

In [None]:
# Q.6. Show the top 10 directors who gave the highest number of TV shows & movies to Netflix.
data['Director'].value_counts().iloc[:10]  # counts are sorted descending, so the first 10 are the top 10

In [None]:
# Q.7. Show all records where 'Category' is movie and 'Type' is comedies or 'Country' is United Kingdom.
# Combining conditions with the And (&) / Or (|) operators.
data.head(n=2)
data.loc[data['Category'].eq('Movie') & data['Type'].eq('Comedies')]
# `&` binds tighter than `|`, so this selects (Movie AND Comedies) OR United Kingdom.
data.loc[(data['Category'].eq('Movie') & data['Type'].eq('Comedies')) | data['Country'].eq('United Kingdom')]

In [None]:
# Q.8. In how many movies/shows was Tom Cruise cast?
# Attempt 1: exact equality.
data.head(n=2)
data.loc[data['Cast'].eq('Tom Cruise')]  # only matches when the whole Cast value is 'Tom Cruise'; expected to return nothing since the column holds multiple names


In [None]:
# Attempt 2: substring search with str.contains.
# Per the original note, this is expected to fail while 'Cast' still contains null values:
# str.contains yields a missing value (not True/False) for those rows, so the result
# cannot be used as a boolean mask.
# (Passing na=False to str.contains would treat null entries as non-matches instead.)
data[data['Cast'].str.contains('Tom Cruise')] # expected to fail here because of the null values in the column


In [None]:
# Create a new data-frame without missing values.
# dropna() removes every row that has a null in ANY column, so data_new may contain
# noticeably fewer rows than data. The explicit .copy() makes data_new an independent
# frame, so adding columns to it later does not raise pandas' SettingWithCopyWarning.
data_new = data.dropna().copy()
data_new.head(2)

data_new[data_new['Cast'].str.contains('Tom Cruise')]  # works now: no nulls left in Cast, so the mask is purely boolean

In [None]:
# Q.9. What are the different ratings defined by Netflix?
data_new.head(n=2)
data_new.Rating.nunique()  # how many distinct ratings there are
data_new.Rating.unique()  # the distinct rating values themselves

In [None]:
# Q.9.1. How many movies got the 'TV-14' rating in Canada?
data_new.head(n=2)
data_new.loc[data_new['Category'].eq('Movie') & data_new['Rating'].eq('TV-14') & data_new['Country'].eq('Canada')].shape  # first entry of the tuple is the number of matching rows

In [None]:
# Q.9.2. How many TV shows got the 'R' rating after 2018?
data_new.loc[data_new['Category'].eq('TV Show') & data_new['Rating'].eq('R') & data_new['Year'].gt(2018)]  # strictly later than 2018

In [None]:
# Q.10. What is the maximum duration of a movie/show on Netflix?
data_new.head(n=2)
data_new['Duration'].unique()  # inspect the distinct Duration values
data_new['Duration'].dtypes  # and the column's data type

In [None]:
# Split each Duration string on the space character into two new columns.
data_new.head(n=2)
data_new[['Minutes', 'Unit']] = data_new.Duration.str.split(pat=' ', expand=True)  # expand=True returns the pieces as separate columns
data_new.head(n=2)

In [None]:
# The column produced by the split is named 'Minutes' and holds strings (object dtype).
# Convert it to numbers before aggregating so max/min compare numerically instead of
# lexicographically. errors='coerce' turns any non-numeric entry into NaN, which
# max()/min() skip.
# NOTE: the values cover whatever units appear in the 'Unit' column; filter on 'Unit'
# first if only one kind of duration should be compared.
pd.to_numeric(data_new['Minutes'], errors='coerce').max()
pd.to_numeric(data_new['Minutes'], errors='coerce').min()

In [None]:
# Q.11. Which individual country has the highest no. of TV shows?
data_new.head(2)

# create a new dataframe which has only TV shows
data_tvshow = data_new[data_new['Category'] == 'TV Show']
data_tvshow.head(2)
data_tvshow.Country.value_counts().head(1)  # value_counts sorts descending, so the first entry is the top country

In [None]:
# Q.12. How can we sort the dataset by year?
data_new.sort_values('Year', ascending=False).head(5)  # most recent year first; show the top 5 rows

In [None]:
# Q.13. Find all instances where:
# Category is 'Movie' and Type is 'Dramas'
data_new.loc[data_new['Category'].eq('Movie') & data_new['Type'].eq('Dramas')].head(n=2)

In [None]:
# Category is 'TV Show' and Type is 'Kids' TV'
data_new.loc[data_new['Category'].eq('TV Show') & data_new['Type'].eq("Kids' TV")].head(n=2)

In [None]:
# Use the Or operator to combine multiple conditions.
# `&` binds tighter than `|`, so this is (Movie AND Dramas) OR (TV Show AND Kids' TV).
data_new.loc[(data_new['Category'].eq('Movie') & data_new['Type'].eq('Dramas')) | (data_new['Category'].eq('TV Show') & data_new['Type'].eq("Kids' TV"))]