#### Import required libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#### Load the data

In [2]:
titles = pd.read_csv("Titles dataset/cast.csv")
titles.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,31.0
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


#### General Properties of Data

In [3]:
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3786176 entries, 0 to 3786175
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   title      object 
 1   year       int64  
 2   name       object 
 3   type       object 
 4   character  object 
 5   n          float64
dtypes: float64(1), int64(1), object(4)
memory usage: 173.3+ MB


##### Observation
> From the above statistics, it is clear that the n feature may contain NaN values in the dataset

In [4]:
titles.shape

(3786176, 6)

In [5]:
titles.isna().sum()

title              0
year               0
name               0
type               0
character          0
n            1458573
dtype: int64

##### Observation:<br>
> From the above figures, it is evident that n feature of titles dataset has more than 1.4M NaN values which is 38.33% of total observations in dataset. So we need to do something to decrease this value

In [6]:
titles.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,3786176.0,1988.910769,27.892251,1894.0,1970.0,2001.0,2012.0,2115.0
n,2327603.0,16.905993,31.706789,1.0,5.0,10.0,21.0,33613.0


#### Research Questions

#### Q1. How many movies are listed in the titles dataframe?

In [7]:
movies_count = titles['title'].count()
print('There are {} movies present in titles dataset'.format(movies_count))

There are 3786176 movies present in titles dataset


#### Q2. What are the earliest two films listed in the titles dataframe?

In [8]:
#Load the dataset
names = pd.read_csv("Titles dataset/titles.csv")

#sort the dataframe by year
result = names.sort_values('year', ascending=True)

#print earliest two films released 
print("Earliest two films released are:\n")
print(result.head(2))

Earliest two films released are:

                     title  year
193222          Miss Jerry  1894
51388   The Startled Lover  1898


#### Q3. How many movies have the title "Hamlet"?

In [9]:
#Lets create a df named 'hamlet_title' that contains the data with movie title 'Hamlet'
hamlet_title = titles[titles['title'] == 'Hamlet']

#print the length
print('The are {} movies with title hamlet'.format(hamlet_title.shape[0]))

The are 342 movies with title hamlet


#### Q4. How many movies are titled "North by Northwest"?

In [10]:
#Lets create a df named 'north' that contains the data with movie title 'Hamlet'
north = titles[titles['title'] == 'North by Northwest']

#print the length
print('The are {} movies with title North by Northwest'.format(north.shape[0]))

The are 116 movies with title North by Northwest


#### Q5. When was the first movie titled "Hamlet" made?

In [11]:
#Lets create a small df named new_df that consist of title of movie and year which the movie released
new_df = titles[['title','year']]

#Capturing the result by filtering the movie title 'Hamlet' and 
#sorting according to year and printing 1st value using head function
result = new_df[new_df['title'] == 'Hamlet'].sort_values('year', ascending=True).head(1)
print("The first movie titled Hamlet was made on:\n")
print(result.head(1))

The first movie titled Hamlet was made on:

          title  year
1602320  Hamlet  1910


#### Q6. List all of the "Treasure Island" movies from earliest to most recent.

In [12]:
#Lets create a small df named my_df that consist of title of movie and year which the movie released
my_df = titles[['title','year']]

#Capturing the result by filtering the movie title 'Hamlet' and 
#sorting according to earliest to most recent and printing...
result = my_df[new_df['title'] == 'Treasure Island'].sort_values('year', ascending=True)
print("Below is the list of movie with title Treasure Island from earliest to most recent:\n")
print(result)

Below is the list of movie with title Treasure Island from earliest to most recent:

                   title  year
860800   Treasure Island  1918
871803   Treasure Island  1918
942632   Treasure Island  1918
3450730  Treasure Island  1918
1778878  Treasure Island  1918
...                  ...   ...
2511526  Treasure Island  1999
1698413  Treasure Island  1999
1426457  Treasure Island  1999
3340561  Treasure Island  1999
319551   Treasure Island  1999

[190 rows x 2 columns]


#### Q7. How many movies were made in the year 1950?

In [13]:
#Lets create a small df named required_df that consist of movies released in year 1950
required_df = titles[titles['year'] == 1950]

#printing the result using shape method, value zero return number of rows
print("Number of movies that were made in year 1950 is: ", required_df.shape[0])

Number of movies that were made in year 1950 is:  22397


#### Q8. How many movies were made in the year 1960?

In [14]:
#Lets create a small df named required_df that consist of movies released in year 1960
required_df = titles[titles['year'] == 1960]

#printing the result using shape method, value zero return number of rows
print("Number of movies that were made in year 1960 is: ", required_df.shape[0])

Number of movies that were made in year 1960 is:  19164


#### Q9. How many movies were made from 1950 through 1959?

In [15]:
#Lets create a small df named required_df that consist of movie titles and years released
df_range = titles[['title','year']]

#lets store the dataframe that consists of titles that were made in between 1950 to 1959
result = df_range[df_range['year'].between(1950,1959)]

#sort the result dataframe by year-wise
result2 = result.sort_values('year', ascending=True)

#print the no. of titles by shape method
print("The number of movies that were made between year 1950 to year 1959 is: ",result2.shape[0])

The number of movies that were made between year 1950 to year 1959 is:  215471


#### Q10. In what years has a movie titled "Batman" been released?

In [16]:
#Lets create a small df named required_df that consist of movie titles and years released
h = titles[['title','year']]

#Filter the above dataframe with movie titled 'Batman'
result = h[h['title'] == 'Batman']

#Print the years in which batman got released
print(result['year'])

99564      1943
100080     1943
118792     1989
124426     1989
155945     1989
           ... 
3031640    1989
3402807    1943
3499598    1989
3501652    1989
3513305    1989
Name: year, Length: 123, dtype: int64


#### Q11. How many roles were there in the movie "Inception"?

In [17]:
#Lets create a dataframe with having inception movie
inceptionMovies  = titles[titles['title'] == 'Inception']

#get the characters count using unique() function
allCharacters = inceptionMovies.character.unique()

#Printing the output
print("Total number of characters in movie inception: ", len(allCharacters))

Total number of characters in movie inception:  56


#### Q12. How many roles in the movie "Inception" are NOT ranked by an "n" value?

In [18]:
#Creating a dataframe that is filtered with inception
inceptionMovie = titles[titles['title'] == 'Inception']

#Compute the NaN values which is not ranked category in this above dataabsframe
notRanked = inceptionMovie[inceptionMovie['n'].isnull()]

#print the output
print("The number of roles did not receive a n value in the movie inception are:\n")
print(len(notRanked['character'].unique()))

The number of roles did not receive a n value in the movie inception are:

21


#### Q13. But how many roles in the movie "Inception" did receive an "n" value?

In [19]:
#Creating a dataframe that is filtered with inception
inceptionMovie = titles[titles['title'] == 'Inception']

#Compute the non NAN values which is  ranked category in this above dataframe
Ranked = inceptionMovie[inceptionMovie['n'].notna()]

#print the output
print("The number of roles did receive a n value in the movie inception are:\n")
print(len(Ranked['character'].unique()))

The number of roles did receive a n value in the movie inception are:

36


#### Q14. Display the cast of "North by Northwest" in their correct "n"-value order, ignoring roles that did not earn a numeric "n" value.

In [20]:
#Filter the original dataset with 3 features title, n and character
required = titles[['title','n','character', 'name']]

#filter again with movie title north by northwest
required_new = required[required['title'] == 'North by Northwest']

#Drop the duplicates to not count the repeated values
required_new.drop_duplicates(keep='first', inplace=True)

#sort the result with n value
result = required_new.sort_values('n', ascending=True)

#print the cast also called names ordered by n in the movie north by northwest
print(result[['name','n']])

                        name    n
871426            Cary Grant  1.0
3518333      Eva Marie Saint  2.0
1457165          James Mason  3.0
3157232  Jessie Royce Landis  4.0
356707        Leo G. Carroll  5.0
...                      ...  ...
3441077      Maudie Prickett  NaN
3577128          Doris Singh  NaN
3599912         Helen Spring  NaN
3739133        Susan Whitney  NaN
3751311       Paula Winslowe  NaN

[116 rows x 2 columns]


#### Q15. Display the entire cast, in "n"-order, of the 1972 film "Sleuth".

In [21]:
#Creating a first filter from original dataset where taking title, n, name and year
filter_1 = titles[['title','n','name','year']]

#Creating a second filter from filter_1 dataset where taking title as sleuth
filter_2 = filter_1[filter_1['title'] == 'Sleuth']

#Creating a third filter from filter_2 dataset where taking year in which sleuth was released as 1972
filter_3 = filter_2[filter_2['year']==1972]

#Sorting the result by n value
filter_4 = filter_3.sort_values('n', ascending=True)

#Displaying the cast of 1972 sleuth film with n-value ordered
print(filter_4[['name','n']])

                       name    n
1705394    Laurence Olivier  1.0
326773        Michael Caine  2.0
373876       Alec Cawthorne  3.0
1465483  John (II) Matthews  4.0
2724553  Eve (III) Channing  5.0
1448690        Teddy Martin  6.0


#### Q16. Now display the entire cast, in "n"-order, of the 2007 version of "Sleuth".

In [22]:
#Creating a first filter from original dataset where taking title, n, name and year
filter_1 = titles[['title','n','name','year']]

#Creating a second filter from filter_1 dataset where taking title as sleuth
filter_2 = filter_1[filter_1['title'] == 'Sleuth']

#Creating a third filter from filter_2 dataset where taking year in which sleuth was released as 2007
filter_3 = filter_2[filter_2['year']==2007]

#Sorting the result by n value
filter_4 = filter_3.sort_values('n', ascending=True)

#Displaying the cast of 2007 sleuth film with n-value ordered
print(filter_4[['name','n']])

                        name    n
326774         Michael Caine  1.0
1293634             Jude Law  2.0
1805907        Harold Pinter  3.0
260242       Kenneth Branagh  NaN
373877   Alec (II) Cawthorne  NaN
2724552    Eve (II) Channing  NaN
3370424    Carmel O'Sullivan  NaN


#### Q17. How many roles were credited in the silent 1921 version of Hamlet?

In [23]:
#Creating a first filter from original dataset where taking title, n, character and year
filter_one = titles[['title','year','character','n']]

#Creating a second filter from filter_1 dataset where taking title as hamlet
filter_two = filter_one[filter_one['title'] == 'Hamlet']

#Creating a third filter from filter_2 dataset where taking year in which sleuth was released as 1921
filter_three = filter_two[filter_two['year'] == 1921]

#creating a fourth filter by taking notna values of the n column 
filter_four = filter_three[filter_three['n'].notna()]

#printing the chararcters count who got credits in 1921 hamlet film
print("The number of roles that were credited in the silent 1921 version of Hamlet film is: ")
print(len(filter_four['character'].unique()))

The number of roles that were credited in the silent 1921 version of Hamlet film is: 
9


#### Q18. How many roles were credited in Branagh’s 1996 Hamlet?

In [24]:
#Creating a first filter from original dataset where taking title, n, name, character and year
filter_one = titles[['title','year','name','n','character']]

#Creating a second filter from filter_1 dataset where taking title as hamlet
filter_two = filter_one[filter_one['title'] == 'Hamlet']

#Creating a third filter from filter_2 dataset where taking year in which sleuth was released as 1921
filter_three = filter_two[filter_two['year'] == 1996]


# #creating a fourth filter by taking character name is branagh 
filter_four = filter_three[filter_three['name'] == 'Kenneth Branagh']
filter_four

# #printing the chararcters count who got credits in 1921 hamlet film
print("The number of roles that were credited in the branagh's 1996 Hamlet film is: ")
print(len(filter_four['character']))

The number of roles that were credited in the branagh's 1996 Hamlet film is: 
1


#### Q19. How many "Hamlet" roles have been listed in all film credits through history?

In [50]:
#Creating a first filter from original dataset where taking title, n, character
filter_one = titles[['title','n','character']]

#Creating a second filter from filter_1 dataset where taking title as hamlet
filter_two = filter_one[filter_one['title'] == 'Hamlet']

#Creating a third filter from filter_2 dataset where taking year in which sleuth was released as 1921
filter_three = filter_two[filter_two['n'].notna()]


#count the characters/roles appeared in hamlet movie
print("Total roles that have been listed in all film credits history:\n",filter_three['character'].count())

Total roles that have been listed in all film credits history:
 249


#### Q20. How many people have played an "Ophelia"?

In [33]:
#Creating a first filter from original dataset where taking character, name 
filter_one = titles[['name','character']]

#Creating a second filter from first filter where character is ophelia
filter_two = filter_one[filter_one['character'] == 'Ophelia']

#PRint the count of names who played the ophelia character
print("No. of people who played role named ophelia is: " , filter_two['name'].count())

No. of people who played role named ophelia is:  117


#### Q21. How many people have played a role called "The Dude"?

In [34]:
#Creating a first filter from original dataset where taking character, name 
filter_one = titles[['name','character']]

#Creating a second filter from first filter where character is the dude
filter_two = filter_one[filter_one['character'] == 'The Dude']

#PRint the count of names who played the ophelia character
print("No. of people who played role named The Dude is: " , filter_two['name'].count())

No. of people who played role named The Dude is:  19


#### Q22. How many people have played a role called "The Stranger"?

In [35]:
#Creating a first filter from original dataset where taking character, name 
filter_one = titles[['name','character']]

#Creating a second filter from first filter where character is The stranger
filter_two = filter_one[filter_one['character'] == 'The Stranger']

#PRint the count of names who played the ophelia character
print("No. of people who played role named The Stranger is: " , filter_two['name'].count())

No. of people who played role named The Stranger is:  212


#### Q23. How many roles has Sidney Poitier played throughout his career?

In [36]:
#Creating a first filter from original dataset where taking character, name 
filter_one = titles[['name','character']]

#Creating a second filter from first filter where name is Sidney Poitier
filter_two = filter_one[filter_one['name'] == 'Sidney Poitier']

#PRint the count of roles that he played
print("No. of roles Sidney Poitier played in his career: " , filter_two['character'].count())

No. of roles Sidney Poitier played in his career:  43


#### Q24. How many roles has Judi Dench played?

In [38]:
#Creating a first filter from original dataset where taking character, name 
filter_one = titles[['name','character']]

#Creating a second filter from first filter where name is Judi Dench
filter_two = filter_one[filter_one['name'] == 'Judi Dench']

#PRint the count of roles that he played
print("No. of roles Judi Dench played in his career: " , filter_two['character'].count())

No. of roles Judi Dench played in his career:  55


#### Q25. List the supporting roles (having n=2) played by Cary Grant in the 1940s, in order by year.

In [45]:
#Creating a first filter from original dataset where taking character, n, name and year
filter_one = titles[['name','character','n','year']]

#Creating a second filter from first filter where name is Cary grant
filter_two = filter_one[filter_one['name'] == 'Cary Grant']

#Creating a thrid filter from second filter where n is 2
filter_three = filter_two[filter_two['n'] == 2]
filter_three

#Creating a fourth filter where he played supporting roles in 1940's
filter_four = filter_three[filter_three['year'].between(1940,1950)]

#PRinting the result sorted by year
print(filter_four.sort_values('year', ascending=True))

              name    character    n  year
871423  Cary Grant   Nick Arden  2.0  1940
871433  Cary Grant  Roger Adams  2.0  1941


#### Q26. List the leading roles that Cary Grant played in the 1940s in order by year.

In [44]:
#Creating a first filter from original dataset where taking character, n, name and year
filter_one = titles[['name','character','n','year']]

#Creating a second filter from first filter where name is Cary grant
filter_two = filter_one[filter_one['name'] == 'Cary Grant']

#Creating a thrid filter from second filter where n is 1(leading role)
filter_three = filter_two[filter_two['n'] == 1]

#Creating a fourth filter where he played leading roles in 1940's
filter_four = filter_three[filter_three['year'].between(1940,1950)]

#PRinting the result sorted by year
print(filter_four.sort_values('year', ascending=True))

              name                             character    n  year
871448  Cary Grant                           Matt Howard  1.0  1940
871407  Cary Grant                          Walter Burns  1.0  1940
871450  Cary Grant                    C. K. Dexter Haven  1.0  1940
871438  Cary Grant                      Johnnie Aysgarth  1.0  1941
871452  Cary Grant                          Leopold Dilg  1.0  1942
871429  Cary Grant                 Patrick 'Pat' O'Toole  1.0  1942
871398  Cary Grant                         Capt. Cassidy  1.0  1943
871422  Cary Grant  Joe Adams -posing as Joe Bascopolous  1.0  1943
871430  Cary Grant                           Jerry Flynn  1.0  1944
871390  Cary Grant                     Mortimer Brewster  1.0  1944
871425  Cary Grant                            Ernie Mott  1.0  1944
871427  Cary Grant                                Devlin  1.0  1946
871424  Cary Grant                           Cole Porter  1.0  1946
871444  Cary Grant                           Dic

#### Q27. How many roles were available for actors in the 1950s?

In [51]:
#Creating a first filter from original dataset where taking character, type, year
filter_one = titles[['type','character','year']]

#Creating a second filter from first filter where type is actor
filter_two = filter_one[filter_one['type'] == 'actor']

#Creating a thrid filter where year is 1950s
filter_three = filter_two[filter_two['year'].between(1950,1960)]

#Printing the number of roles available for actors
print("Number of roles available for actors in 1950s is: " ,filter_three['character'].count())

Number of roles available for actors in 1950s is:  171302


#### Q28. How many roles were available for actresses in the 1950s?

In [52]:
#Creating a first filter from original dataset where taking character, type, year
filter_one = titles[['type','character','year']]

#Creating a second filter from first filter where type is actress
filter_two = filter_one[filter_one['type'] == 'actress']

#Creating a thrid filter where year is 1950s
filter_three = filter_two[filter_two['year'].between(1950,1960)]

#Printing the number of roles available for actors
print("Number of roles available for actresses in 1950s is: " ,filter_three['character'].count())

Number of roles available for actresses in 1950s is:  63333


#### Q29. How many leading roles (n=1) were available from the beginning of film history through 1980?

In [62]:
#Creating a first filter from original dataset where taking character, n, year
filter_one = titles[['character','n','year']]

#Creating a second filter from first filter where dataframe has years till 1980
filter_two = filter_one[filter_one['year'].between(1894,1980)]

#Creating a thrid filter where n is 1 from second filter
filter_three = filter_two[filter_two['n'] == 1]

#Printing the number of leading roles from beginning of industry to 1980
print("The number of leading roles available from beginning of film industry to 1980 is: ", filter_three['character'].count())


The number of leading roles available from beginning of film industry to 1980 is:  65140


#### Q30. How many non-leading roles were available through from the beginning of film history through 1980?

In [63]:
#Creating a first filter from original dataset where taking character, n, year
filter_one = titles[['character','n','year']]

#Creating a second filter from first filter where dataframe has years till 1980
filter_two = filter_one[filter_one['year'].between(1894,1980)]

#Creating a thrid filter where n is not 1 from second filter
filter_three = filter_two[filter_two['n'] != 1]

#Printing the number of leading roles from beginning of industry to 1980
print("The number of non-leading roles available from beginning of film industry to 1980 is: ", filter_three['character'].count())


The number of non-leading roles available from beginning of film industry to 1980 is:  1117667


#### Q31. How many roles through 1980 were minor enough that they did not warrant a numeric "n" rank?

In [67]:
#Creating a first filter from original dataset where taking character, n
filter_one = titles[['character','n', 'year']]

#Creating a second filter from first filter where n is null or nan(minor enough to get numeric n rank)
filter_two = filter_one[filter_one['n'].isnull()]

#Creating a thrid filter that were minors from beginning(minimum value of year == 1894) to 1980
filter_three = filter_two[filter_two['year'].between(1894,1980)]

#Printing the number of roles that are not enough to get numeric n rank
print("The number of roles that were minors enough to get numeric n rank is from beginning to 1980: ")
print(filter_three['character'].count())

The number of roles that were minors enough to get numeric n rank is from beginning to 1980: 
448347
