##  1. <u>Reading Data </u>

In [1]:
import numpy as np
import pandas as pd

In [3]:
movies = pd.read_csv('data/movies.csv')

In [5]:
movies.head(3)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


## 2. <u> Accessing Main DF Components </u>

In [6]:
index = movies.index
columns = movies.columns
data = movies.values

In [13]:
print(type(data), type(index), type(columns))

<class 'numpy.ndarray'> <class 'pandas.core.indexes.range.RangeIndex'> <class 'pandas.core.indexes.base.Index'>


## 3. <u>Understanding Data Types</u>

In [14]:
movies.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [15]:
movies.get_dtype_counts()

float64    13
int64       3
object     12
dtype: int64

## 4. <u>Selecting a single column as a Series</u>

In [21]:
directorName = movies['director_name']
#if name is not passed, series name is taken as the col name
directorName = directorName.to_frame(name='DirectorName')

In [22]:
directorName.head()

Unnamed: 0,DirectorName
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


## 5. <u>Calling Series Methods</u>

In [26]:
len(set(dir(pd.Series))) #finding the number of Series methods

439

In [27]:
len(set(dir(pd.DataFrame))) #finding the number of DataFrame methods

444

In [36]:
director = movies['director_name']
director.value_counts().head()

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Ridley Scott        17
Name: director_name, dtype: int64

In [38]:
likes1 = movies['actor_1_facebook_likes']
#the output indicates that the likes have been rounded off to the nearest thousands
#since, its unlikely that so many datapoints got the EXACT SAME number of likes
likes1.value_counts().head()

1000.0     449
11000.0    211
2000.0     197
3000.0     155
12000.0    135
Name: actor_1_facebook_likes, dtype: int64

In [40]:
#finding the number of rows with non missing values in the dataframe
likes1.count(), director.count()

(5036, 4939)

In [43]:
likes1.quantile([0.3, 0.4])

0.3    700.0
0.4    862.0
Name: actor_1_facebook_likes, dtype: float64

In [55]:
#returns a series containing bools, where True indicates that the value is null
director.isnull().tail()

5038    False
5039     True
5040    False
5041    False
5042    False
Name: director_name, dtype: bool

In [56]:
#lets create a "likes" series, where all the null values are filled with 0
likes1_filled = likes1.fillna(0)
#lets also create a "likes" series, where all the null values are dropped
likes1_dropped = likes1.dropna()
#lets print out the number of missing values in the actor1_fb_likes series
likes1_filled.size - likes1_dropped.size

7

## 06. <u>Working with Operators on a Series</u>

In [57]:
imdb = movies["imdb_score"]
imdb.head()

0    7.9
1    7.1
2    6.8
3    8.5
4    7.1
Name: imdb_score, dtype: float64

In [58]:
(imdb * 3).head(), (imdb > 8).head()

(0    23.7
 1    21.3
 2    20.4
 3    25.5
 4    21.3
 Name: imdb_score, dtype: float64, 0    False
 1    False
 2    False
 3     True
 4    False
 Name: imdb_score, dtype: bool)

## 07. <u>Chaining Methods in a Series</u>

In [60]:
#a more compact ways of finding the number of missing values in a Series
likes1.isnull().sum()
#this is because pandas assumes True=1 and False=0, and therefore the summation of the series gives the count of missing values.

7

In [61]:
#lets fill in the missing values, convert the datatype to int and then print head
likes1.fillna(0)\
      .astype(int)\
      .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

## 08. <u>Making the Index more meaningful</u>

In [74]:
#lets use movie title as the row label for our dataframe
movies2 = movies.set_index('movie_title')
movies2.tail(3)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,Documentary,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


In [111]:
#alternate way to do the same thing
movies3 = pd.read_csv('data/movies.csv', index_col='movie_title')
movies3.tail(3)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,Documentary,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


In [77]:
#lets reset movie2 make to the default rangeIndex
movies2.reset_index().tail(3)

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
5040,A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
5041,Shanghai Calling,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660
5042,My Date with Drew,Color,Jon Gunn,43.0,90.0,16.0,16.0,Brian Herzlinger,86.0,85222.0,...,84.0,English,USA,PG,1100.0,2004.0,23.0,6.6,1.85,456


In [78]:
#lets now reinitialize movies2 to include the 'movie title' in the dataFrame, while using it as the row label simultaneously
print(movies2.columns.size)
movies2 = movies.set_index('movie_title', drop=False)
print(movies2.columns.size)

27
28


## 09. <u>Renaming Columns and rows</u>

In [112]:
idx_rename = {'Avatar' : 'Ratava', 'Spectre' : 'Ertceps'}
col_rename = {'director_name' : 'Director Name', 'num_critic_for_reviews' : 'Critical Reviews'}

In [113]:
#idx_rename seems to have no effect... no clue on whats going wrong
movies3.rename(index=idx_rename,columns=col_rename).head()

Unnamed: 0_level_0,color,Director Name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## 10. <u>Adding and deleting columns from the DataFrame</u>

In [114]:
#adding a column called 'has_seen' to the dataFrame
movies3['has_seen'] = 0
movies3.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes', 'has_seen'],
      dtype='object')

In [115]:
#lets add a col actor_director_fb_likes
movies3['actor_director_fb_likes'] = (movies3['actor_1_facebook_likes']+\
                                      movies3['actor_2_facebook_likes']+\
                                      movies3['actor_3_facebook_likes']+\
                                      movies3['director_facebook_likes'])

In [116]:
movies3['actor_director_fb_likes'].isnull().sum()

124

In [117]:
movies3['actor_director_fb_likes'] = movies3['actor_director_fb_likes'].fillna(0)

In [118]:
#We want to check what fraction of the newly added col contributes to cast_total_facebook_likes
#however, inorder to do this, we need values in the new col to be less than cast_total_facebook_likes in all rows
#lets check that
movies3['is_cast_more'] = (movies3['cast_total_facebook_likes'] >= movies3['actor_director_fb_likes'])

In [123]:
#check if all entries in the series is True
movies3['is_cast_more'].all()

False

In [124]:
#this means it does not make sense to find the fraction of actor_director_fb_likes in cast_total_fb_likes. So lets drop this column
movies3 = movies3.drop('actor_director_fb_likes', axis = 'columns')

In [126]:
movies3 = movies3.drop('has_seen', axis = 1)

In [128]:
#lets trying finding the fraction for total_actor_likes by going through the same data validation as above
movies3['actor_total_fb_likes'] = (movies3['actor_1_facebook_likes']+\
                                   movies3['actor_2_facebook_likes']+\
                                   movies3['actor_3_facebook_likes'])

In [135]:
movies3['actor_total_fb_likes'] = movies3['actor_total_fb_likes'].fillna(0)

In [136]:
movies3['is_cast_more'] = (movies3['cast_total_facebook_likes'] >=
                           movies3['actor_total_fb_likes'])

In [137]:
#the data validation came out to be true, lets carry on to find the fraction of actor likes in total cast fb likes
movies3['is_cast_more'].all()

True

In [138]:
movies3['fraction_actor_likes'] = (movies3['actor_total_fb_likes'] / 
                                   movies3['cast_total_facebook_likes'])

In [139]:
#final validation that the min and max lie between 0.0 and 1.0
movies3['fraction_actor_likes'].min(), movies3['fraction_actor_likes'].max()

(0.0, 1.0)

In [141]:
#wohoo! we now have a new feature which is, the fraction of actor likes in total cast fb likes
movies3['fraction_actor_likes'].head()

movie_title
Avatar                                                     0.577369
Pirates of the Caribbean: At World's End                   0.951396
Spectre                                                    0.987521
The Dark Knight Rises                                      0.683783
Star Wars: Episode VII - The Force Awakens                 0.000000
Name: fraction_actor_likes, dtype: float64