## 1. Developing a Data Analysis Routine

In [2]:
import numpy as np
import pandas as pd

# This section walks through a common set of tasks performed when first importing a new dataset.

In [11]:
#this is needed to display the quantiles that we would like to display later on
from IPython.display import display 

In [3]:
# It is good practice to develop a repeatable routine for the first look at a new dataset.
# The routine can evolve as you learn more about pandas.

college = pd.read_csv('data/college.csv')
# Display the first few rows as a quick sanity check of the columns and values.
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [4]:
# shape is (number of rows, number of columns): 7535 rows, each with 27 columns.
college.shape

(7535, 27)

In [5]:
# info() is the primary method for getting the most metadata in a single call.
college.info()

# It reports each column's dtype and number of non-missing values, plus the memory usage.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
INSTNM                7535 non-null object
CITY                  7535 non-null object
STABBR                7535 non-null object
HBCU                  7164 non-null float64
MENONLY               7164 non-null float64
WOMENONLY             7164 non-null float64
RELAFFIL              7535 non-null int64
SATVRMID              1185 non-null float64
SATMTMID              1196 non-null float64
DISTANCEONLY          7164 non-null float64
UGDS                  6874 non-null float64
UGDS_WHITE            6874 non-null float64
UGDS_BLACK            6874 non-null float64
UGDS_HISP             6874 non-null float64
UGDS_ASIAN            6874 non-null float64
UGDS_AIAN             6874 non-null float64
UGDS_NHPI             6874 non-null float64
UGDS_2MOR             6874 non-null float64
UGDS_NRA              6874 non-null float64
UGDS_UNKN             6874 non-null float64
PPTUG_EF          

In [6]:
# Summary statistics for the numeric columns only.
# Transposed so that each original column becomes a row, which is easier to scan.
college.describe(include=[np.number]).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [7]:
# Now do the same for the columns holding categorical (non-numeric) values.
# Transposing enhances readability, especially when there are many columns.
# Use the builtin `object` and the 'category' dtype string as selectors:
# the `np.object` alias was removed from NumPy (1.24+), and `pd.Categorical`
# is an array class rather than a dtype selector.
college.describe(include=[object, 'category']).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,West Virginia Junior College-Bridgeport,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [12]:
# describe() accepts the exact quantiles to report via `percentiles`.
# The display is temporarily limited to 5 rows so the wide table stays compact.
with pd.option_context('display.max_rows', 5):
    display(
        college
        .describe(
            include=[np.number],
            percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
        )
        .transpose()
    )

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.000000,0.00000,0.00000,1.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.000000,0.00000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PCTFLOAN,6849.0,0.522211,0.283616,0.0,0.0000,0.0000,0.0000,0.3329,0.58330,0.745000,0.84752,0.89792,0.986368,1.0
UG25ABV,6718.0,0.410021,0.228939,0.0,0.0025,0.0374,0.0899,0.2415,0.40075,0.572275,0.72666,0.80000,0.917383,1.0


## 2. Reducing memory by changing data types

In [37]:
# Choose a few columns of differing dtypes and build a smaller dataset from them.
# Take an explicit copy: later cells assign converted columns back into college2,
# and assigning into a plain selection can raise SettingWithCopyWarning.
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
college2 = college.loc[:, different_cols].copy()
college2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [38]:
# The dtypes show that the numeric columns default to 64-bit types,
# irrespective of how small the stored values really are.
college2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [39]:
# Record how much memory (in bytes) each column takes up before any conversion.
# deep=True is required to get the real memory usage of object dtype columns.
original_mem = college2.memory_usage(deep=True)

In [40]:
# RELAFFIL only holds 0/1 flags, so an 8-bit integer is plenty; no need for 64 bits.
college2['RELAFFIL'] = college2['RELAFFIL'].astype('int8')

In [41]:
college2.memory_usage(deep=True)
# To decrease memory usage further, consider converting object columns
# to the categorical dtype, provided they have low cardinality,
# i.e. a small number of unique values.

Index           80
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR      444565
dtype: int64

In [42]:
# Count the unique values in each object column to judge its cardinality.
college2.select_dtypes(include=[object]).nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [43]:
# Object columns can hold arbitrary Python objects of different types,
# so the memory taken by each value in such a column is not uniform.

# Convert STABBR to the categorical dtype, since its number of
# unique values (59) is less than 1% of its 7535 values.

college2['STABBR'] = college2['STABBR'].astype('category')

In [44]:
# Measure per-column memory again, after the int8 and category conversions.
new_mem = college2.memory_usage(deep=True)

In [45]:
# Ratio of new to original memory per column (1.0 means unchanged).
new_mem / original_mem
# RELAFFIL is now 1/8th of its original size,
# and STABBR is about 3% of its original size.

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000000
STABBR      0.030538
dtype: float64

## 3. Selecting the smallest of the largest

In [47]:
# Goal of this section: find the lowest budget films among the
# top 100 scoring movies, using nlargest and nsmallest.
movie = pd.read_csv('data/movies.csv')
# List the available columns before picking the ones we need.
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [50]:
# Keep only the title, the score, and the budget for this analysis.
movie2 = movie[[
    'movie_title',
    'imdb_score',
    'budget',
]]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens ...,7.1,


In [51]:
# Select the 100 movies with the highest imdb_score and preview the first five.
movie2.nlargest(n=100, columns='imdb_score').head()

Unnamed: 0,movie_title,imdb_score,budget
2765,Towering Inferno,9.5,
1937,The Shawshank Redemption,9.3,25000000.0
3466,The Godfather,9.2,6000000.0
2824,Dekalog,9.1,
3207,Dekalog,9.1,


In [52]:
# Chain nsmallest onto the previous command to get the five smallest
# budgets among the top 100 scoring movies.
# Note: the `columns` parameter also accepts a list of column names; the
# additional columns are used to break ties when several rows share the
# value at the nth spot of the first column.
(
    movie2
    .nlargest(n=100, columns='imdb_score')
    .nsmallest(n=5, columns='budget')
)

Unnamed: 0,movie_title,imdb_score,budget
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0
2242,Psycho,8.5,806947.0


## 4. Selecting the largest of each group by sorting

In [64]:
# Goal of this section: find the highest rated film of each year.
# This is a common operation: selecting the row with the largest
# value of one column within each group.

# Keep only the columns relevant to that question.
movie3 = movie[[
    'movie_title',
    'title_year',
    'imdb_score',
]]
movie3.head()

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates of the Caribbean: At World's End,2007.0,7.1
2,Spectre,2015.0,6.8
3,The Dark Knight Rises,2012.0,8.5
4,Star Wars: Episode VII - The Force Awakens ...,,7.1


In [65]:
# First, sort by year only (most recent first) to see what a single-column sort gives.
movie3.sort_values('title_year', ascending=False).head()

Unnamed: 0,movie_title,title_year,imdb_score
2366,Fight Valley,2016.0,5.0
3817,Yoga Hosers,2016.0,4.8
1367,The 5th Wave,2016.0,5.2
1742,The Boss,2016.0,5.3
519,The Secret Life of Pets,2016.0,6.8


In [66]:
# Now sort the same DataFrame by multiple columns at once.
movie3 = movie3.sort_values(
    by=['title_year', 'imdb_score'],
    ascending=False,
)

# What this command does:
# 1. sorts the movies by title_year, in descending order;
# 2. within each year, sorts the movies by rating, highest to lowest.

In [67]:
# We are now set up to select the highest rated movie of each year.
#
# By default drop_duplicates drops duplicate *rows*, comparing every column, so
# only rows that are identical in all columns would be removed. Passing
# subset='title_year' makes it compare that column alone: for each year, only the
# first occurrence is kept and every later row with the same year is dropped.
# Because movie3 is sorted by imdb_score (highest first) within each year,
# the row that is kept is the year's highest rated movie.

movie_top_yearwise = movie3.drop_duplicates(subset='title_year')
movie_top_yearwise.head()

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
3816,Running Forever,2015.0,8.6
4468,Queen of the Mountains,2014.0,8.7
4017,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5


## 5. Replicating nlargest with sort_values

In [82]:
# Try to replicate the result obtained with nlargest/nsmallest, now using sort_values,
# and explore the differences between the two approaches.

movie_top100 = movie2.sort_values('imdb_score', ascending=False).head(100)
# ascending is True by default here, so head() returns the five smallest budgets.
movie_least_budget = movie_top100.sort_values('budget').head()

movie_least_budget  # attempted replication of the result from part 3 of this notebook

Unnamed: 0,movie_title,imdb_score,budget
4937,A Charlie Brown Christmas,8.4,150000.0
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0


In [83]:
# Compare with the result obtained previously using nlargest/nsmallest.
# The two are not identical, so we need to look closer to find out why.
movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget')

Unnamed: 0,movie_title,imdb_score,budget
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0
2242,Psycho,8.5,806947.0


In [85]:
# Inspect the intermediate step (the top 100 by score) as produced by nlargest,
# looking at the last five rows, i.e. the ones closest to the cut-off.
movie2.nlargest(100, 'imdb_score').tail()

Unnamed: 0,movie_title,imdb_score,budget
4017,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4105,Oldboy,8.4,3000000.0
4253,To Kill a Mockingbird,8.4,2000000.0
4496,Reservoir Dogs,8.4,1200000.0
4659,A Separation,8.4,500000.0


In [87]:
# The same intermediate step as produced by sort_values, again looking at the last five rows.
movie2.sort_values('imdb_score', ascending=False).head(100).tail()

Unnamed: 0,movie_title,imdb_score,budget
3849,Requiem for a Dream,8.4,4500000.0
3623,Veronica Mars,8.4,
3685,Rang De Basanti,8.4,
2362,The Shining,8.4,19000000.0
3807,Psych,8.4,


In [89]:
# Many movies share the rating 8.4, which falls right at the 100-row cut-off,
# so which of them make it into the "top 100" depends on how ties are broken.
# The two approaches break ties differently:
#   - nlargest defaults to keep='first', preferring the rows that occur first;
#   - sort_values defaults to kind='quicksort', which is not a stable sort,
#     so the relative order of tied rows is not guaranteed.
# That is why the two results above differ.


