## 1. Selecting Multiple Columns

In [2]:
import numpy as np
import pandas as pd

In [10]:
# Forward slashes keep the path portable across operating systems and avoid the
# invalid '\m' escape sequence that a backslash inside a normal string produces.
movie = pd.read_csv('data/movies.csv')
# Keep the column list in its own variable so the selection stays readable.
cols = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
# Indexing with a list of labels returns a DataFrame holding just those columns.
movie_actor_director = movie[cols]
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


## 2. Selecting Columns with Methods

In [11]:
# Use the movie title as the row index for this section.
movie = pd.read_csv('data/movies.csv', index_col='movie_title')
# DataFrame.get_dtype_counts() was removed in pandas 1.0; counting the values of
# .dtypes gives the same "number of columns per dtype" summary.
movie.dtypes.value_counts()

float64    13
int64       3
object     11
dtype: int64

In [12]:
# Keep only the integer-typed columns and preview the first rows.
(
    movie
    .select_dtypes(include=['integer'])
    .head()
)

Unnamed: 0_level_0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,886204,4834,33000
Pirates of the Caribbean: At World's End,471220,48350,0
Spectre,275868,11700,85000
The Dark Knight Rises,1144337,106759,164000
Star Wars: Episode VII - The Force Awakens,8,143,0


In [13]:
# filter(like=...) keeps every column whose name contains the given substring.
(
    movie
    .filter(like='facebook')
    .head()
)

Unnamed: 0_level_0,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,actor_2_facebook_likes,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avatar,0.0,855.0,1000.0,4834,936.0,33000
Pirates of the Caribbean: At World's End,563.0,1000.0,40000.0,48350,5000.0,0
Spectre,0.0,161.0,11000.0,11700,393.0,85000
The Dark Knight Rises,22000.0,23000.0,27000.0,106759,23000.0,164000
Star Wars: Episode VII - The Force Awakens,131.0,,131.0,143,12.0,0


In [14]:
# filter(regex=...) searches each column name with the given regular expression
# and keeps the names that match; this pattern keeps names containing a digit.
# A raw string is required so that '\d' reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
movie.filter(regex=r'\d').head()

Unnamed: 0_level_0,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,actor_3_name,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avatar,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
Pirates of the Caribbean: At World's End,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
Spectre,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
The Dark Knight Rises,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
Star Wars: Episode VII - The Force Awakens,,Rob Walker,131.0,Doug Walker,,12.0


In [15]:
# filter(items=...) selects columns by exact name, like indexing with a list,
# but names that do not exist ('asdf' here) are skipped instead of raising.
(
    movie
    .filter(items=['director_facebook_likes', 'asdf'])
    .head()
)

Unnamed: 0_level_0,director_facebook_likes
movie_title,Unnamed: 1_level_1
Avatar,0.0
Pirates of the Caribbean: At World's End,563.0
Spectre,0.0
The Dark Knight Rises,22000.0
Star Wars: Episode VII - The Force Awakens,131.0


## 3. Ordering Columns Sensibly

In [16]:
# Reload with the default integer index. Forward slashes keep the path portable
# and avoid the invalid '\m' escape sequence of the backslash form.
movie = pd.read_csv('data/movies.csv')
# List the column labels in their on-disk order.
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [17]:
# Group the column names into discrete ("disc_") and continuous ("cont_") themes
# so that they can later be concatenated into one sensible ordering.
disc_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres',
]
disc_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
disc_other = [
    'color',
    'country',
    'language',
    'plot_keywords',
    'movie_imdb_link',
]
cont_fb = [
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
]
cont_finance = [
    'budget',
    'gross',
]
cont_num_reviews = [
    'num_critic_for_reviews',
    'num_voted_users',
    'num_user_for_reviews',
]
cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster',
]

In [18]:
# Concatenate the themed groups into the final column order.
cols_ordered = [
    *disc_core,
    *disc_people,
    *disc_other,
    *cont_fb,
    *cont_finance,
    *cont_num_reviews,
    *cont_other,
]
# Sanity check: the new ordering names exactly the columns present in the frame.
set(movie.columns) == set(cols_ordered)

True

In [19]:
# Build a copy of the frame whose columns follow the new ordering.
movie2 = movie.loc[:, cols_ordered]
movie2.head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_facebook_likes,budget,gross,num_critic_for_reviews,num_voted_users,num_user_for_reviews,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,723.0,886204,3054.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,302.0,471220,1238.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,...,85000,245000000.0,200074175.0,602.0,275868,994.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,...,164000,250000000.0,448130642.0,813.0,1144337,2701.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens ...,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,...,0,,,,8,,7.1,,,0.0


In [20]:
# Refresher: drop=False keeps 'movie_title' as a regular column in addition to
# using it as the index (visible in the output, where it appears twice).
(
    movie2
    .set_index(keys='movie_title', drop=False)
    .head(n=2)
)

Unnamed: 0_level_0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_facebook_likes,budget,gross,num_critic_for_reviews,num_voted_users,num_user_for_reviews,imdb_score,duration,aspect_ratio,facenumber_in_poster
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,723.0,886204,3054.0,7.9,178.0,1.78,0.0
Pirates of the Caribbean: At World's End,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,302.0,471220,1238.0,7.1,169.0,2.35,0.0


## 4. Operating on the Entire DataFrame

In [21]:
# Re-read the movies data (default integer index) as the starting point for
# the whole-frame operations in this section.
movie = pd.read_csv('data/movies.csv')

In [22]:
# Basic dimensions: (rows, columns), total number of cells, number of axes,
# and the row count.
(
    movie.shape,
    movie.size,
    movie.ndim,
    len(movie),
)

((5043, 28), 141204, 2, 5043)

In [23]:
# Number of non-missing values per column; the resulting Series is indexed by
# column name.
movie.count(axis='index')

color                        5024
director_name                4939
num_critic_for_reviews       4993
duration                     5028
director_facebook_likes      4939
actor_3_facebook_likes       5020
actor_2_name                 5030
actor_1_facebook_likes       5036
gross                        4159
genres                       5043
actor_1_name                 5036
movie_title                  5043
num_voted_users              5043
cast_total_facebook_likes    5043
actor_3_name                 5020
facenumber_in_poster         5030
plot_keywords                4890
movie_imdb_link              5043
num_user_for_reviews         5022
language                     5031
country                      5038
content_rating               4740
budget                       4551
title_year                   4935
actor_2_facebook_likes       5030
imdb_score                   5043
aspect_ratio                 4714
movie_facebook_likes         5043
dtype: int64

In [24]:
# Simple statistics such as min(), max() and mean() reduce each column to one
# value and return a Series indexed by column name.
#
# The object (string) columns contain missing values, and strings cannot be
# ordered against missing values. Older pandas versions silently dropped such
# columns from the result; pandas 2.0 and later raise a TypeError instead.
# Passing numeric_only=True restricts the reduction to the numeric columns
# explicitly, so the cell gives the same result on every version.
movie.min(numeric_only=True)

num_critic_for_reviews          1.00
duration                        7.00
director_facebook_likes         0.00
actor_3_facebook_likes          0.00
actor_1_facebook_likes          0.00
gross                         162.00
num_voted_users                 5.00
cast_total_facebook_likes       0.00
facenumber_in_poster            0.00
num_user_for_reviews            1.00
budget                        218.00
title_year                   1916.00
actor_2_facebook_likes          0.00
imdb_score                      1.60
aspect_ratio                    1.18
movie_facebook_likes            0.00
dtype: float64

In [25]:
# Request custom percentiles. Note in the output that the 50% (median) row is
# present even though 0.5 is not in the requested list.
movie.describe(
    percentiles=[0.01, 0.33, 0.66, 0.99],
)

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
1%,2.0,43.0,0.0,0.0,7.0,8497.8,54.26,6.42,0.0,1.21,60000.0,1951.34,0.0,3.1,1.33,0.0
33%,67.0,97.0,14.54,204.0,755.0,10561400.0,14609.76,1915.74,0.0,90.93,10000000.0,2001.0,396.57,6.1,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
66%,159.0,111.0,120.08,535.0,3000.0,44804820.0,67275.56,6213.44,1.0,249.0,32000000.0,2009.0,812.0,7.0,2.35,747.72
99%,548.08,189.0,16000.0,11000.0,44000.0,333573600.0,681094.3,63027.24,8.0,2010.11,203500000.0,2016.0,17000.0,8.5,4.0,94580.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


## 5. Chaining DataFrame methods together

In [26]:
# isnull() returns a DataFrame in which every column is boolean. Counting the
# columns per dtype confirms it. (get_dtype_counts() was removed in pandas 1.0,
# so the dtypes are counted with value_counts() instead; the earlier bare
# `movie.isnull().head()` was dropped because a mid-cell expression is never
# displayed.)
movie.isnull().dtypes.value_counts()

bool    28
dtype: int64

In [27]:
# Summing the boolean mask column-wise counts the missing values per column.
(
    movie
    .isnull()
    .sum()
    .head(3)
)

color                      19
director_name             104
num_critic_for_reviews     50
dtype: int64

In [28]:
# Summing twice collapses the per-column counts into one total for the frame.
(
    movie
    .isnull()
    .sum()
    .sum()
)

2698

In [29]:
# Alternative: ask whether anything is missing at all. The first any() yields
# one boolean per column (a Series); the second reduces that to a single value.
(
    movie
    .isnull()
    .any()
    .any()
)

True

## 6. Working with Operators on a DataFrame

In [30]:
# Load the college data with the institution name as the row index.
college = pd.read_csv(
    'data/college.csv',
    index_col='INSTNM',
)
college.head(n=2)

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5


In [31]:
# `college + 5` would raise a TypeError because the frame mixes string and
# numeric columns. Arithmetic operators need homogeneous data, so first select
# the 'UGDS_' columns, which hold the fraction of undergraduate students of
# each race at every institution, and operate on that subset.
ugds = college.filter(like='UGDS_')
ugds.head(n=5)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [32]:
# Round every value to the nearest hundredth using arithmetic only (no round()):
# shift by just over half a hundredth, floor-divide by 0.01, then scale back.
ugds_rounded = ugds.add(0.00501).floordiv(0.01).div(100)
ugds_rounded.head(n=5)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


In [33]:
# The same rounding with the built-in method; the tiny offset nudges values
# that sit exactly on a half-way point upward before rounding.
ugds_rounded_method = ugds.add(0.00001).round(decimals=2)
ugds_rounded_method.head(n=5)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


In [34]:
# Confirm that both rounding approaches produced identical frames; equals()
# returns a single boolean for the whole comparison.
ugds_rounded.equals(ugds_rounded_method)

True

## 7. Comparing missing values

In [35]:
# NaN is not equal to anything, including itself.
print(np.nan == np.nan)

False


In [36]:
# In contrast to NaN, None compares equal to itself.
print(None == None)

True


In [37]:
# Ordering comparisons that involve NaN evaluate to False.
print(np.nan > 5)
print(5 < np.nan)

# Inequality (!=) is the one comparison operator that yields True with NaN.
print(np.nan != 5)

False
False
True


In [39]:
# On a Series or DataFrame, == compares element by element and returns an
# object of the same shape. Reload the college data and keep only the
# 'UGDS_' columns to experiment with.
college = pd.read_csv(
    'data/college.csv',
    index_col='INSTNM',
)
college_ugds_ = college.filter(like='UGDS_')

In [43]:
# Compare every cell against a scalar; the result is a boolean frame.
(college_ugds_ == 0.003).head(2)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,False,False,False,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False


In [44]:
# Compare the frame with itself, element by element.
college_self_compare = (college_ugds_ == college_ugds_)
college_self_compare.head(2)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True


In [48]:
# Not every value is True: missing values never compare equal under ==, so any
# column that contains a missing value fails the all() check.
(
    college_self_compare
    .all()
    .head(2)
)

UGDS_WHITE    False
UGDS_BLACK    False
dtype: bool

In [50]:
# Hence == is not the right tool for checking whether two DataFrames are the
# same; use the equals() method, which gives the expected answer here.
college_ugds_.equals(college_ugds_) # returns True, as expected

True

## 8. Transposing the Direction of a DataFrame Operation

In [52]:
# By default count() reports the non-missing values of each column (axis=0).
# Passing axis='columns' (i.e. axis=1) transposes the operation so that it
# reports the non-missing values of each row instead.
(
    college_ugds_
    .count(axis='columns')
    .head()
)

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [57]:
# Instead of counting, add up the values in each row. They are the fractions of
# each race at an institution, so every row should sum to about 1.
(
    college_ugds_
    .sum(axis='columns')
    .head()
)

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [59]:
# Column-wise median; axis='index' (axis=0) is the default and is spelled out
# here only for clarity.
(
    college_ugds_
    .median(axis='index')
    .head()
)

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
dtype: float64

## 9. Determining College Campus Diversity

In [60]:
# Load the diversity index data, indexed by school name.
diversity = pd.read_csv(
    'data/college_diversity.csv',
    index_col='School',
)
diversity.head(n=3)

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74


In [61]:
# Count the missing values in each row and list the colleges with the most
# missing values first.
(
    college_ugds_
    .isnull()
    .sum(axis='columns')
    .sort_values(ascending=False)
    .head()
)

INSTNM
Excel Learning Center-San Antonio South         9
Philadelphia College of Osteopathic Medicine    9
Assemblies of God Theological Seminary          9
Episcopal Divinity School                       9
Phillips Graduate Institute                     9
dtype: int64

In [62]:
# Drop the colleges whose race columns are all missing.
# dropna defaults to how='any', which would remove every row containing at
# least one missing value; how='all' removes only fully-missing rows.
college_ugds_ = college_ugds_.dropna(
    axis='index',
    how='all',
)

In [64]:
# Re-run the per-row missing-value count on the filtered frame.
(
    college_ugds_
    .isnull()
    .sum(axis='columns')
    .sort_values(ascending=False)
    .head()
)

INSTNM
Coastal Pines Technical College               0
CUNY Bronx Community College                  0
College of Staten Island CUNY                 0
CUNY City College                             0
CUNY Graduate School and University Center    0
dtype: int64

In [65]:
# Diversity metric: for each college, the number of race columns whose fraction
# is at least 0.15.
diversity_metric = (college_ugds_ >= 0.15).sum(axis=1)
diversity_metric.head(n=5)

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [66]:
# Distribution of the metric. The last row of the output shows that two schools
# have a fraction of at least 0.15 in five different race columns.
diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [69]:
# Sort the metric in descending order to see which schools score highest.
(
    diversity_metric
    .sort_values(ascending=False)
    .head()
)

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64

In [73]:
# A score of five looks suspiciously high, so inspect the underlying fractions
# directly. .loc selects rows by label, here the institution names.
college_ugds_.loc[
    [
        'Regency Beauty Institute-Austin',
        'Central Texas Beauty College-Temple',
    ]
]

# Both schools have a sizeable share in the unknown (UGDS_UNKN) column, yet
# even setting that column aside they still look fairly diverse.

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


In [74]:
# Look up the diversity metric for a hand-picked list of highly ranked US
# schools (presumably taken from the US News diversity ranking loaded above;
# verify against the source).
us_top_news = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]
diversity_metric.loc[us_top_news]

# Each of these schools scores at least 3 on the metric, so they also fare well
# in terms of racial diversity.

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64