# Chapter 2: Essential DataFrame Operations

In [1]:
import pandas as pd
import numpy as np

# Keep the rendered DataFrames small: at most 4 columns and 10 rows.
pd.set_option("display.max_columns", 4)
pd.set_option("display.max_rows", 10)

## Introduction

## Selecting Multiple DataFrame Columns

### How to do it\...

In [2]:
movies = pd.read_csv("../data/movie.csv")
# Indexing with a list of column names returns a DataFrame holding only
# those columns, in the order given.
movie_actor_director = movies[
    ["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]
]
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [3]:
# A one-element list of names still returns a DataFrame.
type(movies[["director_name"]])

pandas.core.frame.DataFrame

In [4]:
# A bare column name (no list) returns a Series.
type(movies["director_name"])

pandas.core.series.Series

In [5]:
# With .loc, a list of column labels returns a DataFrame.
type(movies.loc[:, ["director_name"]])

pandas.core.frame.DataFrame

In [6]:
# With .loc, a single column label returns a Series.
type(movies.loc[:, "director_name"])

pandas.core.series.Series

### How it works\...

### There\'s more\...

In [7]:
# Naming the list first keeps the selection itself short and readable.
cols = ["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]
movie_actor_director = movies[cols]

In [8]:
# Without the inner list the four names are passed as a single tuple key,
# which is not a column label, so pandas raises a KeyError.
try:
    movies["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]
except KeyError as e:
    print("KeyError:", e)
    print("Double brackets are needed to select multiple DataFrame columns")

KeyError: ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name')
Double brackets are needed to select multiple DataFrame columns


## Selecting Columns with Methods

### How to do it\...

In [9]:
# Reload the CSV so this recipe starts from the original column names.
movies = pd.read_csv("../data/movie.csv")


def shorten(col):
    """Return ``col`` with "facebook_likes" abbreviated to "fb" and "_for_reviews" removed."""
    abbreviated = col.replace("facebook_likes", "fb")
    return abbreviated.replace("_for_reviews", "")


# rename() applies the function to every column name.
movies = movies.rename(columns=shorten)
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25.0;
# counting the values of .dtypes gives the same summary.
movies.dtypes.value_counts()

float64    13
object     12
int64       3
Name: count, dtype: int64

In [10]:
# Keep only the integer columns.
movies.select_dtypes(include="int").head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [11]:
# "number" selects every numeric column, integer and float alike.
movies.select_dtypes(include="number").head()

Unnamed: 0,num_critic,duration,...,aspect_ratio,movie_fb
0,723.0,178.0,...,1.78,33000
1,302.0,169.0,...,2.35,0
2,602.0,148.0,...,2.35,85000
3,813.0,164.0,...,2.35,164000
4,,,...,,0


In [12]:
# A list of dtypes keeps the columns that match any of them.
movies.select_dtypes(include=["int", "object"]).head()

Unnamed: 0,color,director_name,...,content_rating,movie_fb
0,Color,James Cameron,...,PG-13,33000
1,Color,Gore Verbinski,...,PG-13,0
2,Color,Sam Mendes,...,PG-13,85000
3,Color,Christopher Nolan,...,PG-13,164000
4,,Doug Walker,...,,0


In [13]:
# exclude= keeps every column except those of the given dtype.
movies.select_dtypes(exclude="float").head()

Unnamed: 0,color,director_name,...,content_rating,movie_fb
0,Color,James Cameron,...,PG-13,33000
1,Color,Gore Verbinski,...,PG-13,0
2,Color,Sam Mendes,...,PG-13,85000
3,Color,Christopher Nolan,...,PG-13,164000
4,,Doug Walker,...,,0


In [14]:
# like= keeps the columns whose name contains the given substring.
movies.filter(like="fb").head()

Unnamed: 0,director_fb,actor_3_fb,...,actor_2_fb,movie_fb
0,0.0,855.0,...,936.0,33000
1,563.0,1000.0,...,5000.0,0
2,0.0,161.0,...,393.0,85000
3,22000.0,23000.0,...,23000.0,164000
4,131.0,,...,12.0,0


In [15]:
# items= keeps the columns named in the list.
cols = ["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]
movies.filter(items=cols).head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [16]:
# regex= keeps the columns whose name matches the pattern (here: contains a digit).
movies.filter(regex=r"\d").head()

Unnamed: 0,actor_3_fb,actor_2_name,...,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,...,Wes Studi,936.0
1,1000.0,Orlando Bloom,...,Jack Davenport,5000.0
2,161.0,Rory Kinnear,...,Stephanie Sigman,393.0
3,23000.0,Christian Bale,...,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,...,,12.0


### How it works\...

### There\'s more\...

### See also

## Ordering Column Names

### How to do it\...

In [17]:
# Reload the CSV so this recipe starts from the original column names.
movies = pd.read_csv("../data/movie.csv")


def shorten(col):
    """Return ``col`` with "facebook_likes" abbreviated to "fb" and "_for_reviews" removed."""
    abbreviated = col.replace("facebook_likes", "fb")
    return abbreviated.replace("_for_reviews", "")


# rename() applies the function to every column name.
movies = movies.rename(columns=shorten)

In [18]:
# Show the column names in their current order.
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [19]:
# Group the column names by theme; the groups are concatenated later to
# build the new column order.
# NOTE(review): the cat_/cont_ prefixes presumably stand for
# categorical/continuous — confirm.
cat_core = ["movie_title", "title_year", "content_rating", "genres"]
cat_people = ["director_name", "actor_1_name", "actor_2_name", "actor_3_name"]
cat_other = ["color", "country", "language", "plot_keywords", "movie_imdb_link"]
cont_fb = [
    "director_fb",
    "actor_1_fb",
    "actor_2_fb",
    "actor_3_fb",
    "cast_total_fb",
    "movie_fb",
]
cont_finance = ["budget", "gross"]
cont_num_reviews = ["num_voted_users", "num_user", "num_critic"]
cont_other = ["imdb_score", "duration", "aspect_ratio", "facenumber_in_poster"]

In [20]:
# Concatenate the themed groups into the final column order.
new_col_order = [
    *cat_core,
    *cat_people,
    *cat_other,
    *cont_fb,
    *cont_finance,
    *cont_num_reviews,
    *cont_other,
]
# Sanity check: the new order names exactly the columns of the DataFrame.
set(new_col_order) == set(movies.columns)

True

In [21]:
# Indexing with the full list of names returns the columns in the new order.
movies[new_col_order].head()

Unnamed: 0,movie_title,title_year,...,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,...,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,...,2.35,0.0
2,Spectre,2015.0,...,2.35,1.0
3,The Dark Knight Rises,2012.0,...,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,...,,0.0


### How it works\...

### There\'s more\...

### See also

## Summarizing a DataFrame

### How to do it\...

In [22]:
movies = pd.read_csv("../data/movie.csv")
# (number of rows, number of columns)
movies.shape

(4916, 28)

In [23]:
# Total number of elements: rows * columns.
movies.size

137648

In [24]:
# Number of dimensions.
movies.ndim

2

In [25]:
# len() of a DataFrame is its number of rows.
len(movies)

4916

In [26]:
# Number of non-missing values in each column.
movies.count()

color                      4897
director_name              4814
num_critic_for_reviews     4867
duration                   4901
director_facebook_likes    4814
                           ... 
title_year                 4810
actor_2_facebook_likes     4903
imdb_score                 4916
aspect_ratio               4590
movie_facebook_likes       4916
Length: 28, dtype: int64

⚠️ Comparing NaN (a float) with a string raises a `TypeError`, so `min` fails on object columns that contain missing values

**Option 1**: drop rows that contain NaN


In [27]:
# Option 1: drop every row that has at least one NaN.
# The filtered frame is computed once and reused for the report and the minimum.
movies_no_nan = movies.dropna()
n_rows = len(movies)
n_rows_kept = len(movies_no_nan)
print(
    f"Reduces dataframe rows by {(n_rows - n_rows_kept) / n_rows:.0%} "
    f"from {n_rows} to {n_rows_kept} rows"
)

# With no NaN left, min() works on every column, including the string ones.
opt_1 = movies_no_nan.min().to_frame(name="min_opt_1")
opt_1

Reduces dataframe rows by 26% from 4916 to 3654 rows


Unnamed: 0,min_opt_1
color,Black and White
director_name,Aaron Schneider
num_critic_for_reviews,2.0
duration,37.0
director_facebook_likes,0.0
...,...
title_year,1927.0
actor_2_facebook_likes,0.0
imdb_score,1.6
aspect_ratio,1.18


**Option 2**: `try` / `except` block to keep only numeric columns

In [28]:
# min() fails on the whole frame because object columns mix strings with
# NaN (a float); fall back to the numeric columns only.
try:
    movies.min()
except TypeError as e:
    print("TypeError", e)
    print("Selecting only numeric columns before aggregation")
    opt_2 = movies.select_dtypes("number").min().to_frame(name="min_opt_2")
    display(opt_2)

TypeError '<=' not supported between instances of 'str' and 'float'
Selecting only numeric columns before aggregation


Unnamed: 0,min_opt_2
num_critic_for_reviews,1.00
duration,7.00
director_facebook_likes,0.00
actor_3_facebook_likes,0.00
actor_1_facebook_likes,0.00
...,...
title_year,1916.00
actor_2_facebook_likes,0.00
imdb_score,1.60
aspect_ratio,1.18


**Option 3**: remove NaN from each series

In [29]:
# Option 3: drop NaN column by column instead of row by row, so each Series
# only loses its own missing values.
# split into numeric and non numeric columns, then exclude NaN values if any
movies_number_cols = movies.select_dtypes(include="number").columns
movies_number_series_without_nan = [movies[col].dropna() for col in movies_number_cols]

movies_object_cols = movies.select_dtypes(exclude="number").columns
movies_object_series_without_nan = [movies[col].dropna() for col in movies_object_cols]

# fraction of rows lost in each object column, as (column name, fraction) pairs
tuple_check = [
    (s.name, (movies.shape[0] - s.shape[0]) / movies.shape[0])
    for s in movies_object_series_without_nan
]
print("Reduced rows for each column")
# first row: column names, second row: fraction of rows dropped
display(
    pd.DataFrame(
        [
            [name for name, _ in tuple_check],
            [fraction for _, fraction in tuple_check],
        ]
    )
)

# check that we have the same columns as in the original dataset
movies_series_without_nan = (
    movies_number_series_without_nan + movies_object_series_without_nan
)
assert set(movies.columns) == {s.name for s in movies_series_without_nan}

# minimum of each NaN-free Series, one row per original column
movies_min = [s.min() for s in movies_series_without_nan]
opt_3 = pd.DataFrame(
    movies_min, index=[s.name for s in movies_series_without_nan], columns=["min_opt_3"]
)
opt_3

Reduced rows for each column


Unnamed: 0,0,1,...,10,11
0,color,director_name,...,country,content_rating
1,0.003865,0.020749,...,0.001017,0.061025


Unnamed: 0,min_opt_3
num_critic_for_reviews,1.0
duration,7.0
director_facebook_likes,0.0
actor_3_facebook_likes,0.0
actor_1_facebook_likes,0.0
...,...
plot_keywords,10 year old|dog|florida|girl|supermarket
movie_imdb_link,http://www.imdb.com/title/tt0006864/?ref_=fn_t...
language,Aboriginal
country,Afghanistan


**Option 4**: use `fillna` with relevant values => requires domain expertise

In [30]:
# Option 4 is a placeholder: choosing fill values needs domain knowledge, so
# no minimum is computed. One None per column of `movies`, sized from the
# frame itself rather than a hard-coded column count.
opt_4 = pd.DataFrame(
    data=[None] * len(movies.columns), index=movies.columns, columns=["min_opt_4"]
)
opt_4

Unnamed: 0,min_opt_4
color,
director_name,
num_critic_for_reviews,
duration,
director_facebook_likes,
...,...
title_year,
actor_2_facebook_likes,
imdb_score,
aspect_ratio,


**Option 5** : use `describe`

In [31]:
# Widen the display to 8 columns. This changes the global option, so it
# also affects every cell that runs after this one.
pd.set_option("display.max_columns", 8, "display.max_rows", 10)
# describe() summarises the numeric columns; keep only its "min" statistic.
opt_5 = movies.describe().T[["min"]].rename(columns={"min": "min_opt_5"})
opt_5

Unnamed: 0,min_opt_5
num_critic_for_reviews,1.00
duration,7.00
director_facebook_likes,0.00
actor_3_facebook_likes,0.00
actor_1_facebook_likes,0.00
...,...
title_year,1916.00
actor_2_facebook_likes,0.00
imdb_score,1.60
aspect_ratio,1.18


In [32]:
# Transposed so that each row is a column of `movies` and each column a statistic.
movies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4867.0,137.988905,120.239379,1.00,49.00,108.00,191.00,813.0
duration,4901.0,107.090798,25.286015,7.00,93.00,103.00,118.00,511.0
director_facebook_likes,4814.0,691.014541,2832.954125,0.00,7.00,48.00,189.75,23000.0
actor_3_facebook_likes,4893.0,631.276313,1625.874802,0.00,132.00,366.00,633.00,23000.0
actor_1_facebook_likes,4909.0,6494.488491,15106.986884,0.00,607.00,982.00,11000.00,640000.0
...,...,...,...,...,...,...,...,...
title_year,4810.0,2002.447609,12.453977,1916.00,1999.00,2005.00,2011.00,2016.0
actor_2_facebook_likes,4903.0,1621.923516,4011.299523,0.00,277.00,593.00,912.00,137000.0
imdb_score,4916.0,6.437429,1.127802,1.60,5.80,6.60,7.20,9.5
aspect_ratio,4590.0,2.222349,1.402940,1.18,1.85,2.35,2.35,16.0


In [33]:
# Custom percentiles (the 50% median is still reported); show the last five statistics.
movies.describe(percentiles=[0.01, 0.3, 0.99]).T.iloc[:, -5:]

Unnamed: 0,1%,30%,50%,99%,max
num_critic_for_reviews,2.00,60.00,108.00,546.68,813.0
duration,43.00,95.00,103.00,189.00,511.0
director_facebook_likes,0.00,11.00,48.00,16000.00,23000.0
actor_3_facebook_likes,0.00,176.00,366.00,11000.00,23000.0
actor_1_facebook_likes,6.08,694.00,982.00,44920.00,640000.0
...,...,...,...,...,...
title_year,1951.00,2000.00,2005.00,2016.00,2016.0
actor_2_facebook_likes,0.00,345.00,593.00,17000.00,137000.0
imdb_score,3.10,6.00,6.60,8.50,9.5
aspect_ratio,1.33,1.85,2.35,4.00,16.0


**Comparative summary of all options**

**Option 1**: drop rows that contain NaN => risk of dropping a row that holds a minimal value, e.g. `num_critic_for_reviews` (minimum 2.0 after `dropna` versus 1.0 in the full data)

**Option 2**: `try` / `except` block to keep only numeric columns => use `describe` to type less

**Option 3**: remove NaN from each series => to get the summary for object columns with missing values as well

**Option 4**: use `fillna` with relevant values => requires domain expertise

e.g. `movies.select_dtypes(exclude="number").fillna("").describe()`

**Option 5**: use `describe` => most convenient, especially if the focus is on numerical data

In [34]:
# opt_1 has one row per original column, so left-merging the other options
# onto it, index to index, keeps all of them.
summary = opt_1
for other in (opt_2, opt_3, opt_4, opt_5):
    summary = summary.merge(other, how="left", left_index=True, right_index=True)
summary

Unnamed: 0,min_opt_1,min_opt_2,min_opt_3,min_opt_4,min_opt_5
color,Black and White,,Black and White,,
director_name,Aaron Schneider,,A. Raven Cruz,,
num_critic_for_reviews,2.0,1.00,1.0,,1.00
duration,37.0,7.00,7.0,,7.00
director_facebook_likes,0.0,0.00,0.0,,0.00
...,...,...,...,...,...
title_year,1927.0,1916.00,1916.0,,1916.00
actor_2_facebook_likes,0.0,0.00,0.0,,0.00
imdb_score,1.6,1.60,1.6,,1.60
aspect_ratio,1.18,1.18,1.18,,1.18


### How it works\...

### There\'s more\...

In [35]:
# Names of the non-numeric columns.
movies.select_dtypes(exclude="number").columns

Index(['color', 'director_name', 'actor_2_name', 'genres', 'actor_1_name',
       'movie_title', 'actor_3_name', 'plot_keywords', 'movie_imdb_link',
       'language', 'country', 'content_rating'],
      dtype='object')

In [36]:
# For every non-numeric column, record which Python types its values have
# and how many of them are missing.
non_numeric = movies.select_dtypes(exclude="number")
type_check_data = [
    [name, values.apply(type).unique(), values.isna().sum()]
    for name, values in non_numeric.items()
]
pd.DataFrame(type_check_data, columns=["col", "types", "total_nan"]).set_index("col")

Unnamed: 0_level_0,types,total_nan
col,Unnamed: 1_level_1,Unnamed: 2_level_1
color,"[<class 'str'>, <class 'float'>]",19
director_name,"[<class 'str'>, <class 'float'>]",102
actor_2_name,"[<class 'str'>, <class 'float'>]",13
genres,[<class 'str'>],0
actor_1_name,"[<class 'str'>, <class 'float'>]",7
...,...,...
plot_keywords,"[<class 'str'>, <class 'float'>]",152
movie_imdb_link,[<class 'str'>],0
language,"[<class 'str'>, <class 'float'>]",14
country,"[<class 'str'>, <class 'float'>]",5


In [37]:
# skipna=False does not avoid the TypeError on the whole frame, and on the
# numeric columns it makes min() return NaN for every column that contains
# a missing value.
try:
    movies.min(skipna=False)
except TypeError as e:
    print("TypeError", e)
    print("Selecting only numeric columns before aggregation")
    display(movies.select_dtypes("number").min(skipna=False))

TypeError '<=' not supported between instances of 'str' and 'float'
Selecting only numeric columns before aggregation


num_critic_for_reviews     NaN
duration                   NaN
director_facebook_likes    NaN
actor_3_facebook_likes     NaN
actor_1_facebook_likes     NaN
                          ... 
title_year                 NaN
actor_2_facebook_likes     NaN
imdb_score                 1.6
aspect_ratio               NaN
movie_facebook_likes       0.0
Length: 16, dtype: float64

## Chaining DataFrame Methods

### How to do it\...

In [38]:
# Reload the CSV so this recipe starts from the original column names.
movies = pd.read_csv("../data/movie.csv")


def shorten(col):
    """Return ``col`` with "facebook_likes" abbreviated to "fb" and "_for_reviews" removed."""
    abbreviated = col.replace("facebook_likes", "fb")
    return abbreviated.replace("_for_reviews", "")


movies = movies.rename(columns=shorten)
# isnull() returns a DataFrame of booleans: True where a value is missing.
movies.isnull().head()

Unnamed: 0,color,director_name,num_critic,duration,...,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,False,False,False,False,...,False,False,False,False
1,False,False,False,False,...,False,False,False,False
2,False,False,False,False,...,False,False,False,False
3,False,False,False,False,...,False,False,False,False
4,True,False,True,True,...,False,False,True,False


In [39]:
# Booleans sum as 0/1, so this counts the missing values per column.
(movies.isnull().sum().head())

color             19
director_name    102
num_critic        49
duration          15
director_fb      102
dtype: int64

In [40]:
# Missing values per column, then the grand total for the whole DataFrame.
display(movies.isnull().sum())
display(movies.isnull().sum().sum())

color             19
director_name    102
num_critic        49
duration          15
director_fb      102
                ... 
title_year       106
actor_2_fb        13
imdb_score         0
aspect_ratio     326
movie_fb           0
Length: 28, dtype: int64

np.int64(2656)

In [41]:
# Does each column contain a missing value? Then: does any column at all?
display(movies.isnull().any())
display(movies.isnull().any().any())

color             True
director_name     True
num_critic        True
duration          True
director_fb       True
                 ...  
title_year        True
actor_2_fb        True
imdb_score       False
aspect_ratio      True
movie_fb         False
Length: 28, dtype: bool

np.True_

### How it works\...

In [42]:
# Every column of the isnull() result is boolean.
movies.isnull().dtypes.value_counts()

bool    28
Name: count, dtype: int64

### There\'s more\...

In [43]:
# max() fails on object columns that mix strings with NaN (a float).
# NOTE(review): "color" is listed twice in the selection — confirm whether a
# different column was intended.
try:
    movies[["color", "movie_title", "color"]].max()
except TypeError as e:
    print("TypeError", e)
    print("Selecting only numeric columns before aggregation")
    # None of the selected columns is numeric, so this fallback displays an empty Series.
    display(movies[["color", "movie_title", "color"]].select_dtypes("number").max())

TypeError '>=' not supported between instances of 'str' and 'float'
Selecting only numeric columns before aggregation


Series([], dtype: float64)

In [44]:
# option_context applies the display option only inside the with block.
# Replacing NaN with "" leaves only strings, so max() can compare the values.
with pd.option_context("display.max_colwidth", 50):
    display(movies.select_dtypes(["object"]).fillna("").max())

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
                                         ...                        
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

In [45]:
# Same as with "display.max_colwidth": the option is given here by its short
# name "max_colwidth" and the output is identical.
with pd.option_context("max_colwidth", 50):
    display(movies.select_dtypes(["object"]).fillna("").max())

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
                                         ...                        
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

### See also

## DataFrame Operations

In [46]:
# Adding a number to a DataFrame that still contains string columns raises
# a TypeError.
try:
    colleges = pd.read_csv("../data/college.csv")
    colleges + 5
except TypeError as e:
    print("TypeError", e)

TypeError can only concatenate str (not "int") to str


In [47]:
# Use the institution name as the index and keep only the columns whose
# name contains "UGDS_".
colleges = pd.read_csv("../data/college.csv", index_col="INSTNM")
college_ugds = colleges.filter(like="UGDS_")
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,...,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,...,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,...,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,...,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,...,0.0006,0.0098,0.0243,0.0137


In [48]:
# Select one institution's row by its index label.
name = "Northwest-Shoals Community College"
college_ugds.loc[name]

UGDS_WHITE    0.7912
UGDS_BLACK    0.1250
UGDS_HISP     0.0339
UGDS_ASIAN    0.0036
UGDS_AIAN     0.0088
UGDS_NHPI     0.0006
UGDS_2MOR     0.0012
UGDS_NRA      0.0033
UGDS_UNKN     0.0324
Name: Northwest-Shoals Community College, dtype: float64

In [49]:
# Round to two decimals; note that UGDS_BLACK (0.1250) becomes 0.12, not 0.13.
college_ugds.loc[name].round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.12
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [50]:
# Adding 0.0001 before rounding turns UGDS_BLACK (0.1250) into 0.13.
(college_ugds.loc[name] + 0.0001).round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.13
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [51]:
# Arithmetic with a scalar applies to every element; missing values stay NaN.
college_ugds + 0.00501

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03831,0.94031,0.01051,0.00691,...,0.00691,0.00501,0.01091,0.01881
University of Alabama at Birmingham,0.59721,0.26501,0.03331,0.05681,...,0.00571,0.04181,0.02291,0.01501
Amridge University,0.30401,0.42421,0.01191,0.00841,...,0.00501,0.00501,0.00501,0.27651
University of Alabama in Huntsville,0.70381,0.13051,0.04321,0.04261,...,0.00521,0.02221,0.03821,0.04001
Alabama State University,0.02081,0.92581,0.01711,0.00691,...,0.00561,0.01481,0.02931,0.01871
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,...,,,,
Rasmussen College - Overland Park,,,,,...,,,,
National Personal Training Institute of Cleveland,,,,,...,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,...,,,,


In [52]:
# Adding 0.00501 and then floor-dividing by 0.01 gives each share as a
# percentage rounded to the nearest whole number.
(college_ugds + 0.00501) // 0.01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,3.0,94.0,1.0,0.0,...,0.0,0.0,1.0,1.0
University of Alabama at Birmingham,59.0,26.0,3.0,5.0,...,0.0,4.0,2.0,1.0
Amridge University,30.0,42.0,1.0,0.0,...,0.0,0.0,0.0,27.0
University of Alabama in Huntsville,70.0,13.0,4.0,4.0,...,0.0,2.0,3.0,4.0
Alabama State University,2.0,92.0,1.0,0.0,...,0.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,...,,,,
Rasmussen College - Overland Park,,,,,...,,,,
National Personal Training Institute of Cleveland,,,,,...,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,...,,,,


In [53]:
# Divide by 100 to turn the whole percentages back into fractions.
college_ugds_op_round = (college_ugds + 0.00501) // 0.01 / 100
college_ugds_op_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,...,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,...,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,...,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,...,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,...,0.0,0.01,0.02,0.01


In [54]:
# Alternative using round(). The 0.00001 offset is added before rounding,
# presumably so that values ending in exactly 5 round up — TODO confirm.
college_ugds_round = (college_ugds + 0.00001).round(2)
college_ugds_round

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.00,...,0.0,0.00,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,...,0.0,0.04,0.02,0.01
Amridge University,0.30,0.42,0.01,0.00,...,0.0,0.00,0.00,0.27
University of Alabama in Huntsville,0.70,0.13,0.04,0.04,...,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.00,...,0.0,0.01,0.02,0.01
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,...,,,,
Rasmussen College - Overland Park,,,,,...,,,,
National Personal Training Institute of Cleveland,,,,,...,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,...,,,,


In [55]:
# equals() confirms that both approaches give the same DataFrame.
college_ugds_op_round.equals(college_ugds_round)

True

### How it works\...

In [56]:
# Floating-point arithmetic is inexact: this sum comes out slightly below 0.05.
0.045 + 0.005

0.049999999999999996

### There\'s more\...

In [57]:
# Method equivalents of +, // and /, chained; the result matches the operator version.
college2 = college_ugds.add(0.00501).floordiv(0.01).div(100)
college2.equals(college_ugds_op_round)

True

### See also

## Comparing Missing Values

In [58]:
# NaN does not compare equal to itself.
np.nan == np.nan

False

In [59]:
# Unlike NaN, None compares equal to itself.
print(None == None)  # Ruff E711 flags this: comparison to `None` should be `cond is None`
print(None is None)
print(None.__eq__(None))

True
True
True


In [60]:
# An ordering comparison with NaN evaluates to False ...
np.nan > 5

False

In [61]:
# ... whichever side NaN is on.
5 > np.nan

False

In [62]:
# Only "not equal" evaluates to True with NaN.
np.nan != 5

True

### Getting ready

In [63]:
# Institution names as the index; keep only the columns whose name contains "UGDS_".
college = pd.read_csv("../data/college.csv", index_col="INSTNM")
college_ugds = college.filter(like="UGDS_")

In [64]:
# Element-wise comparison with a scalar gives a DataFrame of booleans.
college_ugds == 0.0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,...,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,...,False,False,False,False
Amridge University,False,False,False,False,...,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,...,False,False,False,False
Alabama State University,False,False,False,True,...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,...,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,...,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,...,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,...,False,False,False,False


In [65]:
# Compare the DataFrame with itself, element by element.
college_self_compare = college_ugds == college_ugds
college_self_compare.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,...,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,...,True,True,True,True
Amridge University,True,True,True,True,...,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,...,True,True,True,True
Alabama State University,True,True,True,True,...,True,True,True,True


In [66]:
# No column is entirely True: missing values do not compare equal, even to themselves.
college_self_compare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [67]:
# == np.nan never matches, so this wrongly reports zero missing values.
(college_ugds == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [68]:
# isna() is the reliable way to find and count missing values.
college_ugds.isna().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [69]:
# equals() treats NaN values in the same position as equal.
college_ugds.equals(college_ugds)

True

### How it works\...

### There\'s more\...

In [70]:
# eq() is the method form of the == operator.
college_ugds.eq(0.0019)  # same as college_ugds == .0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,...,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,...,False,False,False,False
Amridge University,False,False,False,False,...,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,...,False,False,False,False
Alabama State University,False,False,False,True,...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,...,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,...,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,...,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,...,False,False,False,False


In [71]:
from pandas.testing import assert_frame_equal

# assert_frame_equal raises AssertionError when the frames differ and returns
# None when they match, so this expression evaluates to True on success.
assert_frame_equal(college_ugds, college_ugds) is None

True

## Transposing the direction of a DataFrame operation

### How to do it\...

In [72]:
# Load the college table keyed by institution name, then keep only the
# columns whose label contains "UGDS_" and preview the first five rows.
college = pd.read_csv(
    "../data/college.csv",
    index_col="INSTNM",
)
college_ugds = college.filter(like="UGDS_", axis="columns")
college_ugds.head(5)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,...,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,...,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,...,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,...,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,...,0.0006,0.0098,0.0243,0.0137


In [73]:
# Default direction made explicit: count non-missing values down each column.
college_ugds.count(axis="index")

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [74]:
# Count non-missing values across each row (one result per institution) and
# preview the first five.
(
    college_ugds.count(axis="columns")
    .head()
)

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [75]:
# Add up the UGDS_ columns across each row and preview the first five totals.
(
    college_ugds.sum(axis="columns")
    .head()
)

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [76]:
# axis="index" reduces down the rows, giving one median per column.
college_ugds.median(axis="index")

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

### How it works\...

### There\'s more\...

In [77]:
# Running total across the columns of each row, accumulated left to right.
college_ugds_cumsum = college_ugds.cumsum(axis="columns")
college_ugds_cumsum.head(5)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,...,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,...,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,...,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,...,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,...,0.9522,0.962,0.9863,1.0


### See also

## Determining college campus diversity

In [78]:
# Read the diversity index table, keyed by school name, and display it.
pd.read_csv(
    "../data/college_diversity.csv",
    index_col="School",
)

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74
"University of Houston Houston, TX",0.74
"University of Nevada--Las Vegas Las Vegas, NV",0.74
"University of San Francisco San Francisco, CA",0.74
"San Francisco State University San Francisco, CA",0.73
"University of Illinois--Chicago Chicago, IL",0.73
"New Jersey Institute of Technology Newark, NJ",0.72
"Texas Woman's University Denton, TX",0.72


### How to do it\...

In [79]:
# Re-read the data so this recipe starts from the unmodified table:
# institution-indexed colleges and their "UGDS_" columns.
college = pd.read_csv(
    "../data/college.csv",
    index_col="INSTNM",
)
college_ugds = college.filter(like="UGDS_", axis="columns")

In [80]:
# Number of missing UGDS_ values per institution, largest counts first.
(
    college_ugds.isna()
    .sum(axis="columns")
    .sort_values(ascending=False)
    .head()
)

INSTNM
Excel Learning Center-San Antonio South              9
Western State College of Law at Argosy University    9
Albany Law School                                    9
Albany Medical College                               9
A T Still University of Health Sciences              9
dtype: int64

In [81]:
# Discard institutions whose UGDS_ values are all missing (rebinding the
# name), then recount the missing values per column.
college_ugds = college_ugds.dropna(axis="index", how="all")
college_ugds.isna().sum(axis="index")

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [82]:
# Boolean mask: True where a race's share of the population is at least 15%
# (>= 0.15). This is only the mask, not yet a per-school count.
college_ugds.ge(0.15)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,True,False,False,...,False,False,False,False
University of Alabama at Birmingham,True,True,False,False,...,False,False,False,False
Amridge University,True,True,False,False,...,False,False,False,True
University of Alabama in Huntsville,True,False,False,False,...,False,False,False,False
Alabama State University,False,True,False,False,...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,True,False,...,False,False,False,False
Hollywood Institute of Beauty Careers-Casselberry,False,True,True,False,...,False,False,False,False
Coachella Valley Beauty College-Beaumont,True,False,True,False,...,False,False,False,False
Dewey University-Mayaguez,False,False,True,False,...,False,False,False,False


In [83]:
# Diversity metric: per school, how many race columns are at or above 15%.
diversity_metric = (college_ugds >= 0.15).sum(axis="columns")
diversity_metric.head(5)

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [84]:
# Distribution of the metric: number of schools for each metric value.
diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
Name: count, dtype: int64

In [85]:
# Schools with the highest metric values, best first.
(
    diversity_metric
    .sort_values(ascending=False)
    .head()
)

INSTNM
Central Texas Beauty College-Temple                               5
Regency Beauty Institute-Austin                                   5
Westwood College-O'Hare Airport                                   4
Regency Beauty Institute-Pasadena                                 4
Soma Institute-The National School of Clinical Massage Therapy    4
dtype: int64

In [86]:
# Look up the full race breakdown for the two schools that scored 5.
college_ugds.loc[
    [
        "Regency Beauty Institute-Austin",
        "Central Texas Beauty College-Temple",
    ],
    :,
]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,...,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,...,0.0,0.1717,0.0,0.1515


In [87]:
# First five schools from the diversity index table, spelled the way they
# appear in the college index: a single hyphen instead of "--" and no
# trailing city/state.
us_news_top = [
    "Rutgers University-Newark",
    "Andrews University",
    "Stanford University",
    "University of Houston",
    "University of Nevada-Las Vegas",
]
# Label-based lookup of the metric for just those schools.
diversity_metric.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64

### How it works\...

### There\'s more\...

In [88]:
# Which schools are least diverse? More than 3,000 schools share a metric of
# 1, so rank by the largest single race share in each row instead.
(
    college_ugds.max(axis="columns")
    .sort_values(ascending=False)
    .head(10)
)

INSTNM
Caribbean University-Ponce                                        1.0
Brighton Institute of Cosmetology                                 1.0
Mesivta Torah Vodaath Rabbinical Seminary                         1.0
Rabbinical College Telshe                                         1.0
University of Puerto Rico-Mayaguez                                1.0
Haskell Indian Nations University                                 1.0
Lake Career and Technical Center                                  1.0
Leon Studio One School of Hair Design & Career Training Center    1.0
Dewey University-Hato Rey                                         1.0
Columbia Central University-Caguas                                1.0
dtype: float64

In [89]:
# Is there at least one school where every one of the 9 race columns is
# above 1%?
college_ugds.gt(0.01).all(axis="columns").any()

np.True_

### See also