# Table of contents

- <a href="#1">1. Selecting multiple dataframe columns<a>
- <a href="#2">2. Selecting columns with methods<a>
- <a href="#3">3. Ordering column names<a>
- <a href="#4">4. Summarizing a dataframe<a>
- <a href="#5">5. Chaining dataframe methods<a>
- <a href="#6">6. Dataframe operations<a>
- <a href="#7">7. Comparing missing values<a>
- <a href="#8">8. Transposing the direction of a dataframe<a>
- <a href="#9">9. Determining college campus diversity<a>

In [1]:
import pandas as pd
import numpy as np

### 1. Selecting multiple dataframe columns
<a id=""></a>

In [2]:
movies = pd.read_csv('data/movie.csv')

In [3]:
movie_actor_director = movies[
    [
        'actor_1_name',
        'actor_2_name',
        'actor_3_name',
        'director_name',
    ]
]

In [4]:
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [5]:
#return dataframe
type(movies[['director_name']])

pandas.core.frame.DataFrame

In [6]:
#return series
type(movies['director_name'])

pandas.core.series.Series

In [7]:
movies.loc[: ,[
    'actor_1_name',
        'actor_2_name',
        'actor_3_name',
        'director_name',
]].head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [8]:
#return dataframe
type(movies.loc[:, ['director_name']])

pandas.core.frame.DataFrame

In [9]:
#return series
type(movies.loc[:, 'director_name'])

pandas.core.series.Series

In [10]:
cols = [
    'actor_1_name',
        'actor_2_name',
        'actor_3_name',
        'director_name',
]

In [11]:
movies[cols]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker
...,...,...,...,...
4911,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith
4912,Natalie Zea,Valorie Curry,Sam Underwood,
4913,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds
4914,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia


### 2. Selecting columns with methods
<a id=""></a>

In [12]:
def shorten(col):
    return (
        str(col)
        .replace("facebook_likes", "fb")
        .replace("_for_reviews", "")
    )

In [13]:
movies.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [14]:
movies = movies.rename(columns=shorten)

In [15]:
#after
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [16]:
#select integer columns only
movies.select_dtypes(include="int").head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [17]:
#select all number only
movies.select_dtypes(include="number").head()

Unnamed: 0,num_critic,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [18]:
#exclude float
movies.select_dtypes(exclude="float").head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [19]:
#passing array
movies.select_dtypes(include=['int', 'object']).head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [20]:
movies.filter(like="fb").head()

Unnamed: 0,director_fb,actor_3_fb,actor_1_fb,cast_total_fb,actor_2_fb,movie_fb
0,0.0,855.0,1000.0,4834,936.0,33000
1,563.0,1000.0,40000.0,48350,5000.0,0
2,0.0,161.0,11000.0,11700,393.0,85000
3,22000.0,23000.0,27000.0,106759,23000.0,164000
4,131.0,,131.0,143,12.0,0


In [21]:
movies.filter(items=cols).head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [22]:
movies.filter(regex=r"\d").head()

Unnamed: 0,actor_3_fb,actor_2_name,actor_1_fb,actor_1_name,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0


### 3. Ordering column names
<a id="3"></a>

In [23]:
#bikin urutannya pake array
cat_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres'
]

cat_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]

cat_other = [
    'color',
    'country',
    'language',
    'plot_keywords'
]

cont_fb = [
    'director_fb',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'cast_total_fb',
    'movie_fb',    
]
#redundan ya, dari bukunya begitu sih.

cont_finance = ['budget', 'gross']

cont_num_reviews = [
    'num_voted_users',
    'num_user',
    'num_critic',
]

cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster'
]

In [24]:
#disatuin
new_col_order = (
    cat_core
    + cat_people
    + cat_other
    + cont_fb
    + cont_finance
    + cont_num_reviews
    + cont_other
)

In [25]:
set(movies.columns) == set(new_col_order)

False

In [26]:
#eksekusi
movies[new_col_order].columns

Index(['movie_title', 'title_year', 'content_rating', 'genres',
       'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
       'color', 'country', 'language', 'plot_keywords', 'director_fb',
       'actor_1_name', 'actor_2_name', 'actor_3_name', 'cast_total_fb',
       'movie_fb', 'budget', 'gross', 'num_voted_users', 'num_user',
       'num_critic', 'imdb_score', 'duration', 'aspect_ratio',
       'facenumber_in_poster'],
      dtype='object')

### 4. Summarizing a dataframe
<a id="4"></a>

In [27]:
movies.shape

(4916, 28)

In [28]:
movies.size

137648

In [29]:
#Return an int representing the number of axes / array dimensions.
movies.ndim

2

In [30]:
movies.count() #hitung non missing value

color                   4897
director_name           4814
num_critic              4867
duration                4901
director_fb             4814
actor_3_fb              4893
actor_2_name            4903
actor_1_fb              4909
gross                   4054
genres                  4916
actor_1_name            4909
movie_title             4916
num_voted_users         4916
cast_total_fb           4916
actor_3_name            4893
facenumber_in_poster    4903
plot_keywords           4764
movie_imdb_link         4916
num_user                4895
language                4904
country                 4911
content_rating          4616
budget                  4432
title_year              4810
actor_2_fb              4903
imdb_score              4916
aspect_ratio            4590
movie_fb                4916
dtype: int64

In [31]:
print(movies.select_dtypes(include="number").min())
print(movies.select_dtypes(include="number").max())
print(movies.select_dtypes(include="number").mean())
print(movies.select_dtypes(include="number").median())
print(movies.select_dtypes(include="number").std())

num_critic                 1.00
duration                   7.00
director_fb                0.00
actor_3_fb                 0.00
actor_1_fb                 0.00
gross                    162.00
num_voted_users            5.00
cast_total_fb              0.00
facenumber_in_poster       0.00
num_user                   1.00
budget                   218.00
title_year              1916.00
actor_2_fb                 0.00
imdb_score                 1.60
aspect_ratio               1.18
movie_fb                   0.00
dtype: float64
num_critic              8.130000e+02
duration                5.110000e+02
director_fb             2.300000e+04
actor_3_fb              2.300000e+04
actor_1_fb              6.400000e+05
gross                   7.605058e+08
num_voted_users         1.689764e+06
cast_total_fb           6.567300e+05
facenumber_in_poster    4.300000e+01
num_user                5.060000e+03
budget                  4.200000e+09
title_year              2.016000e+03
actor_2_fb              1.370

In [32]:
# movies.describe()
movies.describe().T #transpose

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic,4867.0,137.9889,120.2394,1.0,49.0,108.0,191.0,813.0
duration,4901.0,107.0908,25.28602,7.0,93.0,103.0,118.0,511.0
director_fb,4814.0,691.0145,2832.954,0.0,7.0,48.0,189.75,23000.0
actor_3_fb,4893.0,631.2763,1625.875,0.0,132.0,366.0,633.0,23000.0
actor_1_fb,4909.0,6494.488,15106.99,0.0,607.0,982.0,11000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,5019656.25,25043962.0,61108412.75,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,8361.75,33132.5,93772.75,1689764.0
cast_total_fb,4916.0,9579.816,18164.32,0.0,1394.75,3049.0,13616.75,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user,4895.0,267.6688,372.9348,1.0,64.0,153.0,320.5,5060.0


In [33]:
movies.describe(percentiles=[0.01, 0.3, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,30%,50%,99%,max
num_critic,4867.0,137.9889,120.2394,1.0,2.0,60.0,108.0,546.68,813.0
duration,4901.0,107.0908,25.28602,7.0,43.0,95.0,103.0,189.0,511.0
director_fb,4814.0,691.0145,2832.954,0.0,0.0,11.0,48.0,16000.0,23000.0
actor_3_fb,4893.0,631.2763,1625.875,0.0,0.0,176.0,366.0,11000.0,23000.0
actor_1_fb,4909.0,6494.488,15106.99,0.0,6.08,694.0,982.0,44920.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,8474.8,7914068.6,25043962.0,326412800.0,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,53.0,11864.5,33132.5,681584.6,1689764.0
cast_total_fb,4916.0,9579.816,18164.32,0.0,6.0,1684.5,3049.0,62413.9,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,0.0,1.0,8.0,43.0
num_user,4895.0,267.6688,372.9348,1.0,1.94,80.0,153.0,1999.24,5060.0


### 5. Chaining dataframe methods
<a id="5"></a>

In [34]:
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [36]:
movies.isnull().sum().head()

color             19
director_name    102
num_critic        49
duration          15
director_fb      102
dtype: int64

In [37]:
movies.isnull().sum().sum()

2654

In [39]:
#1 any spesifik perkolom
# 2 any check apakah ada isnull per dataframe
movies.isnull().any().any()

True

### 6. Dataframe operations
<a id="6"></a>

In [40]:
colleges = pd.read_csv('data/college.csv')

In [42]:
colleges.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [52]:
for col in colleges.select_dtypes(include="object").columns:
    print(colleges[col].unique())

['Alabama A & M University' 'University of Alabama at Birmingham'
 'Amridge University' ...
 'National Personal Training Institute of Cleveland'
 'Bay Area Medical Academy - San Jose Satellite Location'
 'Excel Learning Center-San Antonio South']
['Normal' 'Birmingham' 'Montgomery' ... 'Palm Beach Gardens'
 'Coral Springs' 'Willingboro']
['AL' 'IL' 'AK' 'AZ' 'NM' 'AR' 'CA' 'MN' 'CO' 'CT' 'NY' 'DE' 'DC' 'VA'
 'FL' 'GA' 'HI' 'ID' 'IN' 'TN' 'MI' 'IA' 'KS' 'MO' 'KY' 'LA' 'ME' 'MD'
 'MA' 'MS' 'MT' 'NE' 'NV' 'NH' 'NJ' 'NC' 'ND' 'OH' 'WV' 'OK' 'OR' 'PA'
 'RI' 'SC' 'SD' 'TX' 'UT' 'VT' 'WA' 'WI' 'WY' 'AS' 'GU' 'MP' 'PR' 'FM'
 'PW' 'VI' 'MH']
['30300' '39700' '40100' '45500' '26600' '41900' '27500' '39000' '35000'
 '45700' '44200' '26100' '19900' '28800' '24600' '28700' '37200' '25700'
 nan '24200' '42300' '36500' 'PrivacySuppressed' '22300' '34600' '25400'
 '29400' '27400' '29300' '23800' '33900' '25200' '44500' '34900' '36100'
 '28300' '22400' '20100' '21800' '45800' '24700' '26500' '37300' '4

In [None]:
colleges + 5 #error

In [55]:
#colleges = pd.read_csv('data/college.csv', index_col='INSTNM')
colleges = colleges.set_index('INSTNM')

In [56]:
college_ugds = colleges.filter(like="UGDS_")
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [61]:
name = "Northwest-Shoals Community College"
college_ugds.loc[name].round(2) #buletin angka di belakang koma

UGDS_WHITE    0.79
UGDS_BLACK    0.12
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [62]:
(college_ugds.loc[name] + 0.001).round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.13
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [63]:
college_ugds_op_round = (
    (college_ugds + 0.00501) // 0.01 / 100
)

college_ugds_op_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


In [65]:
college_ugds_round = ( college_ugds + 0.00001).round(2)
college_ugds_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


In [66]:
college_ugds_op_round.equals(college_ugds_round)

True

### 7. Comparing missing values
<a id="7"></a>

In [67]:
np.nan == np.nan

False

In [68]:
None == None

True

In [69]:
np.nan > 5

False

In [70]:
5 > np.nan

False

In [71]:
np.nan != 5

True

In [72]:
college_ugds == 0.0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,False,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [73]:
#self comparison
college_self_compare = college_ugds == college_ugds
college_self_compare.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,True,True,True,True,True
Alabama State University,True,True,True,True,True,True,True,True,True


In [75]:
#check apakah semua nilainya True
college_self_compare.all()
#hasilnya unexpected karena missing value ga bisa dicompare

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [82]:
# college_ugds.isnull().sum()
college_ugds.isna().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [79]:
(college_ugds == np.nan).sum()
#dicheck pake isnull ada.tapi pas dicompare pake operations hasilnya 0

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [80]:
college_ugds.equals(college_ugds) #correct way to compare

True

In [84]:
college_ugds.eq(college_ugds).all() #wrong, eq sama dengan ==

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

### 8. Transposing the direction of a dataframe
<a id="8"></a>

In [85]:
college_ugds.count() #itung jumlah item per columns

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [87]:
college_ugds.count(axis="columns").head() #itung jumlah item per row

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [88]:
college_ugds.sum(axis='columns').head()

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [89]:
college_ugds.median(axis='index') #persebaran tiap kolom

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

In [90]:
#transpose bisa pake T
college_ugds.T

INSTNM,Alabama A & M University,University of Alabama at Birmingham,Amridge University,University of Alabama in Huntsville,Alabama State University,The University of Alabama,Central Alabama Community College,Athens State University,Auburn University at Montgomery,Auburn University,...,Strayer University-North Dallas,Strayer University-San Antonio,Strayer University-Stafford,WestMed College - Merced,Vantage College,SAE Institute of Technology San Francisco,Rasmussen College - Overland Park,National Personal Training Institute of Cleveland,Bay Area Medical Academy - San Jose Satellite Location,Excel Learning Center-San Antonio South
UGDS_WHITE,0.0333,0.5922,0.299,0.6988,0.0158,0.7825,0.7255,0.7823,0.5328,0.8507,...,,,,,,,,,,
UGDS_BLACK,0.9353,0.26,0.4192,0.1255,0.9208,0.1119,0.2613,0.12,0.3376,0.0704,...,,,,,,,,,,
UGDS_HISP,0.0055,0.0283,0.0069,0.0382,0.0121,0.0348,0.0044,0.0191,0.0074,0.0248,...,,,,,,,,,,
UGDS_ASIAN,0.0019,0.0518,0.0034,0.0376,0.0019,0.0106,0.0025,0.0053,0.0221,0.0227,...,,,,,,,,,,
UGDS_AIAN,0.0024,0.0022,0.0,0.0143,0.001,0.0038,0.0044,0.0157,0.0044,0.0074,...,,,,,,,,,,
UGDS_NHPI,0.0019,0.0007,0.0,0.0002,0.0006,0.0009,0.0,0.001,0.0016,0.0,...,,,,,,,,,,
UGDS_2MOR,0.0,0.0368,0.0,0.0172,0.0098,0.0261,0.0,0.0174,0.0297,0.0,...,,,,,,,,,,
UGDS_NRA,0.0059,0.0179,0.0,0.0332,0.0243,0.0268,0.0,0.0057,0.0397,0.01,...,,,,,,,,,,
UGDS_UNKN,0.0138,0.01,0.2715,0.035,0.0137,0.0026,0.0019,0.0334,0.0246,0.014,...,,,,,,,,,,


### 9. Determining college campus diversity
<a id="9"></a>

Many articles are written every year on the different aspects and impacts of diversity on
college campuses. Various organizations have developed metrics attempting to measure
diversity. US News is a leader in providing rankings for many different categories of colleges,
with diversity being one of them.

In [91]:
pd.read_csv('data/college_diversity.csv')

Unnamed: 0,School,Diversity Index
0,"Rutgers University--Newark Newark, NJ",0.76
1,"Andrews University Berrien Springs, MI",0.74
2,"Stanford University Stanford, CA",0.74
3,"University of Houston Houston, TX",0.74
4,"University of Nevada--Las Vegas Las Vegas, NV",0.74
5,"University of San Francisco San Francisco, CA",0.74
6,"San Francisco State University San Francisco, CA",0.73
7,"University of Illinois--Chicago Chicago, IL",0.73
8,"New Jersey Institute of Technology Newark, NJ",0.72
9,"Texas Woman's University Denton, TX",0.72


Our college dataset classifies race into nine different categories. When trying to quantify
something without an obvious definition, such as diversity, it helps to start with something
simple. In this recipe, our diversity metric will equal the count of the number of races having
greater than 15% of the student population.

In [96]:
#ge adalah method untuk lebih dari sama dengan
diversity_metric = college_ugds.dropna(how='all').ge(0.15).sum(axis='columns')
diversity_metric.head()
#Nilai 1 berarti ada 1 kolom ( 1 ras) yang presentasenya di atas 15%

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [97]:
diversity_metric.value_counts() #distribution
#cuma dua sekolah yang punya presentase lima ras di atas 15%

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [98]:
diversity_metric.sort_values(ascending=False).head()

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64

In [99]:
college_ugds.loc[
    [
        'Regency Beauty Institute-Austin',
        'Central Texas Beauty College-Temple'
    ]
]
#ternyata karena kebayakan masuk kategori ras unknown dan ras more

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


In [105]:
#top 10 least diverse college
college_ugds.max(axis='columns').sort_values(ascending=False).head(10)
# kenapa? karena ada satu ras yang presentasenya 100% atau 1

INSTNM
Dewey University-Manati                               1.0
Yeshiva and Kollel Harbotzas Torah                    1.0
Mr Leon's School of Hair Design-Lewiston              1.0
Dewey University-Bayamon                              1.0
Shepherds Theological Seminary                        1.0
Yeshiva Gedolah Kesser Torah                          1.0
Monteclaro Escuela de Hoteleria y Artes Culinarias    1.0
Yeshiva Shaar Hatorah                                 1.0
Bais Medrash Elyon                                    1.0
Yeshiva of Nitra Rabbinical College                   1.0
dtype: float64