# Slicing and Indexing DataFrames

1) Explicit indexes

2) Slicing and subsetting with .loc and .iloc

3) Working with pivot tables

In [1]:
import pandas as pd
homelessness = pd.read_csv('homelessness.csv')

In [2]:
homelessness.columns

Index(['Unnamed: 0', 'region', 'state', 'individuals', 'family_members',
       'state_pop'],
      dtype='object')

In [3]:
homelessness.index

RangeIndex(start=0, stop=51, step=1)

In [4]:
print(homelessness.head())

   Unnamed: 0              region       state  individuals  family_members  \
0           0  East South Central     Alabama       2570.0           864.0   
1           1             Pacific      Alaska       1434.0           582.0   
2           2            Mountain     Arizona       7259.0          2606.0   
3           3  West South Central    Arkansas       2280.0           432.0   
4           4             Pacific  California     109008.0         20964.0   

   state_pop  
0    4887681  
1     735139  
2    7158024  
3    3009733  
4   39461588  


In [7]:
# You can move a column from the body of the DataFrame to the index. 
# This is called "setting an index,"

homelessness_ind = homelessness.set_index("region")
print(homelessness_ind.head())

                    Unnamed: 0       state  individuals  family_members  \
region                                                                    
East South Central           0     Alabama       2570.0           864.0   
Pacific                      1      Alaska       1434.0           582.0   
Mountain                     2     Arizona       7259.0          2606.0   
West South Central           3    Arkansas       2280.0           432.0   
Pacific                      4  California     109008.0         20964.0   

                    state_pop  
region                         
East South Central    4887681  
Pacific                735139  
Mountain              7158024  
West South Central    3009733  
Pacific              39461588  


In [10]:
homelessness_ind.reset_index().head()

Unnamed: 0.1,region,Unnamed: 0,state,individuals,family_members,state_pop
0,East South Central,0,Alabama,2570.0,864.0,4887681
1,Pacific,1,Alaska,1434.0,582.0,735139
2,Mountain,2,Arizona,7259.0,2606.0,7158024
3,West South Central,3,Arkansas,2280.0,432.0,3009733
4,Pacific,4,California,109008.0,20964.0,39461588


In [31]:
# 6. Dropping an index
# reset_index has a drop argument that allows you to discard an index. 
# Here, setting drop to True entirely removes the dog names.

homelessness_ind.reset_index(drop=True).head()

Unnamed: 0.1,Unnamed: 0,state,individuals,family_members,state_pop
0,0,Alabama,2570.0,864.0,4887681
1,1,Alaska,1434.0,582.0,735139
2,2,Arizona,7259.0,2606.0,7158024
3,3,Arkansas,2280.0,432.0,3009733
4,4,California,109008.0,20964.0,39461588


In [32]:
homelessness_ind.loc[["Pacific", "Mountain"]].head()

Unnamed: 0_level_0,Unnamed: 0,state,individuals,family_members,state_pop
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pacific,1,Alaska,1434.0,582.0,735139
Pacific,4,California,109008.0,20964.0,39461588
Pacific,11,Hawaii,4131.0,2399.0,1420593
Pacific,37,Oregon,11139.0,3337.0,4181886
Pacific,47,Washington,16424.0,5880.0,7523869


In [33]:
homelessness_ind.loc["Pacific"]

Unnamed: 0_level_0,Unnamed: 0,state,individuals,family_members,state_pop
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pacific,1,Alaska,1434.0,582.0,735139
Pacific,4,California,109008.0,20964.0,39461588
Pacific,11,Hawaii,4131.0,2399.0,1420593
Pacific,37,Oregon,11139.0,3337.0,4181886
Pacific,47,Washington,16424.0,5880.0,7523869


In [34]:
homelessness_ind_mult = homelessness.set_index(["region", "state"])
print(homelessness_ind_mult.head())

                               Unnamed: 0  individuals  family_members  \
region             state                                                 
East South Central Alabama              0       2570.0           864.0   
Pacific            Alaska               1       1434.0           582.0   
Mountain           Arizona              2       7259.0          2606.0   
West South Central Arkansas             3       2280.0           432.0   
Pacific            California           4     109008.0         20964.0   

                               state_pop  
region             state                  
East South Central Alabama       4887681  
Pacific            Alaska         735139  
Mountain           Arizona       7158024  
West South Central Arkansas      3009733  
Pacific            California   39461588  


In [35]:
homelessness_ind.loc[[("East South Central", "Alabama"),("Pacific", "Alaska")]]

KeyError: "None of [Index([('East South Central', 'Alabama'), ('Pacific', 'Alaska')], dtype='object', name='region')] are in the [index]"

In [37]:
homelessness_ind.sort_index()

Unnamed: 0_level_0,Unnamed: 0,state,individuals,family_members,state_pop
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East North Central,35,Ohio,6929.0,3320.0,11676341
East North Central,49,Wisconsin,2740.0,2167.0,5807406
East North Central,22,Michigan,5209.0,3142.0,9984072
East North Central,13,Illinois,6752.0,3891.0,12723071
East North Central,14,Indiana,3776.0,1482.0,6695497


In [43]:
# Sorting index values is similar to sorting values in columns, 
# except that you call .sort_index() instead of .sort_values().
homelessness_ind.sort_values(["individuals", "state_pop"], ascending=[True, True])

Unnamed: 0_level_0,Unnamed: 0,state,individuals,family_members,state_pop
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mountain,50,Wyoming,434.0,205.0,577601
West North Central,34,North Dakota,467.0,75.0,758080
South Atlantic,7,Delaware,708.0,374.0,965479
New England,39,Rhode Island,747.0,354.0,1058287
New England,45,Vermont,780.0,511.0,624358
New England,29,New Hampshire,835.0,615.0,1353465
West North Central,41,South Dakota,836.0,323.0,878698
Mountain,26,Montana,983.0,422.0,1060665
South Atlantic,48,West Virginia,1021.0,222.0,1804291
East South Central,24,Mississippi,1024.0,328.0,2981020


In [44]:
# Slicing List
homelessness[2:5]

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
2,2,Mountain,Arizona,7259.0,2606.0,7158024
3,3,West South Central,Arkansas,2280.0,432.0,3009733
4,4,Pacific,California,109008.0,20964.0,39461588


In [45]:
homelessness[:]

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
0,0,East South Central,Alabama,2570.0,864.0,4887681
1,1,Pacific,Alaska,1434.0,582.0,735139
2,2,Mountain,Arizona,7259.0,2606.0,7158024
3,3,West South Central,Arkansas,2280.0,432.0,3009733
4,4,Pacific,California,109008.0,20964.0,39461588
5,5,Mountain,Colorado,7607.0,3250.0,5691287
6,6,New England,Connecticut,2280.0,1696.0,3571520
7,7,South Atlantic,Delaware,708.0,374.0,965479
8,8,South Atlantic,District of Columbia,3770.0,3134.0,701547
9,9,South Atlantic,Florida,21443.0,9587.0,21244317


In [48]:
# Sort the index before you slice
homelessness_sorted = homelessness.set_index(["region", "state"]).sort_index()
print(homelessness_sorted)

                                         Unnamed: 0  individuals  \
region             state                                           
East North Central Illinois                      13       6752.0   
                   Indiana                       14       3776.0   
                   Michigan                      22       5209.0   
                   Ohio                          35       6929.0   
                   Wisconsin                     49       2740.0   
East South Central Alabama                        0       2570.0   
                   Kentucky                      17       2735.0   
                   Mississippi                   24       1024.0   
                   Tennessee                     42       6139.0   
Mid-Atlantic       New Jersey                    30       6048.0   
                   New York                      32      39827.0   
                   Pennsylvania                  38       8163.0   
Mountain           Arizona                      

In [50]:
# Slicing the outer index level
homelessness_sorted.loc["East North Central" : "Mid-Atlantic"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,individuals,family_members,state_pop
region,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East North Central,Illinois,13,6752.0,3891.0,12723071
East North Central,Indiana,14,3776.0,1482.0,6695497
East North Central,Michigan,22,5209.0,3142.0,9984072
East North Central,Ohio,35,6929.0,3320.0,11676341
East North Central,Wisconsin,49,2740.0,2167.0,5807406
East South Central,Alabama,0,2570.0,864.0,4887681
East South Central,Kentucky,17,2735.0,953.0,4461153
East South Central,Mississippi,24,1024.0,328.0,2981020
East South Central,Tennessee,42,6139.0,1744.0,6771631
Mid-Atlantic,New Jersey,30,6048.0,3350.0,8886025


In [51]:
# Slicing the inner index levels badly
# The same technique doesn't work on inner index levels. 

homelessness_sorted.loc["Ohio":"Wisconsin"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,individuals,family_members,state_pop
region,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pacific,Alaska,1,1434.0,582.0,735139
Pacific,California,4,109008.0,20964.0,39461588
Pacific,Hawaii,11,4131.0,2399.0,1420593
Pacific,Oregon,37,11139.0,3337.0,4181886
Pacific,Washington,47,16424.0,5880.0,7523869
South Atlantic,Delaware,7,708.0,374.0,965479
South Atlantic,District of Columbia,8,3770.0,3134.0,701547
South Atlantic,Florida,9,21443.0,9587.0,21244317
South Atlantic,Georgia,10,6943.0,2556.0,10511131
South Atlantic,Maryland,20,4914.0,2230.0,6035802


In [53]:
# Slicing the inner index levels CORRECTLY
homelessness_sorted.loc[("Pacific", "California"):("South Atlantic", "Florida")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,individuals,family_members,state_pop
region,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pacific,California,4,109008.0,20964.0,39461588
Pacific,Hawaii,11,4131.0,2399.0,1420593
Pacific,Oregon,37,11139.0,3337.0,4181886
Pacific,Washington,47,16424.0,5880.0,7523869
South Atlantic,Delaware,7,708.0,374.0,965479
South Atlantic,District of Columbia,8,3770.0,3134.0,701547
South Atlantic,Florida,9,21443.0,9587.0,21244317


In [57]:
# Slice twice
# Note: You can also slice DataFrames, but first, you need to sort the index. 
homelessness_sorted.loc[("Pacific", "Hawaii"): ("South Atlantic", "Florida"), "individuals":"state_pop"]

Unnamed: 0_level_0,Unnamed: 1_level_0,individuals,family_members,state_pop
region,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pacific,Hawaii,4131.0,2399.0,1420593
Pacific,Oregon,11139.0,3337.0,4181886
Pacific,Washington,16424.0,5880.0,7523869
South Atlantic,Delaware,708.0,374.0,965479
South Atlantic,District of Columbia,3770.0,3134.0,701547
South Atlantic,Florida,21443.0,9587.0,21244317


In [65]:
homelessness = homelessness.set_index("individuals").sort_index()
print(homelessness.head())

             Unnamed: 0         state
individuals                          
434.0                50       Wyoming
467.0                34  North Dakota
708.0                 7      Delaware
747.0                39  Rhode Island
780.0                45       Vermont


In [70]:
homelessness.reset_index().head()

Unnamed: 0.1,individuals,Unnamed: 0,state
0,434.0,50,Wyoming
1,467.0,34,North Dakota
2,708.0,7,Delaware
3,747.0,39,Rhode Island
4,780.0,45,Vermont


In [73]:
homelessness.iloc[1:10, 1:3]

Unnamed: 0_level_0,state
individuals,Unnamed: 1_level_1
467.0,North Dakota
708.0,Delaware
747.0,Rhode Island
780.0,Vermont
835.0,New Hampshire
836.0,South Dakota
983.0,Montana
1021.0,West Virginia
1024.0,Mississippi


In [75]:
homelessness.mean(axis="index").head()

Unnamed: 0    25.0
dtype: float64

In [74]:
# Calculating summary stats across columns
homelessness.mean(axis="columns").head()

individuals
434.0    50.0
467.0    34.0
708.0     7.0
747.0    39.0
780.0    45.0
dtype: float64

In [88]:
# .loc & slicing is a powerful combo

all_stocks_5yr = pd.read_csv('all_stocks_5yr.csv')
print(all_stocks_5yr)

              date   open   high    low  close    volume Name
0       2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL
1       2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL
2       2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL
3       2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL
4       2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL
...            ...    ...    ...    ...    ...       ...  ...
619035  2018-02-01  76.84  78.27  76.69  77.82   2982259  ZTS
619036  2018-02-02  77.53  78.12  76.73  76.78   2595187  ZTS
619037  2018-02-05  76.64  76.92  73.18  73.83   2962031  ZTS
619038  2018-02-06  72.74  74.56  72.13  73.27   4924323  ZTS
619039  2018-02-07  72.70  75.00  72.69  73.86   4534912  ZTS

[619040 rows x 7 columns]


In [83]:
# Note: You can also slice DataFrames, but first, you need to sort the index. 
all_stocks_5yr.set_index("date").sort_index().head()

Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
2013-02-08,81.22,81.93,80.94,81.89,296853,SLG
2013-02-08,78.24,79.07,78.125,79.07,4632684,SLB
2013-02-08,236.64,238.6924,235.75,238.16,552207,BLK
2013-02-08,89.04,89.48,88.91,89.16,554948,SJM


In [91]:
# Note: You can also slice DataFrames, but first, you need to sort the index. 

AAL = all_stocks_5yr[all_stocks_5yr["Name"] == "AAL"]
print(AAL)

            date   open   high    low  close    volume Name
0     2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL
1     2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL
2     2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL
3     2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL
4     2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL
...          ...    ...    ...    ...    ...       ...  ...
1254  2018-02-01  54.00  54.64  53.59  53.88   3623078  AAL
1255  2018-02-02  53.49  53.99  52.03  52.10   5109361  AAL
1256  2018-02-05  51.99  52.39  49.75  49.76   6878284  AAL
1257  2018-02-06  49.32  51.50  48.79  51.18   6782480  AAL
1258  2018-02-07  50.91  51.98  50.89  51.40   4845831  AAL

[1259 rows x 7 columns]


In [92]:
# Set & sort index
# Note: You can also slice DataFrames, but first, you need to sort the index. 

AAL = AAL.set_index("date").sort_index()
print(AAL)

             open   high    low  close    volume Name
date                                                 
2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL
2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL
2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL
2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL
2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL
...           ...    ...    ...    ...       ...  ...
2018-02-01  54.00  54.64  53.59  53.88   3623078  AAL
2018-02-02  53.49  53.99  52.03  52.10   5109361  AAL
2018-02-05  51.99  52.39  49.75  49.76   6878284  AAL
2018-02-06  49.32  51.50  48.79  51.18   6782480  AAL
2018-02-07  50.91  51.98  50.89  51.40   4845831  AAL

[1259 rows x 6 columns]


In [94]:
# Note: You can also slice DataFrames, but first, you need to sort the index. 
# Slicing by dates
AAL.loc["2013-02-08":"2013-02-14"]

Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [96]:
# Note: You can also slice DataFrames, but first, you need to sort the index. 
# Slicing by partial dates
AAL.loc["2013":"2015"]

Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-02-08,15.07,15.12,14.63,14.750,8407500,AAL
2013-02-11,14.89,15.01,14.26,14.460,8882000,AAL
2013-02-12,14.45,14.51,14.10,14.270,8126000,AAL
2013-02-13,14.30,14.94,14.25,14.660,10259500,AAL
2013-02-14,14.94,14.96,13.16,13.990,31879900,AAL
...,...,...,...,...,...,...
2014-12-24,50.66,51.69,50.35,51.430,5961395,AAL
2014-12-26,51.46,52.25,51.06,51.955,6841763,AAL
2014-12-29,51.65,53.24,51.30,52.850,8791483,AAL
2014-12-30,53.00,53.70,52.83,53.420,8063140,AAL


In [99]:
# Subsetting by row/column number
AAL.iloc[0:5, 0:4]

Unnamed: 0_level_0,open,high,low,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-02-08,15.07,15.12,14.63,14.75
2013-02-11,14.89,15.01,14.26,14.46
2013-02-12,14.45,14.51,14.1,14.27
2013-02-13,14.3,14.94,14.25,14.66
2013-02-14,14.94,14.96,13.16,13.99


In [102]:
# Pivot Table

AAL = AAL.pivot_table("open", index="date")
print(AAL)

             open
date             
2013-02-08  15.07
2013-02-11  14.89
2013-02-12  14.45
2013-02-13  14.30
2013-02-14  14.94
...           ...
2018-02-01  54.00
2018-02-02  53.49
2018-02-05  51.99
2018-02-06  49.32
2018-02-07  50.91

[1259 rows x 1 columns]


In [103]:
AAL.mean(axis="index")

open    38.390495
dtype: float64

In [104]:
AAL.mean(axis="columns")

date
2013-02-08    15.07
2013-02-11    14.89
2013-02-12    14.45
2013-02-13    14.30
2013-02-14    14.94
              ...  
2018-02-01    54.00
2018-02-02    53.49
2018-02-05    51.99
2018-02-06    49.32
2018-02-07    50.91
Length: 1259, dtype: float64