# Data Indexing and Selection

In [2]:
# A Series is like a dictionary

import pandas as pd

data = pd.Series([.25, .5, .75, 1.0], index = ['a', 'b', 'c', 'd'])                        
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
data['b']

0.5

In [4]:
# You can use dictionary-like Python expressions to examine a Series

'a' in data

True

In [5]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [7]:
# You can add new data to a Series by assigning a new index value

data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### Series as a 1D array

You can apply all the standard NumPy operations to a Series just as you would to a 1D NumPy array.

In [14]:
# Slicing by index

data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [15]:
# A Series with declared indices can still be sliced by the implicit integer index

data[1:4]

b    0.50
c    0.75
d    1.00
dtype: float64

In [16]:
# Masking
data[(data > .1) & ( data < 1.1)]

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [17]:
data[(data < .3) | (data > 1.1)]

a    0.25
e    1.25
dtype: float64

In [20]:
# Fancy Indexing

data[['a', 'e']] = .2, 1.2
data

a    0.20
b    0.50
c    0.75
d    1.00
e    1.20
dtype: float64

In [21]:
data + .5

a    0.70
b    1.00
c    1.25
d    1.50
e    1.70
dtype: float64

In [22]:
data

a    0.20
b    0.50
c    0.75
d    1.00
e    1.20
dtype: float64

### Indexers: loc and iloc (the older ix indexer has been removed from pandas)

In [23]:
# POINT OF CONFUSION
# If your array has EXPLICIT integer indexes and you access the array by a single index,
#     the value associated with the EXPLICIT index will be returned
#
# If you use ARRAY SLICING, the IMPLICIT (positional) index is used instead!!!

data = pd.Series(['a', 'b', 'c'], index = [1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [24]:
data[1]

'a'

In [25]:
data[1:3]

3    b
5    c
dtype: object

In [26]:
# [object].loc[index] always references the explicitly stated index

data.loc[1]

'a'

In [27]:
data.loc[1:3]

1    a
3    b
dtype: object

In [28]:
# [object].iloc[index] always references the implicit index

data.iloc[1]

'b'

In [29]:
data.iloc[1:3]

3    b
5    c
dtype: object

## Data Selection in a DataFrame

In [31]:
data

1    a
3    b
5    c
dtype: object

In [142]:
# DataFrame as a dictionary: every column is a Series, and all columns share one index.

# Land area per state, labelled by state name.
area = pd.Series(
    [423967, 695662, 141297, 170312, 149995],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)

# Population per state, using the very same state labels as `area`.
pop = pd.Series(
    [38332521, 26448193, 19651127, 19552860, 12882135],
    index=area.index,
)

# The dictionary keys become the column names of the resulting frame.
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [143]:
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [144]:
data.area   # This doesn't work when the column name is not a string or
            # there is already a function attribute with the name

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [145]:
data.pop is data['pop']

False

In [146]:
data.area is data['area']

True

In [147]:
data['PopDensity'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,PopDensity
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [148]:
data.sort_values('PopDensity', ascending=False)

Unnamed: 0,area,pop,PopDensity
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
California,423967,38332521,90.413926
Illinois,149995,12882135,85.883763
Texas,695662,26448193,38.01874


### DataFrame as a 2D array

In [149]:
data.values # This shows the underlying data.

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [150]:
data.T # Shows the Transpose of the DataFrame

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
PopDensity,90.41393,38.01874,139.0767,114.8061,85.88376


In [151]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [152]:
data.T.values[0]

array([423967., 695662., 141297., 170312., 149995.])

In [153]:
# You can access the array like a NumPy array using .iloc and a slice

data.iloc[1:4,:]

Unnamed: 0,area,pop,PopDensity
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [154]:
# And we can do the same thing with the string-based index labels using .loc
# (note that a label slice INCLUDES its end label)

data.loc[:'Texas', 'pop':]

Unnamed: 0,pop,PopDensity
California,38332521,90.413926
Texas,26448193,38.01874


In [155]:
# You can access a DataFrame with a mask

data[data.PopDensity > 100]

Unnamed: 0,area,pop,PopDensity
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [156]:
# And combine a mask with a column selection using .loc
# (.iloc needs the mask as a plain boolean array instead of a Series)

data.loc[data.PopDensity > 100, ['pop', 'area']]

Unnamed: 0,pop,area
New York,19651127,141297
Florida,19552860,170312


In [157]:
# IN .loc or .iloc the ORDER OF THE KEYS AFFECTS THE ORDER OF DISPLAY

print(data.loc[:,['pop', 'area']], '\n')
print(data.loc[:,['area', 'pop']])

                 pop    area
California  38332521  423967
Texas       26448193  695662
New York    19651127  141297
Florida     19552860  170312
Illinois    12882135  149995 

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


In [158]:
data

Unnamed: 0,area,pop,PopDensity
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [159]:
# You can assign new values to individual elements of a DataFrame using the .loc and .iloc indexers
data.loc['California','pop'] = 39368078
data.loc['California','PopDensity']=data.loc['California','pop'] / data.loc['California','area']
data

Unnamed: 0,area,pop,PopDensity
California,423967,39368078,92.856468
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [160]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, California to Illinois
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area        5 non-null      int64  
 1   pop         5 non-null      int64  
 2   PopDensity  5 non-null      float64
dtypes: float64(1), int64(2)
memory usage: 332.0+ bytes


### Indexing Conventions

In [161]:
# Indexing a DataFrame with a single integer looks up a COLUMN label, so it fails
# unless the column labels ARE integers
# THIS IS A NO-NO ---> print(data[1])
# BUT
# Slicing with integers works fine and selects ROWS by position
# This returns a DataFrame containing the selected rows
data[1:3]

Unnamed: 0,area,pop,PopDensity
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [162]:
# Single string names for columns work fine too

data['pop']

California    39368078
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [166]:
# Referencing a SINGLE row name doesn't work
# BUT
# Slicing by row names DOES work

data['California':'Florida']

Unnamed: 0,area,pop,PopDensity
California,423967,39368078,92.856468
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [169]:
# Slicing With Masks

data[data.PopDensity > 100]

Unnamed: 0,area,pop,PopDensity
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [199]:
# Build every sorted view of the frame -- by index and by each column -- and
# print them one after another for comparison.
#
# The sorted copies are collected in a plain dictionary rather than being created
# through exec()/eval() on generated source text: string-built code fails as soon
# as a column label is not a valid Python identifier, and it is harder to read.
data_byindex = data.sort_index()
sorted_frames = {'data': data, 'data_byindex': data_byindex}
for column in data.columns:
    # sort_values returns a new frame; `data` itself is left untouched.
    sorted_frames['data_by' + str(column)] = data.sort_values(column)

# A later cell displays `data_byarea` by name, so the frames are also published
# as notebook-level variables (data_byarea, data_bypop, data_byPopDensity).
globals().update(sorted_frames)

# Dictionaries keep insertion order, so the frames print in the order built above.
for frame_name, frame in sorted_frames.items():
    print(frame_name)
    print(frame, '\n')


data
              area       pop  PopDensity
California  423967  39368078   92.856468
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763 

data_byindex
              area       pop  PopDensity
California  423967  39368078   92.856468
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763
New York    141297  19651127  139.076746
Texas       695662  26448193   38.018740 

data_byarea
              area       pop  PopDensity
New York    141297  19651127  139.076746
Illinois    149995  12882135   85.883763
Florida     170312  19552860  114.806121
California  423967  39368078   92.856468
Texas       695662  26448193   38.018740 

data_bypop
              area       pop  PopDensity
Illinois    149995  12882135   85.883763
Florida     170312  19552860  114.806121
New York    141297  19651127  139.076746
Texas       695662  26448193   38.018740
California

In [186]:
data_byarea

Unnamed: 0,area,pop,PopDensity
New York,141297,19651127,139.076746
Illinois,149995,12882135,85.883763
Florida,170312,19552860,114.806121
California,423967,39368078,92.856468
Texas,695662,26448193,38.01874
