In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Positional and labeled indexing

In [15]:
# Load the county-level results, then promote the 'county' column to the row index
df = pd.read_csv('data/county.csv').set_index('county')
df.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156
Allegheny,PA,614671,56.640219,42.18582,Obama,924351
Armstrong,PA,28322,30.696985,67.901278,Romney,42147
Beaver,PA,80015,46.032619,52.63763,Romney,115157
Bedford,PA,21444,22.057452,76.98657,Romney,32189


In [16]:
# Same cell addressed two ways: by integer position and by row/column label
by_position = df.iloc[4, 4]
by_label = df.loc['Bedford', 'winner']
by_position == by_label

True

# Indexing and column rearrangement

In [17]:
# Columns to keep, listed in the order they should appear in the new frame
column_order = ['winner', 'total', 'voters']
results = df[column_order]
results.head()

Unnamed: 0_level_0,winner,total,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,Romney,41973,61156
Allegheny,Obama,614671,924351
Armstrong,Romney,28322,42147
Beaver,Romney,80015,115157
Bedford,Romney,21444,32189


# Series vs. 1-column DataFrame

In [20]:
# Series: indexing with a single column label (one pair of brackets)
series = df['winner']
type(series)

pandas.core.series.Series

In [21]:
# 1-column DataFrame: indexing with a one-element list of labels (double brackets)
data_frame = df[['winner']]
type(data_frame)

pandas.core.frame.DataFrame

# Slicing rows

In [23]:
# Label-based row slice with the column axis spelled out; both end labels are included
p_counties = df.loc['Perry':'Potter', :]
p_counties.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Perry,PA,18240,29.769737,68.591009,Romney,27245
Philadelphia,PA,653598,85.224251,14.051451,Obama,1099197
Pike,PA,23164,43.904334,54.882576,Romney,41840
Potter,PA,7205,26.259542,72.158223,Romney,10913


In [24]:
# Walk the index backwards (step -1) from 'Potter' down to 'Perry', keeping every column
p_counties_rev = df.loc['Potter':'Perry':-1, :]
p_counties_rev.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Potter,PA,7205,26.259542,72.158223,Romney,10913
Pike,PA,23164,43.904334,54.882576,Romney,41840
Philadelphia,PA,653598,85.224251,14.051451,Obama,1099197
Perry,PA,18240,29.769737,68.591009,Romney,27245


# Slicing columns

In [28]:
# Show a single row as a reminder of the column order before slicing by column label
df.head(1)

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156


In [33]:
# All rows; columns from the first one through 'Obama' (label slices include the end label)
left_columns = df.loc[:, :'Obama']
left_columns.head(1)

Unnamed: 0_level_0,state,total,Obama
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,PA,41973,35.482334


In [34]:
# All rows; columns from 'Obama' through 'winner', both endpoints included
middle_columns = df.loc[:, 'Obama':'winner']
middle_columns.head(1)

Unnamed: 0_level_0,Obama,Romney,winner
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,35.482334,63.112001,Romney


In [35]:
# All rows; columns from 'Romney' through the last column
right_columns = df.loc[:, 'Romney':]
right_columns.head(1)

Unnamed: 0_level_0,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,63.112001,Romney,61156


# Subselecting DataFrames with lists

In [36]:
# Row labels to select, in the order they should appear
rows = ['Philadelphia', 'Centre', 'Fulton']

# Column labels to select, in display order
cols = ['winner', 'Obama', 'Romney']

# Label-based selection of the rows x cols cross-section
three_counties = df.loc[rows, cols]

# Leave the frame as the cell's last expression so the notebook renders it
# as a table, like the other cells, instead of printing plain text
three_counties

              winner      Obama     Romney
county                                    
Philadelphia   Obama  85.224251  14.051451
Centre        Romney  48.948416  48.977486
Fulton        Romney  21.096291  77.748861


# Thresholding data

In [42]:
# Boolean mask: True for counties whose 'voters' count exceeds 50000
high_voters = df['voters'].gt(50000)
high_voters.head()

county
Adams         True
Allegheny     True
Armstrong    False
Beaver        True
Bedford      False
Name: voters, dtype: bool

In [43]:
# Keep only the rows flagged True in the mask
high_voters_df = df.loc[high_voters]
high_voters_df.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156
Allegheny,PA,614671,56.640219,42.18582,Obama,924351
Beaver,PA,80015,46.032619,52.63763,Romney,115157
Berks,PA,163253,48.939376,49.528646,Romney,250356
Blair,PA,47631,32.575424,66.133401,Romney,85328


# Filtering columns using other columns

In [44]:
# Rebuild the voters > 50000 mask that drives the column update below
voter_counts = df.loc[:, 'voters']
high_voters = voter_counts > 50000
high_voters.head()

county
Adams         True
Allegheny     True
Armstrong    False
Beaver        True
Bedford      False
Name: voters, dtype: bool

In [47]:
# Add a 'value' column to df in place, set to 1.0 on every row
df['value'] = 1.0
df.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,value
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,1.0
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,1.0
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,1.0
Beaver,PA,80015,46.032619,52.63763,Romney,115157,1.0
Bedford,PA,21444,22.057452,76.98657,Romney,32189,1.0


In [49]:
# Overwrite 'value' with NaN on the rows where the high_voters mask is True;
# rows where the mask is False keep their existing value
df.loc[high_voters, 'value'] = np.nan
df.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,value
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,1.0
Beaver,PA,80015,46.032619,52.63763,Romney,115157,
Bedford,PA,21444,22.057452,76.98657,Romney,32189,1.0
