This is still just Python code, but, as in an Rmd notebook, you can mix code with Markdown and output.

In [1]:
import pandas as pd
import numpy as np

Let's import some data and display it.

You can run cells using the "play button" at the top, the "kernel" menu, or "Shift+Enter".

In [2]:
# Load the police-locals CSV from the FiveThirtyEight data repository on GitHub.
# Any cell whose text is "**" is read as a missing value (NaN).
police_df = pd.read_csv(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/police-locals/police-locals.csv",
    na_values="**",
)

In [3]:
## filter - keep only the rows of a dataframe that satisfy a condition

# Cities whose police force has more than 4000 officers
large_force = police_df.loc[police_df['police_force_size'] > 4000]
large_force  # a bare expression on the last line of a cell is displayed as its output

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
0,New York,32300,0.617957,0.446387,0.764419,0.770891,0.762861,0.749235
1,Chicago,12120,0.875,0.871963,0.8774,0.897406,0.839827,0.966667
2,Los Angeles,10100,0.228218,0.152778,0.263848,0.387387,0.21768,0.305263
3,Washington,9340,0.115632,0.056774,0.157365,0.170189,0.089888,0.230769
4,Houston,7700,0.292208,0.173735,0.399258,0.366379,0.457143,0.408163
5,Philadelphia,6045,0.835401,0.776899,0.89948,0.924658,0.817391,
6,Phoenix,4475,0.311732,0.270802,0.42735,0.521739,0.427711,
7,San Diego,4460,0.362108,0.372984,0.348485,0.538462,0.297794,0.515625


The `large_force` object displayed above is a data frame.

A single one-dimensional labeled array is called a pandas Series. Selecting one row or one column of a data frame gives you a Series.

In [4]:
# Selecting a single column by label returns a Series
large_force.loc[:, 'city']

0        New York
1         Chicago
2     Los Angeles
3      Washington
4         Houston
5    Philadelphia
6         Phoenix
7       San Diego
Name: city, dtype: object

In [5]:
# Cities with fewer than 4000 officers in which fewer than half of the
# black officers live inside city limits (both conditions must hold)
smaller_force = police_df.loc[
    police_df['police_force_size'].lt(4000) & police_df['black'].lt(0.5)
]
smaller_force.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
8,Dallas,3605,0.191401,0.171504,0.21345,0.214634,0.256881,
10,San Francisco,3020,0.316225,0.259494,0.378472,0.186047,0.253333,0.486111
12,Atlanta,2950,0.137288,0.186275,0.111399,0.101983,,
13,Las Vegas,2830,0.374558,0.4,0.307692,0.387755,0.267857,
14,Baltimore,2800,0.257143,0.132812,0.361842,0.391459,,


In [6]:
# Cities where fewer than half of the black officers live inside city limits
# OR more than half of the hispanic officers do (either condition is enough)
or_force = police_df.loc[
    police_df['black'].lt(0.5) | police_df['hispanic'].gt(0.5)
]
or_force.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
0,New York,32300,0.617957,0.446387,0.764419,0.770891,0.762861,0.749235
1,Chicago,12120,0.875,0.871963,0.8774,0.897406,0.839827,0.966667
2,Los Angeles,10100,0.228218,0.152778,0.263848,0.387387,0.21768,0.305263
3,Washington,9340,0.115632,0.056774,0.157365,0.170189,0.089888,0.230769
4,Houston,7700,0.292208,0.173735,0.399258,0.366379,0.457143,0.408163


In [7]:
# Cities where the fraction of all officers living inside city limits is
# approximately 0.5. np.isclose compares within a tolerance instead of
# testing floats for exact equality.
half_force = police_df.loc[np.isclose(police_df['all'].to_numpy(), 0.5)]
half_force

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
33,New Orleans,1560,0.5,0.324074,0.593137,0.623711,,


In [8]:
# Rows whose city name is exactly one of New York, Miami, or Cincinnati
specific_cities = police_df.loc[
    police_df['city'].isin(["New York", "Miami", "Cincinnati"])
]
specific_cities

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
0,New York,32300,0.617957,0.446387,0.764419,0.770891,0.762861,0.749235
26,Miami,1860,0.072581,0.030612,0.087591,0.0,0.116751,
50,Cincinnati,1145,0.227074,0.147727,0.490566,0.648649,,


In [9]:
# Rows where the estimate for Asian officers is missing (NA).
# Series.isna() is the pandas-native missing-value test: unlike np.isnan it
# also works on non-float (e.g. object) columns instead of raising TypeError.
asian_na = police_df[police_df['asian'].isna()]
asian_na.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
5,Philadelphia,6045,0.835401,0.776899,0.89948,0.924658,0.817391,
6,Phoenix,4475,0.311732,0.270802,0.42735,0.521739,0.427711,
8,Dallas,3605,0.191401,0.171504,0.21345,0.214634,0.256881,
9,Detroit,3265,0.370597,0.081967,0.542787,0.568,0.333333,
11,San Antonio,2955,0.624365,0.443878,0.713924,0.574468,0.73913,


In [10]:
## sort your dataframe

# Order the rows by the overall fraction of officers living inside city
# limits, smallest first (ascending is the default sort direction)
all_sort = police_df.sort_values('all')
all_sort.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
26,Miami,1860,0.072581,0.030612,0.087591,0.0,0.116751,
28,"Sacramento, Calif.",1820,0.07967,0.06338,0.1375,0.32,0.0,
32,"Santa Ana, Calif.",1590,0.09434,0.058824,0.120879,,0.148649,0.0
34,"Oakland, Calif.",1530,0.094771,0.026667,0.160256,0.0625,0.108108,0.28125
49,"Rochester, N.Y.",1150,0.1,0.040936,0.271186,0.195122,,


In [11]:
# Same ordering key as above, but largest first
all_rev_sort = police_df.sort_values('all', ascending=False)
all_rev_sort.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
40,"Laredo, Texas",1435,0.937282,0.962963,0.93133,,0.93133,
1,Chicago,12120,0.875,0.871963,0.8774,0.897406,0.839827,0.966667
73,"Corpus Christi, Texas",770,0.857143,0.893333,0.822785,,0.847222,
17,"El Paso, Texas",2260,0.85177,0.826446,0.861027,,0.861027,
5,Philadelphia,6045,0.835401,0.776899,0.89948,0.924658,0.817391,


In [12]:
# Two-key sort: primary key is the overall fraction of officers living inside
# city limits, ties are broken by police force size; both keys ascending.
# NOTE: despite the variable name, the second key is 'police_force_size',
# not 'white'.
all_white_sort = police_df.sort_values(
    ['all', 'police_force_size'],
    ascending=[True, True],
)
all_white_sort.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian
26,Miami,1860,0.072581,0.030612,0.087591,0.0,0.116751,
28,"Sacramento, Calif.",1820,0.07967,0.06338,0.1375,0.32,0.0,
32,"Santa Ana, Calif.",1590,0.09434,0.058824,0.120879,,0.148649,0.0
34,"Oakland, Calif.",1530,0.094771,0.026667,0.160256,0.0625,0.108108,0.28125
57,Minneapolis,1000,0.1,0.052632,0.37931,,,


In [13]:
# Select the last five columns by slicing column positions 3 through 7
# (the stop position, 8, is exclusive)
last_five = police_df[police_df.columns[3:8]]
last_five.head()

Unnamed: 0,white,non-white,black,hispanic,asian
0,0.446387,0.764419,0.770891,0.762861,0.749235
1,0.871963,0.8774,0.897406,0.839827,0.966667
2,0.152778,0.263848,0.387387,0.21768,0.305263
3,0.056774,0.157365,0.170189,0.089888,0.230769
4,0.173735,0.399258,0.366379,0.457143,0.408163


In [14]:
# Select the last five columns by counting from the end: a negative start
# position means "five from the last column", with no stop position needed
last_five = police_df[police_df.columns[-5:]]
last_five.head()

Unnamed: 0,white,non-white,black,hispanic,asian
0,0.446387,0.764419,0.770891,0.762861,0.749235
1,0.871963,0.8774,0.897406,0.839827,0.966667
2,0.152778,0.263848,0.387387,0.21768,0.305263
3,0.056774,0.157365,0.170189,0.089888,0.230769
4,0.173735,0.399258,0.366379,0.457143,0.408163


In [15]:
## mutate() - create new variables (i.e. columns) in your dataframe

# Add a percentage column "all_perc": the 'all' fraction scaled by 100.
# Arithmetic on a Series is element-wise, so no conversion to a raw array
# is needed.
police_df['all_perc'] = police_df['all'] * 100
police_df.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian,all_perc
0,New York,32300,0.617957,0.446387,0.764419,0.770891,0.762861,0.749235,61.795666
1,Chicago,12120,0.875,0.871963,0.8774,0.897406,0.839827,0.966667,87.5
2,Los Angeles,10100,0.228218,0.152778,0.263848,0.387387,0.21768,0.305263,22.821782
3,Washington,9340,0.115632,0.056774,0.157365,0.170189,0.089888,0.230769,11.563169
4,Houston,7700,0.292208,0.173735,0.399258,0.366379,0.457143,0.408163,29.220779


In [16]:
# Rank police forces by size: sort largest-first, then number the rows
# 1..N in that order (rank 1 = largest force)
rank_df = police_df.sort_values('police_force_size', ascending=False)
rank_df['rank_force'] = range(1, len(rank_df) + 1)
rank_df.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian,all_perc,rank_force
0,New York,32300,0.617957,0.446387,0.764419,0.770891,0.762861,0.749235,61.795666,1
1,Chicago,12120,0.875,0.871963,0.8774,0.897406,0.839827,0.966667,87.5,2
2,Los Angeles,10100,0.228218,0.152778,0.263848,0.387387,0.21768,0.305263,22.821782,3
3,Washington,9340,0.115632,0.056774,0.157365,0.170189,0.089888,0.230769,11.563169,4
4,Houston,7700,0.292208,0.173735,0.399258,0.366379,0.457143,0.408163,29.220779,5


In [17]:
# Boolean flag column: True when at least half of the police force lives
# inside city limits (rows with a missing 'all' value compare as False)
police_df['half_in'] = police_df['all'].ge(0.5)
police_df.head()

Unnamed: 0,city,police_force_size,all,white,non-white,black,hispanic,asian,all_perc,half_in
0,New York,32300,0.617957,0.446387,0.764419,0.770891,0.762861,0.749235,61.795666,True
1,Chicago,12120,0.875,0.871963,0.8774,0.897406,0.839827,0.966667,87.5,True
2,Los Angeles,10100,0.228218,0.152778,0.263848,0.387387,0.21768,0.305263,22.821782,False
3,Washington,9340,0.115632,0.056774,0.157365,0.170189,0.089888,0.230769,11.563169,False
4,Houston,7700,0.292208,0.173735,0.399258,0.366379,0.457143,0.408163,29.220779,False


In [18]:
# groupby()
# Split the rows by whether half or more of the police officers live in city
# limits, then show the group key and the first rows of each group.
# Group by the bare column name rather than a one-element list: with a list
# key, pandas 2.x yields 1-tuples such as (False,) as the group key, whereas
# a scalar key keeps it a plain bool (False / True).
for bool_indicator, sub_df in police_df.groupby('half_in'):
    print(bool_indicator)
    print(sub_df.head())

False
          city  police_force_size       all     white  non-white     black  \
2  Los Angeles              10100  0.228218  0.152778   0.263848  0.387387   
3   Washington               9340  0.115632  0.056774   0.157365  0.170189   
4      Houston               7700  0.292208  0.173735   0.399258  0.366379   
6      Phoenix               4475  0.311732  0.270802   0.427350  0.521739   
7    San Diego               4460  0.362108  0.372984   0.348485  0.538462   

   hispanic     asian   all_perc  half_in  
2  0.217680  0.305263  22.821782    False  
3  0.089888  0.230769  11.563169    False  
4  0.457143  0.408163  29.220779    False  
6  0.427711       NaN  31.173184    False  
7  0.297794  0.515625  36.210762    False  
True
                  city  police_force_size       all     white  non-white  \
0             New York              32300  0.617957  0.446387   0.764419   
1              Chicago              12120  0.875000  0.871963   0.877400   
5         Philadelphia      

In [19]:
# Mean fraction of police officers living inside city limits, taken over
# every city in the data set. The column is selected by label because the
# attribute `police_df.all` is the DataFrame.all method, not the column.
police_df.loc[:, 'all'].mean()

0.39863239689333346