# Pandas for Data Science: Fixing Bad Data

Based on Pandas Practices by Kevin Markham at PyCon Cleveland 2018

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Reading Datasets

   - Dataset 1: Rhode Island Dataset from [Stanford Open Policing Project](https://openpolicing.stanford.edu/)

In [2]:
# Load the Rhode Island traffic-stop records into `ri` and preview the first rows.
ri = pd.read_csv("police.csv")
ri.head(5)

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


### Exercise 10: Find the bad data in the stop_duration column and fix it

___First Way: Removing these rows___

In [3]:
# Tally every distinct stop_duration value; dropna=False keeps NaN in the tally
# so missing entries show up next to the real categories.
stop_duration = ri['stop_duration']
stop_duration.value_counts(dropna=False)

0-15 Min     69543
16-30 Min    13635
NaN           5333
30+ Min       3228
2                1
1                1
Name: stop_duration, dtype: int64

In [4]:
# Show the full rows whose stop_duration is one of the stray string values '2' or '1'.
is_bad_duration = ri['stop_duration'].isin(['2', '1'])
ri[is_bad_duration]

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
55873,2012-01-23,11:17,,M,1960.0,52.0,Black,Other Traffic Violation,Moving violation,False,,Arrest Passenger,True,2,False
76624,2014-04-13,08:34,,F,1965.0,49.0,White,Speeding,Speeding,False,,Citation,False,1,False


In [5]:
# First way: drop the two bad rows by their index labels.
# The call is left commented out, so `ri` is not modified by this cell
# (presumably so the following cells can still demonstrate fixing these rows in place).

#ri.drop(index= [55873,76624],inplace= True)

___Second Way___

In [6]:
# Counter-example: this chained indexing does NOT update `ri`.
# The boolean filter produces a copy, the assignment lands on that copy, and
# pandas emits SettingWithCopyWarning; the value counts in the next cell are unchanged.
# Assign through a single .loc[row_mask, 'stop_duration'] (or .iloc) call instead.

ri[(ri['stop_duration'] == '2') | (ri['stop_duration'] == '1')]['stop_duration'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Re-count after the chained assignment above to check whether `ri` actually changed.
duration_counts = ri['stop_duration'].value_counts(dropna=False)
duration_counts

0-15 Min     69543
16-30 Min    13635
NaN           5333
30+ Min       3228
2                1
1                1
Name: stop_duration, dtype: int64

In [8]:
# Select the bad entries with one .loc call: a boolean row mask plus the column label.
bad_duration_mask = ri['stop_duration'].isin(['2', '1'])
ri.loc[bad_duration_mask, 'stop_duration']
# Assigning through the same indexer is the form the warning recommends (left disabled here):
#ri.loc[bad_duration_mask, 'stop_duration'] = np.nan

55873    2
76624    1
Name: stop_duration, dtype: object

___Third Way: More Pythonic___

In [9]:
# Peek at the two bad cells purely by integer position (rows 55873 and 76624, column 13).
bad_rows = [55873, 76624]
ri.iloc[bad_rows, 13]

55873    2
76624    1
Name: stop_duration, dtype: object

In [10]:
# Replace the stray '1' / '2' entries in stop_duration with NaN via positional indexing.
# .iloc is purely positional, so literal row/column numbers would silently hit the
# wrong cells if rows were dropped or columns reordered; resolve both from the data.
# Re-running this cell is a no-op once the bad values are gone.
bad_row_positions = np.flatnonzero(ri['stop_duration'].isin(['1', '2']))
stop_duration_position = ri.columns.get_loc('stop_duration')
ri.iloc[bad_row_positions, stop_duration_position] = np.nan

In [11]:
# Read the same two cells back by position to confirm what they now hold.
row_positions, column_position = [55873, 76624], 13
ri.iloc[row_positions, column_position]

55873    NaN
76624    NaN
Name: stop_duration, dtype: object