# Part 4 - filtering

In [99]:
import pandas as pd

In [100]:
# Load the two CSV files from the local data/ folder.
# NOTE(review): neither frame is referenced again in the visible part of this notebook -- confirm they are still needed.
survey_df = pd.read_csv('data/survey_results_public.csv')
schema_df = pd.read_csv('data/survey_results_schema.csv')

In [101]:
# Small hand-built frame used to demonstrate the column and row operations below.
people_df = pd.DataFrame({
    'first': ['Corey', 'Jane', 'John'],
    'last': ['Schafer', 'Doe', 'Doe'],
    'email': [
        'CoreyMSchafer@gmail.com',
        'JaneDoe@gmail.com',
        'JohnDoe@gmail.com',
    ],
})
people_df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [102]:
# Raise both display limits to 85 so frames are not truncated when rendered.
pd.options.display.max_columns = 85
pd.options.display.max_rows = 85

# creating columns

In [103]:
# element-wise string concatenation: "<first> <last>" for every row
full_name = people_df['first'] + ' ' + people_df['last']

In [104]:
# store the combined names as a new column, then show the frame
people_df['full_name'] = full_name
people_df

Unnamed: 0,first,last,email,full_name
0,Corey,Schafer,CoreyMSchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@gmail.com,Jane Doe
2,John,Doe,JohnDoe@gmail.com,John Doe


# dropping columns

In [105]:
# Drop the two source columns now that full_name holds their content.
# drop() returns a new frame; rebinding the name instead of passing
# inplace=True keeps the call chainable and follows current pandas guidance.
people_df = people_df.drop(columns=['first', 'last'])
people_df

Unnamed: 0,email,full_name
0,CoreyMSchafer@gmail.com,Corey Schafer
1,JaneDoe@gmail.com,Jane Doe
2,JohnDoe@gmail.com,John Doe


# adding columns

## split columns

In [106]:
# split every full name on the space and keep the first part
split_names = people_df['full_name'].str.split(' ')
first = split_names.str[0]
first

0    Corey
1     Jane
2     John
Name: full_name, dtype: object

In [107]:
# split every full name on the space and keep the second part
split_names = people_df['full_name'].str.split(' ')
last = split_names.str[1]
last

0    Schafer
1        Doe
2        Doe
Name: full_name, dtype: object

## create column of lists

In [108]:
# without expand, every row holds the list of name parts
people_df['full_name'].str.split(pat=' ')

0    [Corey, Schafer]
1         [Jane, Doe]
2         [John, Doe]
Name: full_name, dtype: object

## create columns from column

In [109]:
# expand=True spreads the name parts over separate columns (labelled 0 and 1)
people_df['full_name'].str.split(pat=' ', expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


## add columns from column

In [110]:
# one column per name part, written back as the 'first' and 'last' columns
name_columns = people_df['full_name'].str.split(' ', expand=True)
people_df[['first', 'last']] = name_columns
people_df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@gmail.com,Jane Doe,Jane,Doe
2,JohnDoe@gmail.com,John Doe,John,Doe


# adding rows

## pd.concat  (append is deprecated)

In [111]:
# DEPRECATED in pandas 1.4 and REMOVED in pandas 2.0 -- use pd.concat instead
# people_df.append({'first': 'Tony', 'last': 'Stark'}, ignore_index=True)

In [112]:
# One-row frame for the new person. Each value must be wrapped in a list,
# otherwise building the frame raises an error.
tony_df = pd.DataFrame({'first': ['Tony'], 'last': ['Stark']})

# pd.concat expects ONE iterable of frames.
# ignore_index=True is deliberately left out here, so the appended row keeps
# its own label 0 and the result contains two rows labelled 0.
df = pd.concat([people_df, tony_df])
df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@gmail.com,Jane Doe,Jane,Doe
2,JohnDoe@gmail.com,John Doe,John,Doe
0,,,Tony,Stark


## pd.concat: duplicated index

In [113]:
# label 0 now occurs twice in df, so df.loc[0] selects two rows and returns a DataFrame
display('full df', df)
display('type of row index 0, still a df', type(df.loc[0]))
display(df.loc[0])

'full df'

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@gmail.com,Jane Doe,Jane,Doe
2,JohnDoe@gmail.com,John Doe,John,Doe
0,,,Tony,Stark


'type of row index 0, still a df'

pandas.core.frame.DataFrame

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
0,,,Tony,Stark


In [114]:
# df.loc[0] is a two-row frame; .iloc[0] picks its first row as a Series
row_0 = df.loc[0]
first_subrow = row_0.iloc[0]
display('type of first subrow of index 0', type(first_subrow))
display(first_subrow)

'type of first subrow of index 0'

pandas.core.series.Series

email        CoreyMSchafer@gmail.com
full_name              Corey Schafer
first                          Corey
last                         Schafer
Name: 0, dtype: object

In [115]:
# Without ignore_index, pd.concat left two rows sharing the label 0,
# so each of them has to be pulled out of row_0 by position.
display('first sub row', row_0.iloc[0])
display('second sub row', row_0.iloc[1])

'first sub row'

email        CoreyMSchafer@gmail.com
full_name              Corey Schafer
first                          Corey
last                         Schafer
Name: 0, dtype: object

'second sub row'

email          NaN
full_name      NaN
first         Tony
last         Stark
Name: 0, dtype: object

In [116]:
# boolean mask over the two rows labelled 0: True where the first name is 'Corey'
filt = df.loc[0, 'first'] == 'Corey'
filt

0     True
0    False
Name: first, dtype: bool

In [117]:
# df[filt] raises an indexing error, because:
# df has index labels 0, 1, 2, 0
# while filt has index labels 0, 0
# so the boolean mask cannot be aligned with df
# df[filt]

# one solution:
# apply the mask to df.loc[0], the two-row subset of df
# whose index (0, 0) is exactly the index of filt
display(df.loc[0].index)
display(filt.index)
df.loc[0][filt]

Index([0, 0], dtype='int64')

Index([0, 0], dtype='int64')

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer


## problem with indexes

### empty slices

In [118]:
# the slice starts at the row count, which is greater than its stop label (1),
# so no rows are selected
empty_df = people_df.loc[len(people_df):1]
display('empty index', empty_df.index)


'empty index'

RangeIndex(start=3, stop=2, step=1)

### inverse index logic for series

In [119]:
# selecting one row by label yields a Series whose index holds the df column names
last_row = people_df.loc[len(people_df) - 1]
display('last row', last_row)
display('last row index', last_row.index)

# ... and whose name holds the row's original index label
display('last row name', last_row.name)

# a DataFrame, by contrast, has no name attribute
# display('df name', df.name)


'last row'

email        JohnDoe@gmail.com
full_name             John Doe
first                     John
last                       Doe
Name: 2, dtype: object

'last row index'

Index(['email', 'full_name', 'first', 'last'], dtype='object')

'last row name'

2

### use shape[0] to append a row from a list

In [120]:
new_row = ['mail', 'full', 'first', 'last']
# assigning to an empty slice matches no rows, so this won't add a row
people_df.loc[len(people_df):1] = new_row
# assigning to a label that does not exist yet adds a row with that new index
people_df.loc[len(people_df)] = new_row

## add row from dict

In [121]:
# current state of the frame (includes the row appended from a list)
people_df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@gmail.com,Jane Doe,Jane,Doe
2,JohnDoe@gmail.com,John Doe,John,Doe
3,mail,full,first,last


### use shape[0] to append a row from a dict

In [122]:
# the dict keys name the columns each value belongs to
row_from_dict = {
    'email': 'MAIL',
    'full_name': 'FULL',
    'first': 'FIRST',
    'last': 'LAST',
}
# the row count is used as the label of the new row
people_df.loc[len(people_df)] = row_from_dict
people_df

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@gmail.com,Jane Doe,Jane,Doe
2,JohnDoe@gmail.com,John Doe,John,Doe
3,mail,full,first,last
4,MAIL,FULL,FIRST,LAST


# removing rows

## filter for deletion (drop)


In [123]:
# case-insensitive match on the placeholder e-mail value
email_lower = people_df['email'].str.lower()
filt_mail = email_lower == 'mail'
filt_mail


0    False
1    False
2    False
3     True
4     True
Name: email, dtype: bool

In [125]:
# case-insensitive match on the placeholder last-name value
last_lower = people_df['last'].str.lower()
filt_last = last_lower == 'last'
filt_last


0    False
1    False
2    False
3     True
4     True
Name: last, dtype: bool

In [126]:
# rows to delete: both the email filter and the last-name filter must match
filt_delete = filt_mail & filt_last
filt_delete

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [127]:
# the mask covers every row label of people_df
display(filt_delete.index)
# combining the two masks left the result without a name
display(filt_delete.name)
# labels of the rows where the mask is True
display(people_df[filt_delete].index)

Index([0, 1, 2, 3, 4], dtype='int64')

None

Index([3, 4], dtype='int64')

In [162]:
# labels of the rows flagged by the combined filter
labels_to_drop = people_df.loc[filt_delete].index
# drop() returns a new frame; the result is only displayed here, people_df itself is unchanged
people_df.drop(index=labels_to_drop)

Unnamed: 0,email,full_name,first,last
0,CoreyMSchafer@gmail.com,Corey Schafer,Corey,Schafer
1,JaneDoe@gmail.com,Jane Doe,Jane,Doe
2,JohnDoe@gmail.com,John Doe,John,Doe
