# Part 4 - filtering

In [39]:
import pandas as pd

In [40]:
survey_df = pd.read_csv('data/survey_results_public.csv')
schema_df = pd.read_csv('data/survey_results_schema.csv')

In [41]:
people_df = pd.DataFrame({
    'first': 'Corey Jane John'.split(),
    'last': 'Schafer Doe Doe'.split(),
    'email': 'CoreyMSchafer@gmail.com JaneDoe@gmail.com JohnDoe@gmail.com'.split()
})
people_df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [42]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

## create filters

### bracket

In [43]:
people_df['last'] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

### .loc

In [44]:
people_df.loc[:, 'last'] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

### .iloc

In [45]:
people_df.iloc[:, 1] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

## assign filters

In [46]:
filt = (people_df['last'] == 'Doe')

In [47]:
people_df[filt]

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [48]:
people_df.loc[filt, 'email']

1    JaneDoe@gmail.com
2    JohnDoe@gmail.com
Name: email, dtype: object

In [49]:
filt = (people_df['last'] == 'Doe') & (people_df['first'] == 'John')

In [50]:
people_df.loc[filt, 'email']

2    JohnDoe@gmail.com
Name: email, dtype: object

In [51]:
filt = (people_df['last'] == 'Schafer') | (people_df['first'] == 'John')

In [52]:
people_df.loc[filt, 'email']

0    CoreyMSchafer@gmail.com
2          JohnDoe@gmail.com
Name: email, dtype: object

In [53]:
people_df.loc[~filt, 'email']

1    JaneDoe@gmail.com
Name: email, dtype: object

# real word example

In [54]:
survey_df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
       'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
  

In [55]:
high_salary = (survey_df['ConvertedComp'] > 70000)

In [59]:
survey_df.loc[high_salary, ['Country', 'LanguageWorkedWith', 'ConvertedComp']].head()

Unnamed: 0,Country,LanguageWorkedWith,ConvertedComp
7,United States,Python;SQL,116000.0
15,United Kingdom,Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...,108576.0
16,United States,C#;HTML/CSS;JavaScript;Python;SQL;VBA,79000.0
17,United States,Bash/Shell/PowerShell;HTML/CSS;Perl,1260000.0
18,United States,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;S...,83400.0


In [74]:
countries = 'Italy Germany Canada'.split() + ['United Kingdom', 'United Stated']
display(countries)
filt_countries = survey_df['Country'].isin(countries)
filt_countries.head()

['Italy', 'Germany', 'Canada', 'United Kingdom', 'United Stated']

0     True
1     True
2    False
3    False
4    False
Name: Country, dtype: bool

In [79]:
survey_df.loc[filt_countries, 'Country']

0               Germany
1        United Kingdom
5               Germany
9        United Kingdom
10       United Kingdom
              ...      
64380           Germany
64402           Germany
64422    United Kingdom
64432    United Kingdom
64441            Canada
Name: Country, Length: 11094, dtype: object

In [81]:
filter_python = (survey_df['LanguageWorkedWith'].str.contains('Python', na=False))

In [82]:
survey_df.loc[filter_python, 'LanguageWorkedWith']

2                                 Objective-C;Python;Swift
7                                               Python;SQL
9                      HTML/CSS;Java;JavaScript;Python;SQL
12                                     C;JavaScript;Python
14        Bash/Shell/PowerShell;C;HTML/CSS;Java;Python;SQL
                               ...                        
64433    Bash/Shell/PowerShell;HTML/CSS;JavaScript;Perl...
64438       C++;HTML/CSS;JavaScript;Python;Ruby;TypeScript
64443              C++;HTML/CSS;Java;JavaScript;Python;SQL
64446    Bash/Shell/PowerShell;C;C#;C++;HTML/CSS;Java;J...
64457    Assembly;Bash/Shell/PowerShell;C;C#;C++;Dart;G...
Name: LanguageWorkedWith, Length: 25287, dtype: object

# the str[0] accessor is the only vectorized way to get nth item from a **LIST**

In [68]:
survey_df['LanguageWorkedWith'].str.split(';')

0                               [C#, HTML/CSS, JavaScript]
1                                      [JavaScript, Swift]
2                             [Objective-C, Python, Swift]
3                                                      NaN
4                                    [HTML/CSS, Ruby, SQL]
                               ...                        
64456                                                  NaN
64457    [Assembly, Bash/Shell/PowerShell, C, C#, C++, ...
64458                                                  NaN
64459                                           [HTML/CSS]
64460                [C#, HTML/CSS, Java, JavaScript, SQL]
Name: LanguageWorkedWith, Length: 64461, dtype: object

In [None]:
survey_df['LanguageWorkedWith'].str.split(';').str[0]

0                 C#
1         JavaScript
2        Objective-C
3                NaN
4           HTML/CSS
            ...     
64456            NaN
64457       Assembly
64458            NaN
64459       HTML/CSS
64460             C#
Name: LanguageWorkedWith, Length: 64461, dtype: object