In [1]:
import pandas as pd

In [2]:
# Load the employees dataset using pandas' default type inference and preview
# the first three rows. The path is relative to the notebook's working directory.
df = pd.read_csv('pandas/employees.csv')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [3]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
df.dtypes

First Name            object
Gender                object
Start Date            object
Last Login Time       object
Salary                 int64
Bonus %              float64
Senior Management     object
Team                  object
dtype: object

In [4]:
# Convert the loosely-typed columns in a single chain:
#   * astype handles the categorical and boolean casts,
#   * assign re-parses the two date/time columns with pd.to_datetime.
# NOTE(review): astype('bool') turns NaN into True — 'Senior Management' loads
# as object dtype, so verify it has no missing values before trusting the cast.
df = df.astype({'Gender': 'category', 'Senior Management': 'bool'}).assign(**{
    'Start Date': lambda frame: pd.to_datetime(frame['Start Date']),
    'Last Login Time': lambda frame: pd.to_datetime(frame['Last Login Time']),
})

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [5]:
# Re-check the column dtypes after the explicit conversions.
df.dtypes

First Name                   object
Gender                     category
Start Date           datetime64[ns]
Last Login Time      datetime64[ns]
Salary                        int64
Bonus %                     float64
Senior Management              bool
Team                         object
dtype: object

### Alternative to converting with `.astype` afterwards: parse the date columns at load time with `parse_dates`

In [6]:
# Let the CSV reader parse the date/time columns, then apply the remaining
# casts in one astype call.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Gender': 'category', 'Senior Management': 'bool'})

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [7]:
# Column dtypes after loading with parse_dates and casting the remaining columns.
df.dtypes

First Name                   object
Gender                     category
Start Date           datetime64[ns]
Last Login Time      datetime64[ns]
Salary                        int64
Bonus %                     float64
Senior Management              bool
Team                         object
dtype: object

## Filter A `DataFrame` Based On A Condition

### Filter with More than One Condition (AND)

In [8]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [9]:
# Rows must satisfy BOTH masks (element-wise AND): male employees on the
# Marketing team. Only the first 10 matches are shown.
df.loc[df['Gender'].eq('Male') & df['Team'].eq('Marketing')].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2019-12-06 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2019-12-06 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2019-12-06 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2019-12-06 20:13:00,107391,1.26,True,Marketing
101,Aaron,Male,2012-02-17,2019-12-06 10:20:00,61602,11.849,True,Marketing
104,John,Male,1989-12-23,2019-12-06 07:01:00,80740,19.305,False,Marketing
112,Willie,Male,2003-11-27,2019-12-06 06:21:00,64363,4.023,False,Marketing
119,Paul,Male,2008-06-03,2019-12-06 15:05:00,41054,12.299,False,Marketing
150,Sean,Male,1996-05-04,2019-12-06 20:59:00,135490,19.934,False,Marketing


### Filter with More than One Condition (OR)

In [10]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [11]:
# Element-wise OR: keep a row when the employee is flagged as senior management
# or started before 1990-01-01 (either condition is enough).
df.loc[df['Senior Management'] | (df['Start Date'] < '1990-01-01')]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2019-12-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2019-12-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-12-06 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,2019-12-06 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2019-12-06 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2019-12-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2019-12-06 06:30:00,42392,19.675,False,Finance


In [12]:
# Combine AND with OR: (named Robert AND on Client Services) OR started after
# 2016-06-01. The parentheses make the AND group explicit.
df.loc[
    (df['First Name'].eq('Robert') & df['Team'].eq('Client Services'))
    | (df['Start Date'] > '2016-06-01')
]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2019-12-06 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2019-12-06 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2019-12-06 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2019-12-06 00:29:00,140002,19.49,True,Marketing


## The `.isin()` Method

In [13]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [14]:
# The long-hand approach: one equality mask per team, OR-ed together.
# The three masks are built in order for Legal, Sales and Product.
fil1, fil2, fil3 = (df['Team'].eq(team) for team in ('Legal', 'Sales', 'Product'))

df.loc[fil1 | fil2 | fil3].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2019-12-06 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2019-12-06 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2019-12-06 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2019-12-06 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2019-12-06 06:09:00,59414,1.256,False,Product


In [15]:
# isin() builds the membership mask in a single call: True where 'Team' is one
# of the listed values.
fil = df['Team'].isin(['Legal', 'Sales', 'Product'])
df.loc[fil].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2019-12-06 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2019-12-06 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2019-12-06 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2019-12-06 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2019-12-06 06:09:00,59414,1.256,False,Product


## The `.isnull()` and `.notnull()` Methods

In [16]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [17]:
# isnull() yields True where 'Team' is missing; use it as a mask and show the
# first 10 rows without a team.
df[df['Team'].isnull()].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2019-12-06 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2019-12-06 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2019-12-06 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2019-12-06 23:00:00,128771,8.309,False,
109,Christopher,Male,2000-04-22,2019-12-06 10:15:00,37919,11.449,False,
139,,Female,1990-10-03,2019-12-06 01:08:00,132373,10.527,True,
199,Jonathan,Male,2009-07-17,2019-12-06 08:15:00,130581,16.736,True,
258,Michael,Male,2002-01-24,2019-12-06 03:04:00,43586,12.659,False,
290,Jeremy,Male,1988-06-14,2019-12-06 18:20:00,129460,13.657,True,


In [18]:
# notnull() is the complement of isnull(): keep rows whose 'Gender' is present
# and show the first 10.
df[df['Gender'].notnull()].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-12-06 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-12-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-12-06 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2019-12-06 16:20:00,65476,10.012,True,Product
7,,Female,2015-07-20,2019-12-06 10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,2019-12-06 06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,2019-12-06 06:51:00,139852,7.524,True,Business Development


## The `.between()` Method

In [19]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [20]:
# Salaries from 60,000 to 70,000; between() includes both endpoints by default.
df.loc[df['Salary'].between(left=60000, right=70000)].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2019-12-06 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2019-12-06 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2019-12-06 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2019-12-06 01:08:00,66582,11.308,True,Business Development
47,Kathy,Female,2005-06-22,2019-12-06 04:51:00,66820,9.0,True,Client Services
57,Henry,Male,1996-06-26,2019-12-06 01:44:00,64715,15.107,True,Human Resources
59,Irene,Female,1997-05-07,2019-12-06 09:32:00,66851,11.279,False,Engineering
65,Steve,Male,2009-11-11,2019-12-06 23:44:00,61310,12.428,True,Distribution
74,Thomas,Male,1995-06-04,2019-12-06 14:24:00,62096,17.029,False,Marketing


In [21]:
# The same range filter works on a float column: bonus between 2.0 and 5.0.
df.loc[df['Bonus %'].between(left=2.0, right=5.0)].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
20,Lois,,1995-04-22,2019-12-06 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2019-12-06 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2019-12-06 12:13:00,113590,3.055,False,Sales
60,Paula,,2005-11-23,2019-12-06 14:01:00,48866,4.271,False,Distribution
61,Denise,Female,2001-11-06,2019-12-06 12:03:00,106862,3.699,False,Business Development
66,Nancy,Female,2012-12-15,2019-12-06 23:57:00,125250,2.672,True,Business Development
69,Irene,,2015-07-14,2019-12-06 16:31:00,100863,4.382,True,Finance
72,Bobby,Male,2007-05-07,2019-12-06 10:01:00,54043,3.833,False,Product
73,Frances,Female,1999-04-04,2019-12-06 16:19:00,90582,4.709,True,Sales


In [22]:
# Date bounds can be given as strings; they are compared against the
# datetime64 'Start Date' column.
df.loc[df['Start Date'].between(left='1991-01-01', right='1992-01-01')].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2019-12-06 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2019-12-06 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2019-12-06 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2019-12-06 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2019-12-06 02:24:00,124488,14.837,True,Sales
166,,Female,1991-07-09,2019-12-06 18:52:00,42341,7.014,True,Sales
172,Sara,Female,1991-09-23,2019-12-06 18:17:00,97058,9.402,False,Finance
220,,Female,1991-06-17,2019-12-06 12:49:00,71945,5.56,True,Marketing
245,Victor,Male,1991-04-11,2019-12-06 07:44:00,70817,17.138,False,Engineering
277,Brenda,,1991-05-29,2019-12-06 06:32:00,82439,19.062,False,Sales


In [23]:
# Filter on the time of day only (08:30 to 12:00, both inclusive).
# 'Last Login Time' holds a time of day that the parser stamped with the date
# on which the CSV was read (visible in the displayed rows). Comparing the full
# datetimes against time-only strings would only work while both sides are
# parsed on the same calendar day, so compare the .dt.time component instead.
df[
    df['Last Login Time'].dt.time.between(
        pd.Timestamp('08:30AM').time(),
        pd.Timestamp('12:00PM').time(),
    )
].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2019-12-06 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2019-12-06 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2019-12-06 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2019-12-06 09:07:00,119082,16.18,False,Business Development
40,Michael,Male,2008-10-10,2019-12-06 11:25:00,99283,2.665,True,Distribution
45,Roger,Male,1980-04-17,2019-12-06 11:32:00,88010,13.886,True,Sales
54,Sara,Female,2007-08-15,2019-12-06 09:23:00,83677,8.999,False,Engineering
59,Irene,Female,1997-05-07,2019-12-06 09:32:00,66851,11.279,False,Engineering
72,Bobby,Male,2007-05-07,2019-12-06 10:01:00,54043,3.833,False,Product


## The `.duplicated()` Method

In [24]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [25]:
# Display the 'First Name' column (a Series) that the duplicate checks below
# operate on.
df['First Name']

0      Douglas
1       Thomas
2        Maria
3        Jerry
4        Larry
        ...   
995      Henry
996    Phillip
997    Russell
998      Larry
999     Albert
Name: First Name, Length: 1000, dtype: object

In [26]:
# keep=False flags every occurrence of a repeated first name, so the negated
# mask keeps only the names that appear exactly once.
df.loc[~df['First Name'].duplicated(keep=False)].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2019-12-06 01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,2019-12-06 06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,2019-12-06 09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,2019-12-06 03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,2019-12-06 10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,2019-12-06 10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,2019-12-06 22:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,2019-12-06 15:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,2019-12-06 08:48:00,92242,15.407,False,Legal


## The `.drop_duplicates()` Method

In [27]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [28]:
# Number of rows in the freshly loaded frame (baseline for the comparisons below).
len(df)

1000

In [29]:
# With no subset, drop_duplicates() compares rows across all columns; the
# length shows how many rows remain.
len(df.drop_duplicates())

1000

In [30]:
# Compare rows on 'First Name' only; keep=False discards every row whose first
# name occurs more than once, leaving just the unique names.
df.drop_duplicates(subset='First Name', keep=False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2019-12-06 01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,2019-12-06 06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,2019-12-06 09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,2019-12-06 03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,2019-12-06 10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,2019-12-06 10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,2019-12-06 22:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,2019-12-06 15:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,2019-12-06 08:48:00,92242,15.407,False,Legal


In [31]:
# Keep the first row for each (First Name, Team) combination.
# Rebinding `df` replaces the original inplace=True call: `df` ends up with the
# same contents, the cell still produces no output, and the statement stays
# chainable.
df = df.drop_duplicates(subset=['First Name', 'Team'])

In [32]:
# Peek at the first two rows after de-duplicating on (First Name, Team).
df.head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,


In [33]:
# Row count after dropping duplicate (First Name, Team) combinations.
len(df)

769

## The `.unique()` and `.nunique()` Methods

In [34]:
# Reload the dataset so this section starts from a clean, fully typed frame.
# NOTE(review): astype('bool') turns NaN into True — verify that
# 'Senior Management' has no missing values.
df = pd.read_csv(
    'pandas/employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-12-06 11:17:00,130590,11.858,False,Finance


In [35]:
# Distinct values of the categorical 'Gender' column; the missing value (NaN)
# is listed alongside the categories.
df['Gender'].unique()

[Male, Female, NaN]
Categories (2, object): [Male, Female]

In [36]:
# Distinct values of 'Team', in order of first appearance; NaN is included.
df['Team'].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [37]:
# Counting via unique() includes NaN as one of the distinct values.
len(df['Team'].unique())

11

In [38]:
# nunique() ignores missing values by default (dropna=True), hence one fewer
# than the unique()-based count.
df['Team'].nunique()

10

In [39]:
# With dropna=False, NaN is counted as a distinct value, matching
# len(unique()).
df['Team'].nunique(dropna = False)

11