In [2]:
import pandas as pd

In [2]:
df = pd.read_csv("input/employees.csv")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


## Memory optimization

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null object
Start Date           1000 non-null object
Last Login Time      1000 non-null object
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    933 non-null object
Team                 957 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [5]:
df.Gender.nunique()

2

In [6]:
df["Gender"] = df["Gender"].astype("category")

In [9]:
df["Start Date"] = pd.to_datetime(df["Start Date"])

In [10]:
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])

# String can also be converted to datetime when loading the dataset as:

# df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])

In [11]:
df["Senior Management"] = df["Senior Management"].astype("bool")

In [12]:
df.Team.nunique()

10

In [16]:
df.Team.count()

957

In [17]:
df["Team"] = df["Team"].astype("category")

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null category
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


In [19]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2018-06-20 16:47:00,101004,1.389,True,Client Services


In [20]:
(62.6-42.6)/62.6*100
# 31.94% memory optimization

31.948881789137378

### Filter a dataFrame based on a condition

In [22]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance


In [29]:
(df["Gender"] == "Male").head(3)
# return True instead of "Male" and false instead of "Female"

0     True
1     True
2    False
Name: Gender, dtype: bool

In [30]:
df[df["Gender"] == "Male"].head(3)
# returns data of only Males

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance


In [31]:
df[df["Gender"] != "Male"].head(3)
# returns data of only Females

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance
6,Ruby,Female,1987-08-17,2018-06-20 16:20:00,65476,10.012,True,Product
7,,Female,2015-07-20,2018-06-20 10:43:00,45906,11.598,True,Finance


In [34]:
mask = df["Team"] == "Finance"
df[mask].head(3)
# returns data of employee from the finance team only

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2018-06-20 10:43:00,45906,11.598,True,Finance


In [35]:
df["Senior Management"].head(3)

0     True
1     True
2    False
Name: Senior Management, dtype: bool

In [38]:
mask = df["Senior Management"] #contains only boolean values
df[mask].head(3)
# returns data of employee which are in Senior Management

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance


In [42]:
df["Salary"] >= 1000000
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance


In [44]:
mask = df["Bonus %"] < 1.5
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2018-06-20 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2018-06-20 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2018-06-20 07:18:00,72670,1.481,True,Engineering


In [45]:
df["Last Login Time"].head(3)

0   2018-06-20 12:42:00
1   2018-06-20 06:53:00
2   2018-06-20 11:17:00
Name: Last Login Time, dtype: datetime64[ns]

In [52]:
mask = df["Start Date"] >= "2010-01-01"
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,2018-06-20 10:43:00,45906,11.598,True,Finance
15,Lillian,Female,2016-06-05,2018-06-20 06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,2018-06-20 05:56:00,90370,7.369,False,Human Resources


### Filter with more than one condition (AND)

In [53]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance


In [57]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Marketing"
df[ mask1 & mask2].head(3)
# returns male employees from marketing team

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2018-06-20 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2018-06-20 07:45:00,37598,7.757,True,Marketing


### Filter with more than one condition (OR)

In [58]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance


In [65]:
mask1 = df["Team"] == "Marketing"
mask2 = df["Senior Management"]
df[mask1 | mask2].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance


In [67]:
mask1 = df["Team"] == "Marketing"
mask2 = df["Senior Management"]
mask3 = df["First Name"] == "Jerry"
df[mask1 | (mask2 & mask3)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance
21,Matthew,Male,1995-09-05,2018-06-20 02:12:00,100612,13.645,False,Marketing


### The .isin() method

In [68]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance


In [70]:
mask1 = df["Team"] == "Product"
mask2 = df["Team"] == "Engineering"
mask3 = df["Team"] == "Sales"
df[mask1 | mask2 |mask3].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
6,Ruby,Female,1987-08-17,2018-06-20 16:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,2018-06-20 06:29:00,95570,18.523,True,Engineering
13,Gary,Male,2008-01-27,2018-06-20 23:40:00,109831,5.831,False,Sales


In [73]:
mask = df["Team"].isin(["Product","Engineering","Sales"])
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
6,Ruby,Female,1987-08-17,2018-06-20 16:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,2018-06-20 06:29:00,95570,18.523,True,Engineering
13,Gary,Male,2008-01-27,2018-06-20 23:40:00,109831,5.831,False,Sales


### The .isnull() and .notnull() methods

In [87]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")

In [79]:
mask = df["Team"].isnull()
df[mask].head(3)
# returns data where Team as null

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2018-06-20 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2018-06-20 16:19:00,125792,5.042,True,


In [80]:
mask = df["Team"].notnull()
df[mask].head(3)
# returns data where Team as not null

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null category
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


In [85]:
mask1 = df["Team"].notnull()
mask2 = df["Gender"].notnull()
mask3 = df["First Name"].notnull()
df[mask1 & mask2 & mask3].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-06-20 13:00:00,138705,9.34,True,Finance


### The .between() method

In [88]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")

In [92]:
mask = df["Salary"].between(90000,100000)
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
8,Angela,Female,2005-11-22,2018-06-20 06:29:00,95570,18.523,True,Engineering
16,Jeremy,Male,2010-09-21,2018-06-20 05:56:00,90370,7.369,False,Human Resources


In [93]:
mask = df["Bonus %"].between(2,5)
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
20,Lois,,1995-04-22,2018-06-20 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2018-06-20 11:25:00,99283,2.665,True,Distribution


In [100]:
mask = df["Start Date"].between("2016-06-01","2018-06-20")
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2018-06-20 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2018-06-20 19:47:00,100705,16.961,True,Marketing
451,Terry,,2016-07-15,2018-06-20 00:29:00,140002,19.49,True,Marketing


In [111]:
mask = df["Last Login Time"].between("12:00:00","12:03:00")
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
61,Denise,Female,2001-11-06,2018-06-20 12:03:00,106862,3.699,False,Business Development
591,Rachel,Female,1988-04-22,2018-06-20 12:01:00,110924,7.808,False,Distribution
847,Nicole,,1981-05-02,2018-06-20 12:03:00,41449,4.707,False,Finance


### The .duplicated() method (applied only on Series)

In [3]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance


In [10]:
df.sort_values("First Name",inplace = True)
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-06-20 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-06-20 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-06-20 14:53:00,52119,11.343,True,Client Services


In [13]:
df["First Name"].duplicated().head(4) # by default keep = first
#treat first element from the list of duplicates as not duplicate

101    False
327     True
440     True
937     True
Name: First Name, dtype: bool

In [15]:
df["First Name"].duplicated(keep = "last").head(4)
#treat last element from the list of duplicates as not duplicate

101     True
327     True
440     True
937    False
Name: First Name, dtype: bool

In [19]:
mask = df["First Name"].duplicated(keep = False)
# treat all dupicate elements as duplicated
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-06-20 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-06-20 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-06-20 14:53:00,52119,11.343,True,Client Services


In [20]:
mask = ~df["First Name"].duplicated(keep = False)  # tilde(~) as negation operator
# only unique values as True
df[mask].head(3)
#data of employees occurred only one times

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2018-06-20 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2018-06-20 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2018-06-20 03:39:00,57783,9.129,False,Finance


### The .drop_duplicates() method (applied on  dataframe)

In [23]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")

In [24]:
df.sort_values("First Name",inplace = True)
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-06-20 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-06-20 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-06-20 14:53:00,52119,11.343,True,Client Services


In [25]:
len(df)

1000

In [28]:
len(df.drop_duplicates())
#no record is deleted, because all columns are not same for any two record

1000

In [32]:
df.drop_duplicates(subset = "First Name").head(3) # by default keep = first

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-06-20 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2018-06-20 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2018-06-20 03:54:00,111786,3.592,True,Engineering


In [35]:
df.drop_duplicates(subset = "First Name",keep = "last").head(3) # keep last occurances of repeated values

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2018-06-20 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2018-06-20 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2018-06-20 00:26:00,41453,10.084,False,Product


In [37]:
df.drop_duplicates(subset = "First Name",keep = False).head(3)
# returns values occured only once

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2018-06-20 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2018-06-20 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2018-06-20 03:39:00,57783,9.129,False,Finance


In [40]:
df.drop_duplicates(subset = ["First Name","Team"]).head(3)
# returns data by removing records in which both collumns have same value in at least 2 records

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-06-20 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2018-06-20 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2018-06-20 01:45:00,95327,15.12,False,Distribution


In [41]:
df.drop_duplicates(subset = ["First Name","Team","Gender"]).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-06-20 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2018-06-20 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2018-06-20 19:39:00,63126,18.424,False,Client Services


### The .unique() and .nunique() methods

In [43]:
df = pd.read_csv("input/employees.csv",parse_dates=["Start Date","Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-06-20 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-06-20 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-06-20 11:17:00,130590,11.858,False,Finance


In [45]:
df["Gender"].unique()

[Male, Female, NaN]
Categories (2, object): [Male, Female]

In [46]:
df["Team"].unique()

[Marketing, NaN, Finance, Client Services, Legal, ..., Engineering, Business Development, Human Resources, Sales, Distribution]
Length: 11
Categories (10, object): [Marketing, Finance, Client Services, Legal, ..., Business Development, Human Resources, Sales, Distribution]

In [49]:
len(df["Gender"].unique())

3

In [47]:
df["Gender"].nunique() #by default dropna = True , means ignores null values to count

2

In [51]:
df["Gender"].nunique(dropna = False)

3