In [1]:
import pandas as pd

In [13]:
chicago = pd.read_csv('pandas/chicago.csv')
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [6]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [11]:
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [9]:
chicago['Department'] = chicago['Department'].astype('category')

In [10]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


## Common string methods - .lower(), .upper(), .title(), and .len()

In [15]:
chicago = pd.read_csv('pandas/chicago.csv')
chicago['Department'] = chicago['Department'].astype('category')
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [19]:
"hello world".upper()

'HELLO WORLD'

In [20]:
"HELLO world".lower()

'hello world'

In [17]:
"hello world".title()

'Hello World'

In [18]:
len('hello world') # Space counted!

11

In [16]:
chicago['Position Title'].lower() # String method does not work for string methods!

AttributeError: 'Series' object has no attribute 'lower'

In [26]:
chicago['Name'] = chicago['Name'].str.title() 
chicago['Position Title'] = chicago['Position Title'].str.title() 

In [27]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",Police Officer,POLICE,$84450.00
2,"Aaron, Karina",Police Officer,POLICE,$84450.00


In [28]:
chicago['Department'].str.len() # This will calculate number of characters in every single values!
# if we do len(chicago['Department']) we will just get number of rows!

0        11.0
1         6.0
2         6.0
3        16.0
4        11.0
         ... 
32058     6.0
32059     6.0
32060     6.0
32061     4.0
32062     NaN
Name: Department, Length: 32063, dtype: float64

In [29]:
len(chicago['Department'])

32063

## The .str.replace() Method

In [32]:
chicago = pd.read_csv('pandas/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [31]:
"Hello World".replace('l', '!')

'He!!o Wor!d'

In [36]:
chicago['Department'] = chicago['Department'].str.replace('MGMNT', 'MAHNGEMENT')

In [37]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MAHNGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MAHNGEMENT,$106836.00


In [40]:
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace('$', '').astype(float)

  chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace('$', '').astype(float)


In [41]:
chicago['Employee Annual Salary'].sum()

2571506375.36

In [42]:
chicago['Employee Annual Salary'].mean()

80204.178633899

In [43]:
chicago['Employee Annual Salary'].nlargest()

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
Name: Employee Annual Salary, dtype: float64

## Filtering with String methods
- in order to filter, we will need to generate boolean series!
- Will need to standarized at the very beginning!

In [44]:
chicago = pd.read_csv('pandas/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [47]:
mask = chicago['Position Title'].str.lower().str.contains('water') # return boolean values whether it contains 
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [50]:
chicago[chicago['Position Title'].str.lower().str.startswith('water')]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [52]:
mask = chicago['Position Title'].str.lower().str.endswith('ist')
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


## More string methods - .strip(), lstrip(), rstrip()

In [53]:
chicago = pd.read_csv('pandas/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [54]:
"        Hello World   ".lstrip()

'Hello World   '

In [55]:
"        Hello World   ".rstrip()

'        Hello World'

In [56]:
"        Hello World   ".strip()

'Hello World'

In [60]:
chicago['Name'] = chicago['Name'].str.strip()

In [62]:
chicago['Position Title']= chicago['Position Title'].str.strip()

## String methods on Index and columns

In [63]:
chicago = pd.read_csv('pandas/chicago.csv', index_col = 'Name').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [66]:
chicago.index = chicago.index.str.strip().str.title()

In [67]:
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [69]:
chicago.columns = chicago.columns.str.upper()

In [70]:
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


## Split string by character with .str.split() method

In [72]:
chicago = pd.read_csv('pandas/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [73]:
"Hello my name is Boris".split(' ')

['Hello', 'my', 'name', 'is', 'Boris']

In [75]:
chicago['Name'].str.split(',')

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [78]:
chicago['Name'].str.split(',').str.get(0).str.title().value_counts()

Williams             293
Johnson              244
Smith                241
Brown                185
Jones                183
                    ... 
Giuliani               1
Zakee                  1
Hozian                 1
Cleary Synowiecki      1
Carvotta               1
Name: Name, Length: 13829, dtype: int64

In [79]:
chicago['Position Title'].str.split(' ')

0                        [WATER, RATE, TAKER]
1                           [POLICE, OFFICER]
2                           [POLICE, OFFICER]
3                [CHIEF, CONTRACT, EXPEDITER]
4                       [CIVIL, ENGINEER, IV]
                         ...                 
32057    [FRM, OF, MACHINISTS, -, AUTOMOTIVE]
32058                       [POLICE, OFFICER]
32059                       [POLICE, OFFICER]
32060                       [POLICE, OFFICER]
32061            [CHIEF, DATA, BASE, ANALYST]
Name: Position Title, Length: 32062, dtype: object

In [81]:
chicago['Name'].str.split(',').str.get(0).value_counts().head(3)

WILLIAMS    293
JOHNSON     244
SMITH       241
Name: Name, dtype: int64

In [83]:
chicago['Name'].str.split(',').str.get(1).str.split(' ') # Name do have space before comma

0            [, , ELVIA, J]
1          [, , JEFFERY, M]
2              [, , KARINA]
3        [, , KIMBERLEI, R]
4          [, , VICENTE, M]
                ...        
32057      [, , MICHAEL, J]
32058        [, , PETER, J]
32059         [, , MARK, E]
32060        [, , CARLO, E]
32061         [, , DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [87]:
chicago['Name'].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0).value_counts().head() # GEt the first name

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

## The expand and n parameter of the str.split() method

In [89]:
chicago = pd.read_csv('pandas/chicago.csv').dropna(how = 'all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [92]:
chicago['Name'].str.split(',', expand = True)  # Return as dataframe instead of list

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
...,...,...
32057,ZYGADLO,MICHAEL J
32058,ZYGOWICZ,PETER J
32059,ZYMANTAS,MARK E
32060,ZYRKOWSKI,CARLO E


In [93]:
chicago[['First Name','Last Name']]= chicago['Name'].str.split(',', expand = True) 

In [94]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [96]:
chicago['Position Title'].str.split(' ', expand = True) # Different number of space!

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [98]:
chicago[['First Title Word', 'Remaining Words']] = chicago['Position Title'].str.split(' ', expand = True, n = 1)  
# n is the maximum number of split we want to do 

In [99]:
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
