# Working with Text Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/pandas/chicago.csv')
df['Department'] = df['Department'].astype('category')
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [5]:
df['Department'].nunique()

35

In [6]:
df['Department'].count()

32062

In [7]:
df['Department'] = df['Department'].astype('category')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


## Common String Methods - lower, upper, title, and len
In vanilla python, `.lower()`,`.upper()`,`.title()`, and `len()` can all be called directly on the sring you wish to manipulate.

In [9]:
'Hello World'.lower()

'hello world'

In [10]:
'Hello World'.upper()

'HELLO WORLD'

In [12]:
'hello world'.title()

'Hello World'

In [14]:
len('Hello World')

11

When working with series/dataframes, you need to call the `.str` method before envoling a string method.

In [15]:
df['Name'].str.lower().head()

0        aaron,  elvia j
1      aaron,  jeffery m
2         aaron,  karina
3    aaron,  kimberlei r
4    abad jr,  vicente m
Name: Name, dtype: object

In [17]:
df['Name'].str.lower().str.upper().head()

0        AARON,  ELVIA J
1      AARON,  JEFFERY M
2         AARON,  KARINA
3    AARON,  KIMBERLEI R
4    ABAD JR,  VICENTE M
Name: Name, dtype: object

In [18]:
df['Name'].str.lower().str.upper().str.title().head()

0        Aaron,  Elvia J
1      Aaron,  Jeffery M
2         Aaron,  Karina
3    Aaron,  Kimberlei R
4    Abad Jr,  Vicente M
Name: Name, dtype: object

In [19]:
df['Position Title'].str.title().head()

0            Water Rate Taker
1              Police Officer
2              Police Officer
3    Chief Contract Expediter
4           Civil Engineer Iv
Name: Position Title, dtype: object

In [20]:
df['Position Title'] = df['Position Title'].str.title()
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00
3,"AARON, KIMBERLEI R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",Civil Engineer Iv,WATER MGMNT,$106836.00


In [21]:
df['Department'] = df['Department'].str.title()
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,Water Mgmnt,$90744.00
1,"AARON, JEFFERY M",Police Officer,Police,$84450.00
2,"AARON, KARINA",Police Officer,Police,$84450.00
3,"AARON, KIMBERLEI R",Chief Contract Expediter,General Services,$89880.00
4,"ABAD JR, VICENTE M",Civil Engineer Iv,Water Mgmnt,$106836.00


In [22]:
df['Department'].str.len().head()

0    11.0
1     6.0
2     6.0
3    16.0
4    11.0
Name: Department, dtype: float64

## The .str.replace() Method
The `.replace()` method method takes looks for a substring in a python string and repalces it with a new designated string.

In [23]:
'Hello World'.replace('l','!')

'He!!o Wor!d'

In [24]:
df = pd.read_csv('data/pandas/chicago.csv').dropna(how='all')
df['Department'] = df['Department'].astype('category')
df.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [25]:
df['Department'] = df['Department'].str.replace('MGMNT','MANAGEMENT')
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [26]:
# convert string dollars to float
df['Employee Annual Salary'] = df['Employee Annual Salary'].str.replace('$','').astype(float)
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [27]:
df['Employee Annual Salary'].nlargest()

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
Name: Employee Annual Salary, dtype: float64

## Filtering with String Methods

In [None]:
df = pd.read_csv('data/pandas/chicago.csv').dropna(how='all')
df['Department'] = df['Department'].astype('category')
df.tail()

In [29]:
# filter for all rows containing the word "water"
mask = df['Position Title'].str.lower().str.contains('water')
df[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MANAGEMENT,82044.0
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MANAGEMENT,109272.0
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0


In [30]:
# startswith
mask = df['Position Title'].str.lower().str.startswith('water')
df[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MANAGEMENT,82044.0
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MANAGEMENT,82044.0
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MANAGEMENT,82044.0
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MANAGEMENT,53172.0


In [32]:
# endswith
mask = df['Position Title'].str.lower().str.endswith('ist')
df[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,99840.0
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,81948.0
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,89880.0
472,"ALLEN, ROBERT",MACHINIST,WATER MANAGEMENT,94328.0
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,91476.0


## More String Methods - strip, lstrip, and rstrip
Used to remove whitespace in a string.

In [35]:
'      Hello world     '.lstrip()

'Hello world     '

In [36]:
'      Hello world     '.rstrip()

'      Hello world'

In [37]:
'      Hello world     '.strip()

'Hello world'

In [38]:
df['Name'] = df['Name'].str.strip()

0        AARON,  ELVIA J
1      AARON,  JEFFERY M
2         AARON,  KARINA
3    AARON,  KIMBERLEI R
4    ABAD JR,  VICENTE M
Name: Name, dtype: object

In [40]:
df['Position Title'] = df['Position Title'].str.strip()
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


## String Methods on Indexes and Columns

In [41]:
df = pd.read_csv('data/pandas/chicago.csv',index_col='Name').dropna(how='all')
df['Department'] = df['Department'].astype('category')
df.tail()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [42]:
df.index

Index(['AARON,  ELVIA J', 'AARON,  JEFFERY M', 'AARON,  KARINA',
       'AARON,  KIMBERLEI R', 'ABAD JR,  VICENTE M', 'ABARCA,  ANABEL',
       'ABARCA,  EMMANUEL', 'ABASCAL,  REECE E', 'ABBASI,  CHRISTOPHER',
       'ABBATACOLA,  ROBERT J',
       ...
       'ZWIT,  JEFFREY J', 'ZWOLFER,  MATTHEW W', 'ZYCH,  MATEUSZ',
       'ZYDEK,  BRYAN', 'ZYGADLO,  JOHN P', 'ZYGADLO,  MICHAEL J',
       'ZYGOWICZ,  PETER J', 'ZYMANTAS,  MARK E', 'ZYRKOWSKI,  CARLO E',
       'ZYSKOWSKI,  DARIUSZ'],
      dtype='object', name='Name', length=32062)

In [44]:
df.index = df.index.str.strip().str.title()
df.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [49]:
df.columns = df.columns.str.lower().str.replace(' ','_')
df.head()

Unnamed: 0_level_0,position_title,department,employee_annual_salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


## Split Strings by Characters with the .str.split() Method

In [50]:
df = pd.read_csv('data/pandas/chicago.csv').dropna(how='all')
df['Department'] = df['Department'].astype('category')
df.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [51]:
'Hello my name is Carl'.split()

['Hello', 'my', 'name', 'is', 'Carl']

In [55]:
# find the most common last name in our data
df['Name'].str.split(',').str.get(0).str.title().value_counts().head()

Williams    293
Johnson     244
Smith       241
Brown       185
Jones       183
Name: Name, dtype: int64

In [57]:
# find the most common first word of position title
df['Position Title'].str.split().str.get(0).value_counts().head()

POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
Name: Position Title, dtype: int64

In [58]:
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [63]:
# most common first name
df['Name'].str.split(',').str.get(1).str.strip().str.split().str.get(0).value_counts().head()

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

## The expand and n Parameters of the .str.split() Method

In [64]:
df['Name'].str.split(',',expand=True).head()

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M


In [65]:
df[['First Name','Last Name']] = df['Name'].str.split(',',expand=True)
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [66]:
df['Position Title'].str.split(' ',expand=True).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,


In [68]:
df[['First Title Word','Remaining Words']] = df['Position Title'].str.split(' ',expand=True,n=1)
df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
