# Working with Text Data

In [2]:
import pandas as pd

## This Module's Dataset
- This module's dataset (`chicago.csv`) is a collection of public sector employees in the city of Chicago.
- Each row inclues the employee's name, position, department, and salary.

In [3]:
chicago = pd.read_csv("chicago.csv").dropna(how="all")
chicago["Department"] = chicago["Department"].astype("category")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
# Inspecting the dataset
chicago.shape

chicago.nunique()

chicago.info()
# chicago.value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


In [5]:
chicago["Position Title"].value_counts()
chicago["Position Title"].nunique()


1093

## Common String Methods
- A **Series** has a special `str` attribute that exposes an object with string methods.
- Access the `str` attribute, then invoke the string method on the nested object.
- Most method names will match their Python method equivalents (`upper`, `lower`, `title`, etc).

In [6]:
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [7]:
astr = "the quick brown fox"
astr.title()
astr.capitalize()

'The quick brown fox'

#### We use the `.str` accessor to apply string formatting methods to a series

In [8]:
format_change = chicago["Name"].str.title()
chicago.insert(loc=1,column="Formatted Name",value=format_change)
chicago.head()

Unnamed: 0,Name,Formatted Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J","Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M","Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA","Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R","Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M","Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


#### Multiple `str` methods can be accessed by method chaining as shown below :

In [9]:
# Replacing string parts
chicago["Department"]
# changing 'MGMNT' to "Management"
chicago["Department"].str.replace("MGMNT","MANAGEMENT").str.title()
chicago["Department"].str.cap

AttributeError: 'StringMethods' object has no attribute 'cap'

In [None]:
# Another way of replacing given names:
chicago["Department"].value_counts()

Department
POLICE                   12618
FIRE                      4796
STREETS & SAN             2152
OEMC                      1982
WATER MGMNT               1840
AVIATION                  1521
TRANSPORTN                1073
PUBLIC LIBRARY             961
GENERAL SERVICES           930
FAMILY & SUPPORT           654
FINANCE                    577
HEALTH                     529
LAW                        414
CITY COUNCIL               396
BUILDINGS                  262
COMMUNITY DEVELOPMENT      208
BUSINESS AFFAIRS           161
BOARD OF ELECTION          117
DoIT                       102
CITY CLERK                  94
MAYOR'S OFFICE              86
PROCUREMENT                 83
IPRA                        82
CULTURAL AFFAIRS            74
HUMAN RESOURCES             70
ANIMAL CONTRL               67
INSPECTOR GEN               57
ADMIN HEARNG                39
BUDGET & MGMT               39
DISABILITIES                28
TREASURER                   22
HUMAN RELATIONS             

In [None]:
# Extracting the Dept Names into a new list
proper_depts = chicago["Department"].to_list()

# declaring a new blank list -- this one will be filled with the renamed names & later inserted into the DataFrame
new_modified_names = []

# code will look for these key maps in each value of the extracted list
check_keywords = {"MGMNT":"MANAGEMENT", "MGMT":"MANAGEMENT" , "SAN":"Sanitary" , "DoIT": "Dept Of IT" ,"GEN":"GENERAL","CONTRL":"Control","COMM":"Community"}  


In [None]:
new_modified_names = []
check_keywords = {
    "MGMNT":"MANAGEMENT", "MGMT":"MANAGEMENT" , "SAN":"Sanitary" , "DoIT": "Dept Of IT" ,"GEN":"GENERAL","CONTRL":"Control","COMM":"Community"
}    

In [None]:
for record in proper_depts:
    new_rec = record
    for existing,newstring in check_keywords.items():
        new_rec = new_rec.replace(existing,newstring)
    new_modified_names.append(new_rec)
print(new_modified_names)

['WATER MANAGEMENT', 'POLICE', 'POLICE', 'GENERALERAL SERVICES', 'WATER MANAGEMENT', 'CITY COUNCIL', 'STREETS & Sanitary', 'OEMC', 'CITY COUNCIL', 'AVIATION', 'FIRE', 'POLICE', 'FAMILY & SUPPORT', 'POLICE', 'IPRA', 'POLICE', 'POLICE', 'FIRE', 'POLICE', 'FIRE', 'WATER MANAGEMENT', 'FIRE', 'FIRE', 'OEMC', 'GENERALERAL SERVICES', 'WATER MANAGEMENT', 'STREETS & Sanitary', 'FAMILY & SUPPORT', 'POLICE', 'FIRE', 'STREETS & Sanitary', 'PUBLIC LIBRARY', 'POLICE', 'FAMILY & SUPPORT', 'WATER MANAGEMENT', 'BUSINESS AFFAIRS', 'POLICE', 'OEMC', 'STREETS & Sanitary', 'POLICE', 'FIRE', 'STREETS & Sanitary', 'FIRE', 'WATER MANAGEMENT', 'STREETS & Sanitary', 'TRANSPORTN', 'HEALTH', 'FAMILY & SUPPORT', 'FIRE', 'POLICE', 'AVIATION', 'HEALTH', 'POLICE', 'WATER MANAGEMENT', 'POLICE', 'WATER MANAGEMENT', 'POLICE', 'AVIATION', 'POLICE', 'POLICE', 'STREETS & Sanitary', 'STREETS & Sanitary', "MAYOR'S OFFICE", 'POLICE', 'POLICE', 'POLICE', 'POLICE', 'POLICE', 'FIRE', 'CITY COUNCIL', 'POLICE', 'FIRE', 'POLICE', '

In [None]:
chicago.head()
chicago.insert(loc=4,column="Formatted Dept Name",value=new_modified_names)
chicago.head()

Unnamed: 0,Name,Formatted Name,Position Title,Department,Formatted Dept Name,Employee Annual Salary
0,"AARON, ELVIA J","Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M","Aaron, Jeffery M",POLICE OFFICER,POLICE,POLICE,$84450.00
2,"AARON, KARINA","Aaron, Karina",POLICE OFFICER,POLICE,POLICE,$84450.00
3,"AARON, KIMBERLEI R","Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,GENERALERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M","Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,WATER MANAGEMENT,$106836.00


In [None]:
chicago["Formatted Dept Name"] = chicago["Formatted Dept Name"].str.title()
chicago.head()

Unnamed: 0,Name,Formatted Name,Position Title,Department,Formatted Dept Name,Employee Annual Salary
0,"AARON, ELVIA J","Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,Water Management,$90744.00
1,"AARON, JEFFERY M","Aaron, Jeffery M",POLICE OFFICER,POLICE,Police,$84450.00
2,"AARON, KARINA","Aaron, Karina",POLICE OFFICER,POLICE,Police,$84450.00
3,"AARON, KIMBERLEI R","Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,Generaleral Services,$89880.00
4,"ABAD JR, VICENTE M","Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,Water Management,$106836.00


In [None]:
chicago.tail()

Unnamed: 0,Name,Formatted Name,Position Title,Department,Formatted Dept Name,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J","Zygadlo, Michael J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,Generaleral Services,$99528.00
32058,"ZYGOWICZ, PETER J","Zygowicz, Peter J",POLICE OFFICER,POLICE,Police,$87384.00
32059,"ZYMANTAS, MARK E","Zymantas, Mark E",POLICE OFFICER,POLICE,Police,$84450.00
32060,"ZYRKOWSKI, CARLO E","Zyrkowski, Carlo E",POLICE OFFICER,POLICE,Police,$87384.00
32061,"ZYSKOWSKI, DARIUSZ","Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,Dept Of It,$113664.00


## Filtering with String Methods
- The `str.contains` method checks whether a substring exists anywhere in the string.
- The `str.startswith` method checks whether a substring exists at the start of the string.
- The `str.endswith` method checks whether a substring exists at the end of the string.

##### Ensure to normalize the string characters before passing the filter conditions

In [None]:
# Searching for employee's whose title contains 'water' in their position title
# condition
has_water = chicago["Position Title"].str.lower().str.contains("water")
chicago[has_water]

Unnamed: 0,Name,Formatted Name,Position Title,Department,Formatted Dept Name,Employee Annual Salary
0,"AARON, ELVIA J","Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,Water Management,$90744.00
554,"ALUISE, VINCENT G","Aluise, Vincent G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,Water Management,$102440.00
671,"ANDER, PERRY A","Ander, Perry A",WATER CHEMIST II,WATER MGMNT,Water Management,$82044.00
685,"ANDERSON, ANDREW J","Anderson, Andrew J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,Water Management,$109272.00
702,"ANDERSON, DONALD","Anderson, Donald",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,Water Management,$102440.00
...,...,...,...,...,...,...
29669,"VERMA, ANUPAM","Verma, Anupam",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,Water Management,$111192.00
30239,"WASHINGTON, JOSEPH","Washington, Joseph",WATER CHEMIST III,WATER MGMNT,Water Management,$89676.00
30544,"WEST, THOMAS R","West, Thomas R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,Water Management,$115704.00
30991,"WILLIAMS, MATTHEW","Williams, Matthew",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,Water Management,$102440.00


In [None]:
# Filter employes whose title starts with 'Civil'
contains_civil = chicago["Position Title"].str.lower().str.startswith("civil")
chicago[contains_civil]

Unnamed: 0,Name,Formatted Name,Position Title,Department,Formatted Dept Name,Employee Annual Salary
4,"ABAD JR, VICENTE M","Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,Water Management,$106836.00
25,"ABDULSATTAR, MUDHAR","Abdulsattar, Mudhar",CIVIL ENGINEER II,WATER MGMNT,Water Management,$58536.00
34,"ABRAHAM, GIRLEY T","Abraham, Girley T",CIVIL ENGINEER IV,WATER MGMNT,Water Management,$106836.00
55,"ABUTALEB, AHMAD H","Abutaleb, Ahmad H",CIVIL ENGINEER II,WATER MGMNT,Water Management,$89676.00
147,"ADAMS, TANERA C","Adams, Tanera C",CIVIL ENGINEER IV,TRANSPORTN,Transportn,$106836.00
...,...,...,...,...,...,...
31623,"YANG, LUYANG","Yang, Luyang",CIVIL ENGINEER V,TRANSPORTN,Transportn,$116784.00
31656,"YEPEZ, JESUS","Yepez, Jesus",CIVIL ENGINEER IV,TRANSPORTN,Transportn,$106836.00
31662,"YESUFU, STEPHANIE A","Yesufu, Stephanie A",CIVIL ENGINEER III,TRANSPORTN,Transportn,$92784.00
31797,"ZAKE, JOSHUA S","Zake, Joshua S",CIVIL ENGINEER IV,TRANSPORTN,Transportn,$106836.00


In [None]:
# Filter employes whose title ends with 'IV'
contains_iv = chicago["Position Title"].str.lower().str.endswith("iv")
chicago[contains_iv]

Unnamed: 0,Name,Formatted Name,Position Title,Department,Formatted Dept Name,Employee Annual Salary
4,"ABAD JR, VICENTE M","Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,Water Management,$106836.00
34,"ABRAHAM, GIRLEY T","Abraham, Girley T",CIVIL ENGINEER IV,WATER MGMNT,Water Management,$106836.00
145,"ADAMS, SHERYLL A","Adams, Sheryll A",LIBRARIAN IV,PUBLIC LIBRARY,Public Library,$97812.00
147,"ADAMS, TANERA C","Adams, Tanera C",CIVIL ENGINEER IV,TRANSPORTN,Transportn,$106836.00
166,"ADENI, MOHAMED K","Adeni, Mohamed K",ACCOUNTANT IV,FINANCE,Finance,$97812.00
...,...,...,...,...,...,...
31777,"ZAFIRIS, CHRISTOPHER","Zafiris, Christopher",ARCHITECT IV,DISABILITIES,Disabilities,$106836.00
31797,"ZAKE, JOSHUA S","Zake, Joshua S",CIVIL ENGINEER IV,TRANSPORTN,Transportn,$106836.00
31870,"ZAVALA, FERNANDO","Zavala, Fernando",ACCOUNTANT IV,FINANCE,Finance,$97812.00
31884,"ZAWADSKI, JAMES","Zawadski, James",CLERK IV,LAW,Law,$68028.00


## String Methods on Index and Columns
- Use the `index` and `columns` attributes to access the **DataFrame** index/column labels.
- These objects support string methods via their own `str` attribute.

In [None]:
# Resetting the dataset & defining Names as the index
chicago = pd.read_csv("chicago.csv",index_col=["Name"]).dropna(how="all").sort_index()
chicago["Department"] = chicago["Department"].astype("category")
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [None]:
chicago.index.str.title()

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [None]:
chicago.columns.str.upper()

Index(['POSITION TITLE', 'DEPARTMENT', 'EMPLOYEE ANNUAL SALARY'], dtype='object')

## The split Method
- The `str.split` method splits a string by the occurrence of a delimiter. Pandas returns a **Series** of lists.
- Use the `str.get` method to access a nested list element by its index position.

In [None]:
"The quick brown fox".split(sep=" ")

['The', 'quick', 'brown', 'fox']

In [16]:
# Most common fist word in chicago position titles
chicago["Position Title"].str.split(" ").str.get(0).value_counts()

Position Title
POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
                   ...  
PORTFOLIO              1
SUPERVISOR             1
SUPT                   1
STRUCTURAL             1
CORPORATION            1
Name: count, Length: 320, dtype: int64

In [33]:
# Most common first name
newone = chicago["Formatted Name"].str.split(",").str.get(-1)
# intermediate series to further seggregate instances that contain [FirstName + MiddleName]
newone.str.lstrip().str.split(" ").str.get(0).value_counts()

Formatted Name
Michael     1153
John         899
James        676
Robert       622
Joseph       537
            ... 
Russ           1
Fabiola        1
Jurdon         1
Nateesha       1
Lilya          1
Name: count, Length: 5091, dtype: int64

## More Practice with Splits

In [37]:
# Another approach for extracting first names
chicago["Formatted Name"].str.split(", ").str.get(1).str.strip().str.split(" ").str.get(0)
chicago["Formatted Name"].str.split(", ").str.get(1).str.strip().str.split(" ").str.get(0).value_counts()

Formatted Name
Michael     1153
John         899
James        676
Robert       622
Joseph       537
            ... 
Russ           1
Fabiola        1
Jurdon         1
Nateesha       1
Lilya          1
Name: count, Length: 5091, dtype: int64

## The expand and n Parameters of the split Method
- The `expand` parameter returns a **DataFrame** instead of a **Series** of lists.
- The `n` parameter limits the number of splits.

In [41]:
chicago[["LastName","FirstName"]]=chicago["Formatted Name"].str.split(", ",expand=True)
chicago.head()


Unnamed: 0,Name,Formatted Name,Position Title,Department,Employee Annual Salary,LastName,FirstName
0,"AARON, ELVIA J","Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00,Aaron,Elvia J
1,"AARON, JEFFERY M","Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00,Aaron,Jeffery M
2,"AARON, KARINA","Aaron, Karina",POLICE OFFICER,POLICE,$84450.00,Aaron,Karina
3,"AARON, KIMBERLEI R","Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,Aaron,Kimberlei R
4,"ABAD JR, VICENTE M","Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,Abad Jr,Vicente M


In [46]:
chicago["Formatted Name"].str.split(" ",expand=True,n=2).rename(columns={0:"last name",1:"middle name",2:"first name"})

Unnamed: 0,last name,middle name,first name
0,"Aaron,",,Elvia J
1,"Aaron,",,Jeffery M
2,"Aaron,",,Karina
3,"Aaron,",,Kimberlei R
4,Abad,"Jr,",Vicente M
...,...,...,...
32057,"Zygadlo,",,Michael J
32058,"Zygowicz,",,Peter J
32059,"Zymantas,",,Mark E
32060,"Zyrkowski,",,Carlo E


##### *End of DataFrame String methods*