## Working and Manipulating Text Data

In [1]:
import pandas as pd

In [2]:
chi = pd.read_csv("chicago.csv")

In [3]:
chi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [5]:
chi.dropna(how="all", inplace= True)

In [10]:
chi.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [8]:
chi["Department"] = chi["Department"].astype("category")

In [11]:
chi.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [14]:
chi["Department"].value_counts().head()

POLICE           12618
FIRE              4796
STREETS & SAN     2152
OEMC              1982
WATER MGMNT       1840
Name: Department, dtype: int64

### Common Pandas String Methods - .str.lower(), .str.upper(), .str.title(), .str.replace() and .str.len()

In [16]:
chi["Position Title"] = chi["Position Title"].str.title()

In [18]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [22]:
chi["Department"].str.replace("MGMNT", "Manager").head()
#Note, if you want any change(s) made to a field to stick, re-assign that change to the original field

0       WATER Manager
1              POLICE
2              POLICE
3    GENERAL SERVICES
4       WATER Manager
Name: Department, dtype: object

### Filtering with String Methods using .str.contains(), .str.startswith() and .str.endswith()

In [24]:
mask = chi["Position Title"].str.contains("water", case = False)
chi[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",Foreman Of Water Pipe Construction,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",Water Chemist Ii,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",District Superintendent Of Water Distribution,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",Foreman Of Water Pipe Construction,WATER MGMNT,$102440.00


In [26]:
chi["Employee Annual Salary"] = chi["Employee Annual Salary"].str.replace("$","")

In [28]:
chi["Employee Annual Salary"] = chi["Employee Annual Salary"].astype("float")

In [29]:
chi.describe()

Unnamed: 0,Employee Annual Salary
count,32062.0
mean,80204.178634
std,25098.329868
min,0.96
25%,72862.4
50%,84450.0
75%,93240.0
max,300000.0


### More String Methods - .str.lstrip(), .str.rstrip(), .str.strip(), .str.split()

In [30]:
chi.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,90744.0
1,"AARON, JEFFERY M",Police Officer,POLICE,84450.0
2,"AARON, KARINA",Police Officer,POLICE,84450.0
3,"AARON, KIMBERLEI R",Chief Contract Expediter,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",Civil Engineer Iv,WATER MGMNT,106836.0


I want to extract all the first names in the Name field

In [46]:
chi["Name"].str.split(",").str.get(0).str.title().value_counts()

Williams          293
Johnson           244
Smith             241
Brown             185
Jones             183
Rodriguez         171
Jackson           136
Garcia            130
Davis             127
Hernandez         110
Martinez          108
Lopez             106
Gonzalez          104
Perez             100
Wilson             94
Rivera             90
Thomas             89
Anderson           82
Torres             81
Murphy             80
Robinson           79
Moore              78
Harris             76
Sanchez            76
Miller             75
Lewis              74
Taylor             73
Martin             72
Clark              66
White              66
                 ... 
Aztlan              1
Goston              1
Aufmann             1
Fic                 1
Stehlik             1
Laureano            1
Cibrian             1
Orosco              1
Parrales            1
Flannery            1
Collay              1
Wrigley             1
Fudalewicz          1
Buisseret           1
Palarczyk 

The .str.get() gets records by their index values after they have been split into a list with a .split() method

The __expand__ and __n__ parameters of the .str.split() method<br>
- The .str.split("", expand= False, n = 0) has these params
- expand turns the splitted values to a Dataframe instead of a list (False is the default)
- the n signifies how many times you want to split the values<br>

For example, I want to split the Name field and produce two more fields with the first and last names

In [7]:
chi[["First Name", "Last Name"]] = chi["Name"].str.split(",", expand = True)
#see how we assigned the values to a list of two new fields

In [8]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
