## 6.1 Letter Casing and Whitespace

In [1]:
import pandas as pd

In [3]:
inspections = pd.read_csv("chicago_food_inspections.csv")
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


In [4]:
inspections["Name"].head()

Unnamed: 0,Name
0,MARRIOT MARQUIS CHICAGO
1,JETS PIZZA
2,ROOM 1520
3,MARRIOT MARQUIS CHICAGO
4,CHARTWELLS


In [None]:
inspections["Name"].head().values

array([' MARRIOT MARQUIS CHICAGO   ', ' JETS PIZZA ', '   ROOM 1520 ',
       '  MARRIOT MARQUIS CHICAGO  ', ' CHARTWELLS   '], dtype=object)

In [None]:
inspections["Name"].str

<pandas.core.strings.accessor.StringMethods at 0x7ff009724490>

In [5]:
# Sample string padded with two spaces on each side.
dessert = "  cheesecake  "
# Strip only the leading whitespace; the trailing spaces are kept.
str.lstrip(dessert)

'cheesecake  '

In [None]:
dessert.rstrip()

'  cheesecake'

In [None]:
dessert.strip()

'cheesecake'

In [6]:
inspections["Name"].str.lstrip().head()

Unnamed: 0,Name
0,MARRIOT MARQUIS CHICAGO
1,JETS PIZZA
2,ROOM 1520
3,MARRIOT MARQUIS CHICAGO
4,CHARTWELLS


In [None]:
inspections["Name"].str.rstrip().head()

0      MARRIOT MARQUIS CHICAGO
1                   JETS PIZZA
2                    ROOM 1520
3      MARRIOT MARQUIS CHICAGO
4                   CHARTWELLS
Name: Name, dtype: object

In [None]:
inspections["Name"].str.strip().head()

0    MARRIOT MARQUIS CHICAGO
1                 JETS PIZZA
2                  ROOM 1520
3    MARRIOT MARQUIS CHICAGO
4                 CHARTWELLS
Name: Name, dtype: object

In [None]:
inspections["Name"] = inspections["Name"].str.strip()

In [None]:
inspections.columns

Index(['Name', 'Risk'], dtype='object')

In [17]:
pd.api.types.is_string_dtype(inspections["Name"])

True

In [18]:
# List the labels of every column whose dtype pandas classifies as string-like.
list(
    filter(
        lambda label: pd.api.types.is_string_dtype(inspections[label].dtype),
        inspections.columns,
    )
)

['Name', 'Risk']

In [22]:
# Collect the labels of the columns whose dtype pandas classifies as string-like.
str_cols = [
    label
    for label, dtype in inspections.dtypes.items()
    if pd.api.types.is_string_dtype(dtype)
]
# The .str accessor is only available on a Series, so strip one column at a time.
for label in str_cols:
    inspections[label] = inspections[label].str.strip()

In [None]:
inspections["Name"].str.lower().head()

0    marriot marquis chicago
1                 jets pizza
2                  room 1520
3    marriot marquis chicago
4                 chartwells
Name: Name, dtype: object

In [None]:
# Small Series of lowercase strings used to demonstrate the casing methods.
steaks = pd.Series(data = ["porterhouse", "filet mignon", "ribeye"])
steaks

0     porterhouse
1    filet mignon
2          ribeye
dtype: object

In [None]:
steaks.str.upper()

0     PORTERHOUSE
1    FILET MIGNON
2          RIBEYE
dtype: object

In [23]:
inspections["Name"].head()

Unnamed: 0,Name
0,MARRIOT MARQUIS CHICAGO
1,JETS PIZZA
2,ROOM 1520
3,MARRIOT MARQUIS CHICAGO
4,CHARTWELLS


In [24]:
inspections["Name"].str.capitalize().head()

Unnamed: 0,Name
0,Marriot marquis chicago
1,Jets pizza
2,Room 1520
3,Marriot marquis chicago
4,Chartwells


In [25]:
inspections["Name"].str.title().head()

Unnamed: 0,Name
0,Marriot Marquis Chicago
1,Jets Pizza
2,Room 1520
3,Marriot Marquis Chicago
4,Chartwells


## 6.2 String Slicing

In [None]:
inspections["Risk"].head()

0      Risk 1 (High)
1    Risk 2 (Medium)
2       Risk 3 (Low)
3      Risk 1 (High)
4      Risk 1 (High)
Name: Risk, dtype: object

In [None]:
len(inspections)

153810

In [None]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [26]:
# Discard the rows whose Risk value is missing; `subset` limits the null check
# to that one column, so missing values elsewhere do not drop a row.
inspections = inspections.dropna(subset = ["Risk"])

In [27]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

In [32]:
# DataFrame.replace is a frame-level method and, given a plain value, scans
# every column -- an establishment whose Name is exactly "All" would be
# rewritten as well. The nested-dict form {column: {old: new}} restricts the
# substitution to the Risk column.
inspections = inspections.replace(
    to_replace = {"Risk": {"All": "Risk 4 (Extreme)"}}
)

## More frame-level methods: https://pandas.pydata.org/docs/reference/frame.html

In [33]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

### 6.2.1 String Slicing and Character Replacement

In [34]:
# Characters from index 5 up to (not including) index 6: the single risk digit
# that follows the "Risk " prefix.
inspections["Risk"].str.slice(5, 6).head()

Unnamed: 0,Risk
0,1
1,2
2,3
3,1
4,1


In [35]:
inspections["Risk"].str[5:6].head()

Unnamed: 0,Risk
0,1
1,2
2,3
3,1
4,1


In [36]:
# With only a start position, slice runs to the end of each string; index 8 is
# the first character after the opening parenthesis.
inspections["Risk"].str.slice(8).head()

Unnamed: 0,Risk
0,High)
1,Medium)
2,Low)
3,High)
4,High)


In [37]:
inspections["Risk"].str[8:].head()

Unnamed: 0,Risk
0,High)
1,Medium)
2,Low)
3,High)
4,High)


In [38]:
# A stop of -1 ends the slice just before the last character, which drops the
# closing parenthesis.
inspections["Risk"].str.slice(8, -1).head()

Unnamed: 0,Risk
0,High
1,Medium
2,Low
3,High
4,High


In [39]:
inspections["Risk"].str[8:-1].head()

Unnamed: 0,Risk
0,High
1,Medium
2,Low
3,High
4,High


In [40]:
## Every method called through the .str accessor (such as .slice) returns a
## new Series, so the .str accessor can be applied again to chain another
## string method.
## regex = False treats ")" as a literal character: as a regular expression a
## lone ")" is an unbalanced parenthesis, and the default value of `regex`
## differs between pandas versions.
inspections["Risk"].str.slice(8).str.replace(")", "", regex = False).head()

Unnamed: 0,Risk
0,High
1,Medium
2,Low
3,High
4,High


In [41]:
type(inspections["Risk"].str.slice(8))

In [47]:
# Chain of three steps: take the characters at indexes 2-3 of each value,
# measure the length of that piece, then test whether the length lies between
# 1 and 2 (between() includes both endpoints by default).
inspections["Risk"].str[2:4].str.len().between(1,2)

Unnamed: 0,Risk
0,True
1,True
2,True
3,True
4,True
...,...
153805,True
153806,True
153807,True
153808,True


## 6.3 Boolean Methods

In [None]:
"Pizza" in "Jets Pizza"

True

In [None]:
"pizza" in "Jets Pizza"

False

In [48]:
inspections["Name"].str.lower().str.contains("pizza").head()

Unnamed: 0,Name
0,False
1,True
2,False
3,False
4,False


In [None]:
# Case-insensitive search: lowercase the names first, then look for "pizza".
# regex = False matches the literal text (str.contains interprets the pattern
# as a regular expression by default); na = False maps missing names to False
# so the mask is strictly Boolean and safe to use for row selection.
has_pizza = inspections["Name"].str.lower().str.contains(
    "pizza", regex = False, na = False
)
inspections[has_pizza]

Unnamed: 0,Name,Risk
1,JETS PIZZA,Risk 2 (Medium)
19,NANCY'S HOME OF STUFFED PIZZA,Risk 1 (High)
27,"NARY'S GRILL & PIZZA ,INC.",Risk 1 (High)
29,NARYS GRILL & PIZZA,Risk 1 (High)
68,COLUTAS PIZZA,Risk 1 (High)
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,Risk 1 (High)
153764,COCHIAROS PIZZA #2,Risk 1 (High)
153772,FERNANDO'S MEXICAN GRILL & PIZZA,Risk 1 (High)
153788,REGGIO'S PIZZA EXPRESS,Risk 1 (High)


In [None]:
inspections["Name"].str.lower().str.startswith("tacos").head()

0    False
1    False
2    False
3    False
4    False
Name: Name, dtype: bool

In [None]:
# Lowercase the names once, then flag the ones that begin with "tacos".
lowercase_names = inspections["Name"].str.lower()
starts_with_tacos = lowercase_names.str.startswith("tacos")
inspections[starts_with_tacos]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,Risk 1 (High)
556,TACOS EL TIO 2 INC.,Risk 1 (High)
675,TACOS DON GABINO,Risk 1 (High)
958,TACOS EL TIO 2 INC.,Risk 1 (High)
1036,TACOS EL TIO 2 INC.,Risk 1 (High)
...,...,...
143587,TACOS DE LUNA,Risk 1 (High)
144026,TACOS GARCIA,Risk 1 (High)
146174,Tacos Place's 1,Risk 1 (High)
147810,TACOS MARIO'S LIMITED,Risk 1 (High)


In [None]:
# Boolean mask of the names that, once lowercased, end with "tacos".
ends_with_tacos = inspections["Name"].str.lower().str.endswith("tacos")
inspections.loc[ends_with_tacos]

Unnamed: 0,Name,Risk
382,LAZO'S TACOS,Risk 1 (High)
569,LAZO'S TACOS,Risk 1 (High)
2652,FLYING TACOS,Risk 3 (Low)
3250,JONY'S TACOS,Risk 1 (High)
3812,PACO'S TACOS,Risk 1 (High)
...,...,...
151121,REYES TACOS,Risk 1 (High)
151318,EL MACHO TACOS,Risk 1 (High)
151801,EL MACHO TACOS,Risk 1 (High)
153087,RAYMOND'S TACOS,Risk 1 (High)


## 6.4 Splitting Strings

In [64]:
customers = pd.read_csv("customers.csv")
customers.head()

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."


In [None]:
customers["Name"].str.len().head()

0    13
1    17
2    15
3    19
4    14
Name: Name, dtype: int64

In [None]:
# Plain Python string split on the hyphen delimiter.
phone_number = "555-123-4567"
phone_number.split(sep = "-")

['555', '123', '4567']

In [53]:
# The two calls below are equivalent: `pat` is the first positional parameter.
# str.split returns a single Series whose values are lists of strings, not
# several Series (pass expand = True to get one column per piece).
# The .str accessor also works on such a Series of lists, e.g. .str.len()
# and .str.get().
# Only the result of the last expression is displayed as the cell output.
customers["Name"].str.split(pat = " ").head()
customers["Name"].str.split(" ").head()

Unnamed: 0,Name
0,"[Frank, Manning]"
1,"[Elizabeth, Johnson]"
2,"[Donald, Stephens]"
3,"[Michael, Vincent, III]"
4,"[Jasmine, Zamora]"


In [54]:
customers["Name"].str.split(" ").str.len().head()

Unnamed: 0,Name
0,2
1,2
2,2
3,3
4,2


In [55]:
# n = 1 allows at most one split, so everything after the first space stays
# together (e.g. "Vincent III").
customers["Name"].str.split(pat = " ", n = 1).head()

Unnamed: 0,Name
0,"[Frank, Manning]"
1,"[Elizabeth, Johnson]"
2,"[Donald, Stephens]"
3,"[Michael, Vincent III]"
4,"[Jasmine, Zamora]"


In [56]:
customers["Name"].str.split(pat = " ", n = 1).str.get(0).head()

Unnamed: 0,Name
0,Frank
1,Elizabeth
2,Donald
3,Michael
4,Jasmine


In [None]:
customers["Name"].str.split(pat = " ", n = 1).str.get(1).head()

0        Manning
1        Johnson
2       Stephens
3    Vincent III
4         Zamora
Name: Name, dtype: object

In [None]:
# .str.get(-1) picks the last element of each list; with n = 1 every list has
# at most two elements, so this matches .str.get(1) for the rows shown.
customers["Name"].str.split(pat = " ", n = 1).str.get(-1).head()

0        Manning
1        Johnson
2       Stephens
3    Vincent III
4         Zamora
Name: Name, dtype: object

In [58]:
customers["Name"].str.split(
    pat = " ", n = 1, expand = True
).head()

Unnamed: 0,0,1
0,Frank,Manning
1,Elizabeth,Johnson
2,Donald,Stephens
3,Michael,Vincent III
4,Jasmine,Zamora


In [59]:
# Without a limit on n, expand = True creates as many columns as the longest
# split needs; rows with fewer pieces are padded with missing values.
customers["Name"].str.split(pat = " ", expand = True).head()

Unnamed: 0,0,1,2
0,Frank,Manning,
1,Elizabeth,Johnson,
2,Donald,Stephens,
3,Michael,Vincent,III
4,Jasmine,Zamora,


In [65]:
## Split each full name at its first space only (n = 1), so the rest of the
## name stays in one piece; expand = True returns the pieces as columns.
name_parts = customers["Name"].str.split(pat = " ", n = 1, expand = True)
customers[["First Name", "Last Name"]] = name_parts

In [66]:
customers

Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [None]:
# The original Name column is redundant once First Name / Last Name exist.
customers = customers.drop(columns = ["Name"])

In [67]:
# Same operation as the previous cell, written with the `columns` keyword.
# errors = "ignore" keeps the cell idempotent: "Name" has already been removed
# when the notebook is run top to bottom, and a plain drop would raise KeyError.
customers = customers.drop(columns = "Name", errors = "ignore")

In [68]:
customers.head()

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora


## 6.5 Coding Challenge

### 6.5.1 Problems

### 6.5.2 Solutions

In [None]:
customers["Address"].str.split(",").head()

0    [6461 Quinn Groves,  East Matthew,  New Hampsh...
1    [1360 Tracey Ports Apt. 419,  Kyleport,  Vermo...
2    [19120 Fleming Manors,  Prestonstad,  Montana,...
3    [441 Olivia Creek,  Jimmymouth,  Georgia,  82991]
4    [4246 Chelsey Ford Apt. 310,  Karamouth,  Utah...
Name: Address, dtype: object

In [None]:
customers["Address"].str.split(", ").head()

0    [6461 Quinn Groves, East Matthew, New Hampshir...
1    [1360 Tracey Ports Apt. 419, Kyleport, Vermont...
2    [19120 Fleming Manors, Prestonstad, Montana, 2...
3       [441 Olivia Creek, Jimmymouth, Georgia, 82991]
4    [4246 Chelsey Ford Apt. 310, Karamouth, Utah, ...
Name: Address, dtype: object

In [None]:
customers["Address"].str.split(", ", expand = True).head()

Unnamed: 0,0,1,2,3
0,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,19120 Fleming Manors,Prestonstad,Montana,23495
3,441 Olivia Creek,Jimmymouth,Georgia,82991
4,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


In [70]:
# Target column names, in the order the pieces appear in each address.
new_cols = ["Street", "City", "State", "Zip"]

# Split every address on ", " into one column per piece, then attach the
# pieces to the frame under the new names.
address_parts = customers["Address"].str.split(pat = ", ", expand = True)
customers[new_cols] = address_parts

In [None]:
# Preview only: the result is not assigned, so `customers` keeps its Address column.
customers.drop(columns = "Address").head()

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
0,Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


In [None]:
# Permanently remove the Address column now that its pieces have their own columns.
customers = customers.drop(columns = "Address")

In [None]:
customers.tail()

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
9956,Dana,Browning,762 Andrew Views Apt. 254,North Paul,New Mexico,28889
9957,Amanda,Anderson,44188 Day Crest Apt. 901,Lake Marcia,Maine,37378
9958,Eric,Davis,73015 Michelle Squares,Watsonville,West Virginia,3933
9959,Taylor,Hernandez,129 Keith Greens,Haleyfurt,Oklahoma,98916
9960,Sherry,Nicholson,355 Griffin Valley,Davidtown,New Mexico,17581


## 6.6 A Note on Regular Expressions

In [None]:
customers["Street"].head()

0             6461 Quinn Groves
1    1360 Tracey Ports Apt. 419
2          19120 Fleming Manors
3              441 Olivia Creek
4    4246 Chelsey Ford Apt. 310
Name: Street, dtype: object

In [71]:
# Mask every run of four or more consecutive digits.
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence and makes Python emit a warning when the cell runs.
# NOTE(review): the recorded output beneath this cell shows "*" as the
# replacement text; re-run the cell to refresh it.
customers["Street"].str.replace(
    r"\d{4,}", "****", regex = True
).head()

  "\d{4,}", "*", regex = True


Unnamed: 0,Street
0,* Quinn Groves
1,* Tracey Ports Apt. 419
2,* Fleming Manors
3,441 Olivia Creek
4,* Chelsey Ford Apt. 310


In [76]:
# Mask each group of exactly three consecutive digits; in a longer number the
# digits left over after the three-digit groups stay visible.
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence and makes Python emit a warning when the cell runs.
customers["Street"].str.replace(
    r"\d{3}", "***", regex = True
).head()

  "\d{3}", "***", regex = True


Unnamed: 0,Street
0,***1 Quinn Groves
1,***0 Tracey Ports Apt. ***
2,***20 Fleming Manors
3,*** Olivia Creek
4,***6 Chelsey Ford Apt. ***


## 6.7 Summary