# Vectorized String Operations

In [2]:
import numpy as np

# Arithmetic on a NumPy array is applied to every element at once,
# so no explicit Python loop is needed to double the values.
x = np.array([2, 3, 5, 7, 11, 13])
2 * x

array([ 4,  6, 10, 14, 22, 26])

In [4]:
data = ['peter', 'Paul', 'Mary', 'gUIDO']
# A plain comprehension handles clean input, but it breaks if there are any
# missing values (for example, None has no .capitalize() method).
[name.capitalize() for name in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [5]:
import pandas as pd

In [7]:
# Wrap the list in a Series so the vectorized `.str` accessor can be used on it.
names = pd.Series(data)
names

0    peter
1     Paul
2     Mary
3    gUIDO
dtype: object

In [8]:
# Capitalize every element in one call through the Series' `.str` accessor.
names.str.capitalize()

0    Peter
1     Paul
2     Mary
3    Guido
dtype: object

### Table of Pandas String Methods

In [13]:
# Sample data: six "First Last" names reused by the cells that follow.
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

In [12]:
'''
len()   	lower()     	translate() 	islower()
ljust() 	upper()     	startswith()	isupper()
rjust() 	find()      	endswith()  	isnumeric()
center() 	rfind()     	isalnum()   	isdecimal()
zfill() 	index()     	isalpha()   	split()
strip() 	rindex()    	isdigit()   	rsplit()
rstrip() 	capitalize() 	isspace()   	partition()
lstrip() 	swapcase()  	istitle()   	rpartition()
''';

In [14]:
# Lower-case every name; the result is a new Series of strings.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [15]:
# Character count of each name (the space counts too); the result is integer-typed.
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [16]:
# Split each name on whitespace; every element becomes a list of words.
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [18]:
# Swap upper and lower case for every character of every name.
monte.str.swapcase()

0    gRAHAM cHAPMAN
1       jOHN cLEESE
2     tERRY gILLIAM
3         eRIC iDLE
4       tERRY jONES
5     mICHAEL pALIN
dtype: object

In [19]:
# Redisplay the Series: the `.str` calls above returned new objects and left
# the original values untouched.
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [31]:
# Position of the substring 'John Cleese' within each element; str.find
# reports -1 for elements that do not contain it.
# This must be the cell's final expression for the result to be displayed.
# Interactive help lookups (e.g. `monte.str.center?`) are IPython-only
# syntax and belong in a scratch cell, not in the saved notebook.
monte.str.find('John Cleese')

In [26]:
'''
Methods Using Regular Expressions
match() 	Call re.match() on each element, returning a boolean.
extract() 	Call re.match() on each element, returning matched groups as strings.
findall() 	Call re.findall() on each element
replace() 	Replace occurrences of pattern with some other string
contains() 	Call re.search() on each element, returning a boolean
count() 	Count occurrences of pattern
split() 	Equivalent to str.split(), but accepts regexps
rsplit() 	Equivalent to str.rsplit(), but accepts regexps
''';

In [40]:
monte.str.extract('([A-Za-r]+)', expand=False)

0     Graham
1       John
2       Terr
3       Eric
4       Terr
5    Michael
dtype: object

In [42]:
# Keep only whole strings whose first character is not an upper-case vowel
# and whose last character is not a lower-case vowel; others yield [].
no_vowel_at_ends = r'^[^AEIOU].*[^aeiou]$'
monte.str.findall(no_vowel_at_ends)

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

### Miscellaneous Methods

In [44]:
'''
get()           	Index each element
slice()         	Slice each element
slice_replace() 	Replace slice in each element with passed value
cat()           	Concatenate strings
repeat()        	Repeat values
normalize()     	Return Unicode form of string
pad()           	Add whitespace to left, right, or both sides of strings
wrap()          	Split long strings into lines with length less than a given width
join()          	Join strings in each element of the Series with passed separator
get_dummies()   	Extract dummy variables as a DataFrame

''';

In [55]:
# Second character (position 1) of every name; non-slice indexing on the
# `.str` accessor is the bracket spelling of `.str.get(...)`.
monte.str[1]

0    r
1    o
2    e
3    r
4    e
5    i
dtype: object

In [62]:
# Characters at positions 1 through 4 of every name; slicing the `.str`
# accessor is the bracket spelling of `.str.slice(start, stop)`.
monte.str[1:5]

0    raha
1    ohn 
2    erry
3    ric 
4    erry
5    icha
dtype: object

In [72]:
# `.str[a:b]` is shorthand for `.str.slice(a, b)`: the first two prints show
# the same characters (positions 3-5) through each spelling, and the last
# print shows the first three characters of every name.
print(monte.str[3:6],'\n')
print(monte.str.slice(3,6),'\n')
print(monte.str[0:3])

0    ham
1    n C
2    ry 
3    c I
4    ry 
5    hae
dtype: object 

0    ham
1    n C
2    ry 
3    c I
4    ry 
5    hae
dtype: object 

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object


In [92]:
# Split every full name on whitespace once and reuse the pieces, rather than
# re-splitting the whole Series for each half of the expression.
name_parts = monte.str.split()
# Element-wise concatenation: "<second word>, <first word>" for each row.
monteLASTnameFIRST = name_parts.str.get(1) + ', ' + name_parts.str.get(0)
monteLASTnameFIRST

0    Chapman, Graham
1       Cleese, John
2     Gilliam, Terry
3         Idle, Eric
4       Jones, Terry
5     Palin, Michael
dtype: object

In [97]:
# Show the type of the container and then the type of one of its elements.
container_type = type(monteLASTnameFIRST)
element_type = type(monteLASTnameFIRST[1])
print(container_type, '\n')
print(element_type)

<class 'pandas.core.series.Series'> 

<class 'str'>


### Indicator Variables

In [98]:
# Pair each name with a '|'-delimited string of codes, one string per row.
info_codes = ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']
full_monte = pd.DataFrame({'name': monte, 'info': info_codes})
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [99]:
# Split the '|'-delimited codes and build one 0/1 indicator column per
# distinct code found in the 'info' column.
full_monte['info'].str.get_dummies(sep='|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [101]:
# Resolve the data file relative to the current user's home directory
# instead of hard-coding one machine's absolute path, so the cell is
# portable and no account name is baked into the notebook.
from pathlib import Path

recipes_path = Path.home() / 'recipeitems-latest.json'
try:
    recipes = pd.read_json(recipes_path)
except (ValueError, FileNotFoundError) as e:
    # ValueError: the input could not be parsed as a JSON document.
    # FileNotFoundError: newer pandas versions raise this for a missing file
    # rather than a ValueError.
    # The failure is reported instead of halting the notebook; note that
    # `recipes` is left undefined when this branch runs.
    print(f"{type(e).__name__}:", e)

ValueError: Expected object or value


ValueError: Expected object or value