In [1]:
import numpy as np
import pandas as pd

# `pandas vectorized string operations`

In [2]:
df = pd.read_csv('titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


`We cannot direct use string methods on a series or dataframe. If we want to use the string operations on a series we must use the series accessor.`

In [3]:
names = df['Name']

In [4]:
names

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [5]:
try:
     names.upper()
except Exception as ex:
    print(ex)

'Series' object has no attribute 'upper'


### `So in order to use the string methods, we need to use the str namespace`

In [6]:
try:
    print(names.str.upper())
except Exception as ex:
    print(ex)

0                                BRAUND, MR. OWEN HARRIS
1      CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                                 HEIKKINEN, MISS. LAINA
3           FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                               ALLEN, MR. WILLIAM HENRY
                             ...                        
886                                MONTVILA, REV. JUOZAS
887                         GRAHAM, MISS. MARGARET EDITH
888             JOHNSTON, MISS. CATHERINE HELEN "CARRIE"
889                                BEHR, MR. KARL HOWELL
890                                  DOOLEY, MR. PATRICK
Name: Name, Length: 891, dtype: object


### `Now by using str namespace, we can use all the string methods.`

## `upper(), lower(), capitalize() and title()`

In [7]:
names.head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [8]:
names.str.lower().head()

0                              braund, mr. owen harris
1    cumings, mrs. john bradley (florence briggs th...
2                               heikkinen, miss. laina
3         futrelle, mrs. jacques heath (lily may peel)
4                             allen, mr. william henry
Name: Name, dtype: object

In [9]:
names.str.upper().head()

0                              BRAUND, MR. OWEN HARRIS
1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                               HEIKKINEN, MISS. LAINA
3         FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                             ALLEN, MR. WILLIAM HENRY
Name: Name, dtype: object

In [10]:
names.str.capitalize().head()

0                              Braund, mr. owen harris
1    Cumings, mrs. john bradley (florence briggs th...
2                               Heikkinen, miss. laina
3         Futrelle, mrs. jacques heath (lily may peel)
4                             Allen, mr. william henry
Name: Name, dtype: object

In [11]:
names.str.title().head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

## `len()`

In [12]:
names.head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [13]:
names.str.len().head()

0    23
1    51
2    22
3    44
4    24
Name: Name, dtype: int64

## `split()`

    The split() method in the str namespace is used to split the string values inside the series using a seperator.

In [14]:
names.str.split(",")

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

#### `We may also use the expand parameter. We can set that to True to convert it into a dataframe.`

In [15]:
names.str.split(",", expand = True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


#### `Now to get the individual values, we can use the get() method`

In [16]:
splitted_names = names.str.split(",")
splitted_names

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [17]:
splitted_names.str.get(0)

0         Braund
1        Cumings
2      Heikkinen
3       Futrelle
4          Allen
         ...    
886     Montvila
887       Graham
888     Johnston
889         Behr
890       Dooley
Name: Name, Length: 891, dtype: object

In [18]:
splitted_names.str.get(1)

0                                  Mr. Owen Harris
1       Mrs. John Bradley (Florence Briggs Thayer)
2                                      Miss. Laina
3               Mrs. Jacques Heath (Lily May Peel)
4                                Mr. William Henry
                          ...                     
886                                    Rev. Juozas
887                           Miss. Margaret Edith
888                 Miss. Catherine Helen "Carrie"
889                                Mr. Karl Howell
890                                    Mr. Patrick
Name: Name, Length: 891, dtype: object

## `strip()`

    The strip() method is used to remove the whitespaces or a character from the start and end of a string.

In [19]:
names = pd.Series(['     Abhishek Jha', 'Ibrahim    ', 'Daniyaal', '   Sameer    ', 'Siddhart Jha'])
names

0         Abhishek Jha
1          Ibrahim    
2             Daniyaal
3           Sameer    
4         Siddhart Jha
dtype: object

In [20]:
names[0]

'     Abhishek Jha'

In [21]:
stripped_names = names.str.strip(" ")
stripped_names

0    Abhishek Jha
1         Ibrahim
2        Daniyaal
3          Sameer
4    Siddhart Jha
dtype: object

In [22]:
names[0], stripped_names[0]

('     Abhishek Jha', 'Abhishek Jha')

In [23]:
names[1], stripped_names[1]

('Ibrahim    ', 'Ibrahim')

In [24]:
names[2], stripped_names[2]

('Daniyaal', 'Daniyaal')

In [25]:
names[3], stripped_names[3]

('   Sameer    ', 'Sameer')

In [26]:
names[4], stripped_names[4]

('Siddhart Jha', 'Siddhart Jha')

#### `We also have the lstrip() and rstrip() methods which are used to remove a pattern string from the start and end of the given string.`

In [27]:
names.str.rstrip("")

0         Abhishek Jha
1          Ibrahim    
2             Daniyaal
3           Sameer    
4         Siddhart Jha
dtype: object

In [28]:
names.str.lstrip('xyz')

0         Abhishek Jha
1          Ibrahim    
2             Daniyaal
3           Sameer    
4         Siddhart Jha
dtype: object

## `replace()`

In [29]:
df = pd.read_csv('titanic.csv')
titles = df['Name'].str.split(",").str.get(1).str.strip(" ").str.split(" ", n = 1).str.get(0).apply(lambda x : x[:-1:])
titles

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Name, Length: 891, dtype: object

In [30]:
titles.value_counts()

Name
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
th            1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: count, dtype: int64

#### `replace Mlle and Ms with Miss`

In [31]:
titles = titles.str.replace('Mlle', 'Miss')
titles = titles.str.replace('Ms', 'Miss')
titles

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Name, Length: 891, dtype: object

In [32]:
titles.value_counts()

Name
Mr          517
Miss        185
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Col           2
Don           1
Mme           1
Lady          1
Sir           1
Capt          1
th            1
Jonkheer      1
Name: count, dtype: int64

#### `Now there are no Mlle and Ms in the series, they have been replaced with Miss.`

## `Boolean Masking`

    We also have some methods which return us a new boolean series which can then be used to filter the series or dataframe.
    
    Some of these methods are startswith(), endswith(), contains(), isdigit(), isalpha(), etc...

In [33]:
names

0         Abhishek Jha
1          Ibrahim    
2             Daniyaal
3           Sameer    
4         Siddhart Jha
dtype: object

In [34]:
mask = names.str.startswith('A')
names[mask]

Series([], dtype: object)

In [35]:
mask = names.str.endswith('z')
names[mask]

Series([], dtype: object)

In [36]:
mask = names.str.contains('john', case = False)
names[mask]

Series([], dtype: object)

#### `We can also use regex expression inside the contains() method. This will do the advanced searching.`

## `slicing`

In [37]:
names = df['Name'].head().copy()
names

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [38]:
names.str[1::]

0                               raund, Mr. Owen Harris
1    umings, Mrs. John Bradley (Florence Briggs Tha...
2                                eikkinen, Miss. Laina
3          utrelle, Mrs. Jacques Heath (Lily May Peel)
4                              llen, Mr. William Henry
Name: Name, dtype: object

In [39]:
names.str[:-1]

0                               Braund, Mr. Owen Harri
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                Heikkinen, Miss. Lain
3          Futrelle, Mrs. Jacques Heath (Lily May Peel
4                              Allen, Mr. William Henr
Name: Name, dtype: object

In [40]:
names.str[::2]

0                  Ban,M.Oe ars
1    Cmns r.Jh rde Foec rgsTae)
2                   Hiknn is an
3        Ftel,Ms aqe et Ll a el
4                  Aln r ila er
Name: Name, dtype: object