## Pandas - Text Methods for String Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Cleaning string data

# Example: separate the username and the domain of an email address
# using the built-in Python str.split method

email = 'ayush@gmail.com'

email.split('@')

['ayush', 'gmail.com']

In [4]:
# The pandas .str accessor applies string methods, like the plain Python
# string methods, to every element of a Series

# Create a Series of names (the last element is the digit string '5')

names = pd.Series(['andrew', 'bob', 'clarie', 'david', '5'])

names.str.upper()

0    ANDREW
1       BOB
2    CLARIE
3     DAVID
4         5
dtype: object

In [5]:
# With a plain Python string: isdigit() is False for the email address

email.isdigit()

False

In [6]:
# A string made up only of digit characters gives True
'5'.isdigit()

True

In [7]:
# Similarly with the pandas .str accessor: one boolean per element

names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [8]:
# Splitting and grabbing values: pandas .str accessor versus plain Python strings

# Two comma-separated ticker strings
tech_finance = ['GOOG,APPL,AMZN', 'JPM,BAC,GS']

len(tech_finance)

2

In [9]:
# Wrap the list in a Series so the .str accessor can be used on it
tickers = pd.Series(tech_finance)

In [10]:
# Display the Series: one comma-separated string per row
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [11]:
# Split every element on commas; each row becomes a list of tickers
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [12]:
# Indexing into split results: plain Python string first, then the pandas .str accessor

tech = 'GOOG,APPL,AMZN'

tech.split(',')[0]

'GOOG'

In [19]:
# Similarly with the pandas .str accessor

tickers.str.split(',').str[0][0] # First list element of every row, then the value at index label 0

'GOOG'

In [20]:
tickers.str.split(',')[0] # Returns the whole list stored in the row with index label 0

['GOOG', 'APPL', 'AMZN']

In [21]:
tickers.str.split(',').str[0] # Returns the first list element of every row

0    GOOG
1     JPM
dtype: object

In [22]:
# Creating columns while splitting: expand=True puts each split piece in its own column

tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [24]:
# Removing unwanted characters such as whitespace or special characters

# Create a Series of messy names

messy_names = pd.Series(['andrew   ', 'bo;bo', '   clarie   '])
messy_names # the Series contains extra spaces and an unwanted ';' character; the cell below removes them

0       andrew   
1           bo;bo
2       clarie   
dtype: object

In [25]:
# Remove the literal ';' and then strip leading/trailing whitespace.
# regex=False is passed explicitly because the default of `regex` differs
# between pandas major versions; ';' is meant as a literal character here.
messy_names.str.replace(';', '', regex=False).str.strip()

0    andrew
1      bobo
2    clarie
dtype: object

In [27]:
# Clean every element, then pick the value at index label 0.
# regex=False makes the literal ';' replacement explicit and independent of
# the pandas version's default for `regex`.
messy_names.str.replace(';', '', regex=False).str.strip()[0]

# NOTE: str.strip() removes leading and trailing whitespace

'andrew'

In [28]:
# Remove the literal ';', strip surrounding whitespace, then capitalize each name.
# regex=False makes the literal replacement explicit and independent of the
# pandas version's default for `regex`.
messy_names.str.replace(';', '', regex=False).str.strip().str.capitalize()

0    Andrew
1      Bobo
2    Clarie
dtype: object

In [30]:
# Defining a custom cleanup function and applying it to each element with Series.apply

def cleanup(name):
    """Return ``name`` without ';' characters, stripped of surrounding whitespace, and capitalized."""
    without_semicolons = name.replace(';', '')
    return without_semicolons.strip().capitalize()

# Call cleanup once per element of the Series
messy_names.apply(cleanup)

0    Andrew
1      Bobo
2    Clarie
dtype: object

In [31]:
# NOTE: In the timings measured below (a 3-element Series, 10000 runs each),
# Series.apply is faster than the chained pandas .str methods, and
# np.vectorize is faster still. These numbers come from this tiny Series only.

import timeit

# Setup code, executed only once per timeit call (not on every timed run).
# It re-creates the Series and the cleanup function so the timed statements can use them.

setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# Code snippets whose execution time is to be measured

# 1) chained pandas .str methods
stmt_pandas_str = ''' 
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

# 2) Series.apply with the custom cleanup function
stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

# 3) np.vectorize wrapping the same cleanup function
stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [32]:
# Total time in seconds for 10000 runs of the chained .str version
timeit.timeit(setup=setup, stmt=stmt_pandas_str, number=10000)

6.156685400055721

In [34]:
# Total time in seconds for 10000 runs of the Series.apply version
timeit.timeit(setup=setup, stmt=stmt_pandas_apply, number=10000)

1.702268599998206

In [35]:
# Total time in seconds for 10000 runs of the np.vectorize version
timeit.timeit(setup=setup, stmt=stmt_pandas_vectorize, number=10000)

0.4119557000230998