# Text Methods for String Data

- pandas provides special string-manipulation methods through the `.str` accessor of a Series (for example, a single DataFrame column).
- With it we can apply a string function to every element of a whole Series at once.

In [1]:
import numpy as np
import pandas as pd

In [2]:
email = 'ale.alberga1@gmail.com'

In [3]:
email.split('@')

['ale.alberga1', 'gmail.com']

In [4]:
names = pd.Series(['andrew', 'bobo', 'cinnamon', 'david', '5'])

In [5]:
names

0      andrew
1        bobo
2    cinnamon
3       david
4           5
dtype: object

In [6]:
names.str.upper()

0      ANDREW
1        BOBO
2    CINNAMON
3       DAVID
4           5
dtype: object

In [7]:
# The .str accessor gives us access to the whole string-method library!
names.str.lower()

0      andrew
1        bobo
2    cinnamon
3       david
4           5
dtype: object

In [9]:
# Returns a boolean Series: True where the string is made up only of digits
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [10]:
# Boolean-mask selection: keep only the entries where isdigit() is True
names[names.str.isdigit()]

4    5
dtype: object

In [11]:
tech_finance = ['GOOG, APPL, AMZN', 'JPM, BAC, GS']

In [12]:
len(tech_finance)

2

In [13]:
tickers = pd.Series(tech_finance)

In [14]:
tickers

0    GOOG, APPL, AMZN
1        JPM, BAC, GS
dtype: object

In [15]:
# Splitting on ',' alone keeps the space that follows each comma, so every
# piece after the first starts with a blank (split on ', ' to avoid that)
tickers.str.split(',')

0    [GOOG,  APPL,  AMZN]
1        [JPM,  BAC,  GS]
dtype: object

In [16]:
tech = 'GOOG, APPL, AMZN'
tech.split(',')[0]

'GOOG'

In [20]:
# With expand=True the split pieces are expanded into separate columns (a DataFrame) instead of lists
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [21]:
messy_names = pd.Series(['andrew  ', 'bob;o', '  claire  '])

In [22]:
messy_names

0      andrew  
1         bob;o
2      claire  
dtype: object

In [23]:
# Let's clean this up

In [25]:
# Remove the semi-colons.
# regex=False makes ';' a literal match on every pandas version (the default
# for `regex` changed between pandas 1.x and 2.x).
messy_names.str.replace(';', '', regex=False)

0      andrew  
1          bobo
2      claire  
dtype: object

In [27]:
# Also remove the surrounding whitespace once the semi-colons are gone.
# regex=False: treat ';' as a literal string rather than a pattern, whatever
# the installed pandas version uses as its default.
messy_names.str.replace(';', '', regex=False).str.strip()

0    andrew
1      bobo
2    claire
dtype: object

In [31]:
# Capitalise the first letter. Wrapping the expression in parentheses is the
# typical way to chain .str calls, one step per line.
# We could of course pass a custom function to .apply() instead;
# use .apply() when the clean-up needs if statements.
(
    messy_names.str.replace(';', '', regex=False)  # literal ';', not a regex
    .str.strip()
    .str.capitalize()
)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [32]:
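# Which is more efficient, the .str methods or .apply()?
# .apply() with a plain Python function is often more efficient than chained .str calls,
# but the gap depends on the data, so time both (e.g. with timeit) before relying on it.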