# Pandas String Methods

In [2]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan])

In [4]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
dtype: object

In [5]:
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
dtype: object

In [6]:
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
dtype: float64

In [7]:
val = 'a, b,   guido'

In [8]:
val.split(',')

['a', ' b', '   guido']

In [10]:
[i.strip() for i in val.split(',')]

['a', 'b', 'guido']

In [11]:
pieces = [i.strip() for i in val.split(',')]

In [12]:
'::'.join(pieces)

'a::b::guido'

In [13]:
'gui' in val

True

In [14]:
val.index(',')

1

In [15]:
val.find(':')

-1

In [16]:
val.count('a')

1

In [17]:
val.replace(',', '/')

'a/ b/   guido'

In [18]:
val.replace('/', '')

'a, b,   guido'

### Built-in String Methods

- `count`
- `endswith`, `startswith`
- `join`
- `index`
- `find`
- `rfind`
- `replace`
- `strip`, `rstrip`, `lstrip`
- `split`
- `lower`, `upper`
- `ljust`, `rjust`

### Regular Expressions

- `split`
- `compile`
- `findall`

In [25]:
import re

In [26]:
text = "foo  bar\t baz \tqux"

In [27]:
# Split on any run of whitespace. The pattern is a raw string so that the
# backslash reaches the regex engine untouched instead of being treated as an
# (invalid) Python string escape.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [28]:
# Precompile the whitespace pattern so it can be reused for split/findall.
# Written as a raw string: a plain '\s' is an invalid string escape and
# triggers a warning on current Python versions.
regex = re.compile(r'\s+')

In [29]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [30]:
regex.findall(text)

['  ', '\t ', ' \t']

In [77]:
titanic = pd.read_csv('data/eda_data/titanic.csv')

In [78]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
survived    891 non-null int64
pclass      891 non-null int64
name        891 non-null object
sex         891 non-null object
age         714 non-null float64
sibsp       891 non-null int64
parch       891 non-null int64
ticket      891 non-null object
fare        891 non-null float64
cabin       204 non-null object
embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


In [79]:
titanic.name[:10]

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: name, dtype: object

In [164]:
# Matches text that begins with "John". A single ^ is enough to anchor at the
# start of the string (re.MULTILINE is not used), and no quantifier is needed:
# a trailing + would repeat only the final "n", not the whole name.
pattern = r'^John'

In [165]:
# Case-insensitive compiled form of `pattern`.
# NOTE(review): the next cell passes the literal 'John$' to str.contains
# rather than using this compiled regex.
regex = re.compile(pattern, re.IGNORECASE)

In [170]:
# First five passengers whose name ends in "John", showing only the name and
# survival flag. Selecting rows and columns in one .loc call lists the matches
# directly; no grouping step is needed to produce this table.
titanic.loc[titanic['name'].str.contains('John$'), ['name', 'survived']].head()

Unnamed: 0,name,survived
45,"Rogers, Mr. William John",0
112,"Barton, Mr. David John",0
188,"Bourke, Mr. John",0
226,"Mellors, Mr. William John",1
401,"Adams, Mr. John",0


In [171]:
rockin = pd.read_csv('data/eda_data/rocking.csv')

In [173]:
rockin.head()

Unnamed: 0.1,Unnamed: 0,Song Clean,ARTIST CLEAN,Release Year,COMBINED,First?,Year?,PlayCount,F*G
0,0,Caught Up in You,.38 Special,1982.0,Caught Up in You by .38 Special,1,1,82,82
1,1,Fantasy Girl,.38 Special,,Fantasy Girl by .38 Special,1,0,3,0
2,2,Hold On Loosely,.38 Special,1981.0,Hold On Loosely by .38 Special,1,1,85,85
3,3,Rockin' Into the Night,.38 Special,1980.0,Rockin' Into the Night by .38 Special,1,1,18,18
4,4,Art For Arts Sake,10cc,1975.0,Art For Arts Sake by 10cc,1,1,1,1


In [189]:
# Release years in the 1980s: the text must start with "198" followed by one
# more digit. Note that '198*' is glob syntax; as a regex it means "19" plus
# zero or more "8"s and would also match years such as 1975 or 1998.
pattern = r'^198\d'

In [190]:
# Case-insensitive compiled form of `pattern`.
# NOTE(review): the cells below filter with str.startswith('198') rather than
# using this compiled regex.
regex = re.compile(pattern, re.IGNORECASE)

In [193]:
rockin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2230 entries, 0 to 2229
Data columns (total 9 columns):
Unnamed: 0      2230 non-null int64
Song Clean      2230 non-null object
ARTIST CLEAN    2230 non-null object
Release Year    1653 non-null object
COMBINED        2230 non-null object
First?          2230 non-null int64
Year?           2230 non-null int64
PlayCount       2230 non-null int64
F*G             2230 non-null int64
dtypes: int64(5), object(4)
memory usage: 156.9+ KB


In [194]:
# Drop songs with no release year. The null check is limited to the
# 'Release Year' column so that a missing value in some unrelated column can
# never silently remove a row; the result is a new frame and `rockin` is left
# unchanged.
rockin_clean = rockin.dropna(subset=['Release Year'])

In [197]:
# Keep only the songs whose 'Release Year' string begins with "198".
rockin_clean_80s = rockin_clean.loc[
    lambda songs: songs['Release Year'].str.startswith('198')
]

In [198]:
rockin_clean_80s.nlargest(10, 'PlayCount')

Unnamed: 0.1,Unnamed: 0,Song Clean,ARTIST CLEAN,Release Year,COMBINED,First?,Year?,PlayCount,F*G
38,38,You Shook Me All Night Long,AC/DC,1980,You Shook Me All Night Long by AC/DC,1,1,138,138
2220,2220,Legs,ZZ Top,1983,Legs by ZZ Top,1,1,121,121
2224,2224,Sharp Dressed Man,ZZ Top,1983,Sharp Dressed Man by ZZ Top,1,1,120,120
1523,1523,Tom Sawyer,Rush,1981,Tom Sawyer by Rush,1,1,114,114
205,205,Burnin' for You,Blue Oyster Cult,1981,Burnin' for You by Blue Oyster Cult,1,1,107,107
941,941,Any Way You Want It,Journey,1980,Any Way You Want It by Journey,1,1,105,105
943,943,Don't Stop Believin',Journey,1981,Don't Stop Believin' by Journey,1,1,105,105
1351,1351,Another One Bites the Dust,Queen,1980,Another One Bites the Dust by Queen,1,1,102,102
954,954,Separate Ways (Worlds Apart),Journey,1983,Separate Ways (Worlds Apart) by Journey,1,1,100,100
11,11,Back In Black,AC/DC,1980,Back In Black by AC/DC,1,1,97,97


In [201]:
# 1980s songs whose COMBINED text contains "Man", most-played first.
(
    rockin_clean_80s
    .loc[lambda songs: songs['COMBINED'].str.contains('Man')]
    .sort_values('PlayCount', ascending=False)
)

Unnamed: 0.1,Unnamed: 0,Song Clean,ARTIST CLEAN,Release Year,COMBINED,First?,Year?,PlayCount,F*G
2224,2224,Sharp Dressed Man,ZZ Top,1983,Sharp Dressed Man by ZZ Top,1,1,120,120
676,676,Man On The Corner,Genesis,1981,Man On The Corner by Genesis,1,1,12,12
913,913,The Old Man Down The Road,John Fogerty,1985,The Old Man Down The Road by John Fogerty,1,1,12,12
572,572,Forever Man,Eric Clapton,1985,Forever Man by Eric Clapton,1,1,9,9
1514,1514,New World Man,Rush,1982,New World Man by Rush,1,1,7,7


### Further Reading

   - [Pandas Documentation](https://pandas.pydata.org/docs/user_guide/text.html): Working with text data in pandas.
   - [`re` library Documentation](https://docs.python.org/3/library/re.html): Documentation for the Python `re` library.