In [1]:
import numpy as np 
import pandas as pd

# Working with Text Data

In [2]:
# Load the book catalogue, letting pandas infer each column's dtype.
books_path = 'Data/books.csv'
df = pd.read_csv(books_path)
print(df)

        ID                   Title          Author PublicationDate
0  001276A  The Rise of the Falcon    John Admiral     25-Apr-2018
1  023125B         Controlled mind   Robert Greens     28-Aug-2016
2  005556E       Only love remains  Greta Blooming     17-Feb-2015


In [3]:
# With default inference the ID column is read as generic `object`.
print(df['ID'])

0    001276A
1    023125B
2    005556E
Name: ID, dtype: object


In [4]:
# Titles are likewise stored as `object`.
print(df['Title'])

0    The Rise of the Falcon
1           Controlled mind
2         Only love remains
Name: Title, dtype: object


In [5]:
# Author names are stored as `object` as well.
print(df['Author'])

0      John Admiral
1     Robert Greens
2    Greta Blooming
Name: Author, dtype: object


In [6]:
# Reload the same file, this time requesting the dedicated `string` dtype
# for every column.
df = pd.read_csv(filepath_or_buffer='Data/books.csv', dtype='string')
print(df)

        ID                   Title          Author PublicationDate
0  001276A  The Rise of the Falcon    John Admiral     25-Apr-2018
1  023125B         Controlled mind   Robert Greens     28-Aug-2016
2  005556E       Only love remains  Greta Blooming     17-Feb-2015


In [7]:
# The dates are still plain text at this point (dtype: string).
df['PublicationDate']

0    25-Apr-2018
1    28-Aug-2016
2    17-Feb-2015
Name: PublicationDate, dtype: string

In [8]:
# Convert the text dates to datetime64. The explicit day-abbreviated month-year
# pattern (e.g. '25-Apr-2018') makes every row parse against one known format
# instead of relying on pandas' format inference.
df['PublicationDate'] = pd.to_datetime(df['PublicationDate'], format='%d-%b-%Y')
df['PublicationDate']

0   2018-04-25
1   2016-08-28
2   2015-02-17
Name: PublicationDate, dtype: datetime64[ns]

In [9]:
# Demonstrates an explicit cast to the `string` dtype. The frame was already
# read with dtype='string', so the values are unchanged here.
df['Title'] = df['Title'].astype('string')
df['Title']

0    The Rise of the Falcon
1           Controlled mind
2         Only love remains
Name: Title, dtype: string

In [10]:
# Upper-case every title (returns a new Series; df is not modified).
df['Title'].str.upper()

0    THE RISE OF THE FALCON
1           CONTROLLED MIND
2         ONLY LOVE REMAINS
Name: Title, dtype: string

In [11]:
# Lower-case every title (returns a new Series; df is not modified).
df['Title'].str.lower()

0    the rise of the falcon
1           controlled mind
2         only love remains
Name: Title, dtype: string

In [12]:
# Add a free-text column whose values deliberately carry leading, trailing
# and repeated internal spaces, to be cleaned up in the following cells.
raw_comments = [
    ' Too long for a child   ',
    'Interesting     book  ',
    '   Very Impressive',
]
df['Comment'] = raw_comments
print(df)

        ID                   Title          Author PublicationDate  \
0  001276A  The Rise of the Falcon    John Admiral      2018-04-25   
1  023125B         Controlled mind   Robert Greens      2016-08-28   
2  005556E       Only love remains  Greta Blooming      2015-02-17   

                    Comment  
0   Too long for a child     
1    Interesting     book    
2           Very Impressive  


In [13]:
# Character counts, including the surrounding padding spaces.
df['Comment'].str.len()

0    24
1    22
2    18
Name: Comment, dtype: int64

In [14]:
# Remove leading and trailing whitespace; spaces inside the text are kept.
df['Comment'] = df['Comment'].str.strip()
print(df['Comment'])

0    Too long for a child
1    Interesting     book
2         Very Impressive
Name: Comment, dtype: object


In [15]:
# Lengths after stripping: only the outer padding is gone.
print(df['Comment'].str.len())

0    20
1    20
2    15
Name: Comment, dtype: int64


In [16]:
# Collapse every run of two or more whitespace characters into a single space.
# Deleting the literal pair '  ' only leaves one space when a run has an odd
# length; an even-length run would glue the neighbouring words together.
# The result is only displayed here; df.Comment itself is not reassigned.
df['Comment'].str.replace(r'\s{2,}', ' ', regex=True)

0    Too long for a child
1        Interesting book
2         Very Impressive
Name: Comment, dtype: object

In [17]:
# Show the whole frame. The replacement in the previous cell was not assigned
# back, so df.Comment is unchanged by it.
df

Unnamed: 0,ID,Title,Author,PublicationDate,Comment
0,001276A,The Rise of the Falcon,John Admiral,2018-04-25,Too long for a child
1,023125B,Controlled mind,Robert Greens,2016-08-28,Interesting book
2,005556E,Only love remains,Greta Blooming,2015-02-17,Very Impressive


In [18]:
# Turn the space between first name and surname into a comma so the column can
# be split on ',' later. regex=False states explicitly that ' ' is a literal:
# the default for `regex` differs between pandas 1.x and 2.x.
df['Author'] = df['Author'].str.replace(' ', ',', regex=False)
print(df['Author'])

0      John,Admiral
1     Robert,Greens
2    Greta,Blooming
Name: Author, dtype: string


In [19]:
# Substitute one word for another; the result is displayed, not stored.
df['Comment'].str.replace('book', 'novel')

0     Too long for a child
1    Interesting     novel
2          Very Impressive
Name: Comment, dtype: object

In [20]:
# With no separator given, split() breaks each title on whitespace and
# yields one list of words per row.
df['Title'].str.split()

0    [The, Rise, of, the, Falcon]
1              [Controlled, mind]
2           [Only, love, remains]
Name: Title, dtype: object

In [21]:
# Split on the comma inserted earlier: one [name, surname] list per row.
df['Author'].str.split(',')

0      [John, Admiral]
1     [Robert, Greens]
2    [Greta, Blooming]
Name: Author, dtype: object

In [22]:
# expand=True spreads the split pieces over separate columns instead of
# returning a single column of lists.
dfw = df['Author'].str.split(pat=',', expand=True)
print(dfw)

        0         1
0    John   Admiral
1  Robert    Greens
2   Greta  Blooming


In [23]:
# Confirm that the expanded split is a DataFrame, not a Series of lists.
type(dfw)

pandas.core.frame.DataFrame

In [24]:
# Store the two pieces of each author string in dedicated columns, then
# drop the combined column they were derived from.
name_parts = df['Author'].str.split(',', expand=True)
df[['Author_name', 'Author_surname']] = name_parts
del df['Author']
df

Unnamed: 0,ID,Title,PublicationDate,Comment,Author_name,Author_surname
0,001276A,The Rise of the Falcon,2018-04-25,Too long for a child,John,Admiral
1,023125B,Controlled mind,2016-08-28,Interesting book,Robert,Greens
2,005556E,Only love remains,2015-02-17,Very Impressive,Greta,Blooming


In [25]:
# Rebuild the full author name by element-wise string concatenation.
full_name = df['Author_name'] + ' ' + df['Author_surname']
df['Author'] = full_name
df

Unnamed: 0,ID,Title,PublicationDate,Comment,Author_name,Author_surname,Author
0,001276A,The Rise of the Falcon,2018-04-25,Too long for a child,John,Admiral,John Admiral
1,023125B,Controlled mind,2016-08-28,Interesting book,Robert,Greens,Robert Greens
2,005556E,Only love remains,2015-02-17,Very Impressive,Greta,Blooming,Greta Blooming


In [26]:
# Same concatenation via str.cat; the result keeps the caller's name
# (Author_name) and is only displayed here.
df['Author_name'].str.cat(df['Author_surname'], sep=' ')

0      John Admiral
1     Robert Greens
2    Greta Blooming
Name: Author_name, dtype: string

In [27]:
# The split columns are no longer needed now that Author is restored.
for split_col in ('Author_surname', 'Author_name'):
    del df[split_col]
df

Unnamed: 0,ID,Title,PublicationDate,Comment,Author
0,001276A,The Rise of the Falcon,2018-04-25,Too long for a child,John Admiral
1,023125B,Controlled mind,2016-08-28,Interesting book,Robert Greens
2,005556E,Only love remains,2015-02-17,Very Impressive,Greta Blooming


In [28]:
# Remove the uppercase letters from each ID, keeping only its digits.
digits_only = df['ID'].str.replace('([A-Z]+)', '', regex=True)
df['ID1'] = digits_only
print(df['ID1'])

0    001276
1    023125
2    005556
Name: ID1, dtype: string


In [29]:
# Remove the digits from each ID, keeping only its letter part.
letters_only = df['ID'].str.replace('([0-9]+)', '', regex=True)
df['ID2'] = letters_only
print(df['ID2'])

0    A
1    B
2    E
Name: ID2, dtype: string


In [30]:
# Drop the combined ID now that its digit and letter parts are stored in
# ID1 and ID2.
del df['ID']
df

Unnamed: 0,Title,PublicationDate,Comment,Author,ID1,ID2
0,The Rise of the Falcon,2018-04-25,Too long for a child,John Admiral,1276,A
1,Controlled mind,2016-08-28,Interesting book,Robert Greens,23125,B
2,Only love remains,2015-02-17,Very Impressive,Greta Blooming,5556,E


In [31]:
# Re-create the combined ID, then pull its digit run back out with a
# capture group; extract returns a DataFrame with one column per group.
df['ID'] = df['ID1'].str.cat(df['ID2'])
df['ID'].str.extract('([0-9]+)')

Unnamed: 0,0
0,1276
1,23125
2,5556


In [32]:
# Extract the run of uppercase letters from each ID.
df['ID'].str.extract('([A-Z]+)')

Unnamed: 0,0
0,A
1,B
2,E


In [33]:
# Remove the ID column that was rebuilt for the extract() examples.
del df['ID']
df

Unnamed: 0,Title,PublicationDate,Comment,Author,ID1,ID2
0,The Rise of the Falcon,2018-04-25,Too long for a child,John Admiral,1276,A
1,Controlled mind,2016-08-28,Interesting book,Robert Greens,23125,B
2,Only love remains,2015-02-17,Very Impressive,Greta Blooming,5556,E


In [34]:
# str.cat also accepts a list-like with one entry per row; each element is
# joined to the matching ID2 value with '-'. The result is only displayed.
country_codes = ['USA', 'ITA', 'FRA']
df['ID2'].str.cat(country_codes, sep='-')

0    A-USA
1    B-ITA
2    E-FRA
Name: ID2, dtype: string

In [35]:
# Append the same country suffix to every ID2 value. A scalar string
# broadcasts across the column, so no scratch 'temp' column has to be
# created and deleted just to feed str.cat.
# NOTE: this cell modifies ID2 in place; running it again appends the
# suffix a second time.
df['ID2'] = df['ID2'] + '-USA'
print(df['ID2'])

0    A-USA
1    B-USA
2    E-USA
Name: ID2, dtype: string


In [36]:
# Show the frame after ID2 gained its '-USA' suffix.
df

Unnamed: 0,Title,PublicationDate,Comment,Author,ID1,ID2
0,The Rise of the Falcon,2018-04-25,Too long for a child,John Admiral,1276,A-USA
1,Controlled mind,2016-08-28,Interesting book,Robert Greens,23125,B-USA
2,Only love remains,2015-02-17,Very Impressive,Greta Blooming,5556,E-USA


In [37]:
# True where the value consists solely of digits.
df['ID1'].str.isdigit()

0    True
1    True
2    True
Name: ID1, dtype: boolean

In [38]:
# False for every row: the '-' in values such as 'A-USA' is not alphanumeric.
df['ID2'].str.isalnum()

0    False
1    False
2    False
Name: ID2, dtype: boolean

In [39]:
# Position of the first case-sensitive match of 'the' in each title;
# -1 marks titles that do not contain it.
df['Title'].str.find('the')

0    12
1    -1
2    -1
Name: Title, dtype: Int64

In [40]:
# Boolean mask of titles containing the literal, case-sensitive substring
# 'the'. str.contains states the intent directly instead of comparing the
# find() position with the -1 "not found" sentinel.
df['Title'].str.contains('the', regex=False)

0     True
1    False
2    False
Name: Title, dtype: boolean