# Demo 1 | String Manipulation Demonstration
<hr>

Using the Lazada CIKM 2017 Dataset, a few string manipulation methods in Python are presented.

In [1]:
import re

import pandas as pd
# Show full cell contents instead of truncating long strings in displayed frames.
# None means "no limit"; the older sentinel -1 has been deprecated since pandas 1.0.
pd.options.display.max_colwidth = None
import matplotlib.pyplot as plt
import seaborn as sns

import lzd_utils

In [2]:
# Read from CSV file (via the lzd_utils helper imported above) and keep the raw frame untouched,
# so that this cell can be re-run without reloading or double-filtering.
df_raw = lzd_utils.read_lazada_csv()

# Slice, dice and reindex the data:
#   - rows: Singapore listings in Health & Beauty > Skin Care
#   - columns: only the text/price fields used in this demo
#   - id: the row label from the raw frame, preserved before the index is reset to 0..n-1
# Building the result in a single chain (rather than assigning a column onto a filtered
# slice) avoids pandas' SettingWithCopyWarning.
is_sg_skin_care = (
    (df_raw.country == 'sg')
    & (df_raw.category_lvl_1 == 'Health & Beauty')
    & (df_raw.category_lvl_2 == 'Skin Care')
)
df = (
    df_raw.loc[is_sg_skin_care, ['title', 'category_lvl_1', 'category_lvl_2', 'desc', 'price']]
    .assign(id=lambda frame: frame.index)
    .reset_index(drop=True)
)

display(df.head())

Unnamed: 0,title,category_lvl_1,category_lvl_2,desc,price,id
0,Nature Republic Fresh Green Tea Foam Cleanser.,Health & Beauty,Skin Care,<ul> <li>Top Korean Cosmetic Brand</li> <li>Official Nature Republic Products</li> <li>150ml&nbsp;</li> <li>All skin type</li> <li>Moisturizing</li> <li>Made in Korea</li> </ul>,13.1,168
1,Cathy Doll Chilli Bomb Sexy Firming Cream 260g,Health & Beauty,Skin Care,<ul> <li>Slimming Micro-beads Formula</li> <li>Instant Reduction Of Excessive Fats</li> <li>Keeps Skin Young And Smooth</li> <li>Gives Precisely Proportional Curves</li> </ul>,12.5,345
2,La Roche-Posay Effaclar Mat Sebo-regulating Moisturiser,Health & Beauty,Skin Care,"<ul> <li>Hydrates, inhibits sebum production, and minimises enlarged pores without overly-drying the skin.</li> <li>Oil free texture</li> <li>Paraben Free</li> </ul>",35.0,719
3,Ahava Dermud Nourishing Body Cream 844150 200ml/6.8oz (EXPORT),Health & Beauty,Skin Care,<ul> <li>Formulated with Dead Sea mud; minerals natural ingredients</li> <li>Incredibly regenerates; hydrates soothes skin</li> <li>Promptly relieves dryness; cracking; itchiness redness</li> </ul>,39.5,885
4,Simple Exfoliating Facial Cleansing Wipes With Vitamin 25's x 3 packs (Free 1 Pack),Health & Beauty,Skin Care,<ul> <li>4 packs (25 wipes/pack)</li> <li>Gently remove dead skin cells</li> <li>Keeps skin revived and revitalised</li> </ul>,39.9,1095


In [3]:
# Get the description of 1 product (row label 0, column 'desc')
s = df.loc[0, 'desc']
print(s)

# Get first 10 titles.
# NOTE: .loc slices by label and INCLUDES the end label, so df.loc[:10, 'title'] would
# return 11 rows (labels 0 through 10); head(10) returns exactly the first 10.
t10 = df['title'].head(10)
display(t10)

<ul> <li>Top Korean Cosmetic Brand</li> <li>Official Nature Republic Products</li> <li>150ml&nbsp;</li> <li>All skin type</li> <li>Moisturizing</li> <li>Made in Korea</li> </ul> 


0     Nature Republic Fresh Green Tea Foam Cleanser.                                                      
1     Cathy Doll Chilli Bomb Sexy Firming Cream 260g                                                      
2     La Roche-Posay Effaclar Mat Sebo-regulating Moisturiser                                             
3     Ahava Dermud Nourishing Body Cream 844150 200ml/6.8oz (EXPORT)                                      
4     Simple Exfoliating Facial Cleansing Wipes With Vitamin 25's x 3 packs (Free 1 Pack)                 
5     [LAZADA Exclusive] L'Oreal Paris Super Liner Black Lacquer + Micellar Water (Blue) Christmas Box Set
6     Academie Hypo-Sensible Nourishing Cream 50ml/1.7oz (EXPORT)                                         
7     Evoluderm Face &amp; Body Cream Peach 200ml                                                         
8     [BUNDLE DEAL] 2 Feelrekorea SOOTHING GEL Aloe Vera + SNAIL (300g)                                   
9     su:m37Ëš Secret Repair Toner 15

### Transformation of Strings

- `s.upper()` and `s.lower()`

In [16]:
# Show the original description again for reference before transforming it
print(s)

<ul> <li>Top Korean Cosmetic Brand</li> <li>Official Nature Republic Products</li> <li>150ml&nbsp;</li> <li>All skin type</li> <li>Moisturizing</li> <li>Made in Korea</li> </ul> 


In [4]:
# str.upper() returns a copy of the string with all letters upper-cased; s itself is unchanged.
# Note that the markup is upper-cased too (e.g. the entity &nbsp; becomes &NBSP;).
print(s.upper())

<UL> <LI>TOP KOREAN COSMETIC BRAND</LI> <LI>OFFICIAL NATURE REPUBLIC PRODUCTS</LI> <LI>150ML&NBSP;</LI> <LI>ALL SKIN TYPE</LI> <LI>MOISTURIZING</LI> <LI>MADE IN KOREA</LI> </UL> 


In [5]:
# Similarly, str.lower() returns a copy of the string with all letters lower-cased.
# The result is kept as s2 because the examples that follow operate on the lower-cased text.
s2 = s.lower()
print(s2)

<ul> <li>top korean cosmetic brand</li> <li>official nature republic products</li> <li>150ml&nbsp;</li> <li>all skin type</li> <li>moisturizing</li> <li>made in korea</li> </ul> 


### Check properties of string

- `s.count()`, `s.find()`, `s.rfind()`, `s.startswith()`, `s.endswith()`

In [17]:
# Show the lower-cased description that the checks below are run against
print(s2)

<ul> <li>top korean cosmetic brand</li> <li>official nature republic products</li> <li>150ml&nbsp;</li> <li>all skin type</li> <li>moisturizing</li> <li>made in korea</li> </ul> 


In [6]:
# str.count(x) returns the number of non-overlapping occurrences of the substring x in str.
# The match is on substrings, not whole words: 'korea' is also found inside 'korean'.
print(s2.count('korea'))

<ul> <li>top korean cosmetic brand</li> <li>official nature republic products</li> <li>150ml&nbsp;</li> <li>all skin type</li> <li>moisturizing</li> <li>made in korea</li> </ul> 

2


In [18]:
# str.find(x) finds the FIRST occurrence of x and returns the 0-based index at which it starts
# (or -1 when x does not occur in str).
print(s2.find('li'))
print(s2.find('korea'))

6
13


In [19]:
# str.rfind(x) finds the LAST occurrence of x and returns the 0-based index at which it starts
# (or -1 when x does not occur in str).
print(s2.rfind('li'))
print(s2.rfind('korea'))

168
161


In [9]:
# str.startswith(x) returns True if str starts with x, False otherwise.
# Lower-case each sampled title and print only the ones that open with a '[' tag.
for t in t10:
    t2 = t.lower()
    if not t2.startswith('['):
        continue
    print(t2)

[lazada exclusive] l'oreal paris super liner black lacquer + micellar water (blue) christmas box set
[bundle deal] 2 feelrekorea soothing gel aloe vera + snail (300g)


In [10]:
# str.endswith(x) returns True if str ends with x, and returns False otherwise.
# Lower-case each sampled title and print only the ones that finish with '(export)'.
for t in t10:
    t2 = t.lower()
    if not t2.endswith('(export)'):
        continue
    print(t2)

ahava dermud nourishing body cream 844150 200ml/6.8oz (export)
academie hypo-sensible nourishing cream 50ml/1.7oz (export)


### Manipulation of Strings

- `s.replace()`, `s.strip()`, `s.split()`

- `str.join(list)`

In [11]:
# str.replace(x, y) returns a copy of str in which ALL occurrences of x are replaced by y
s3 = s2
print(s3)
print()

# First pass: swap the list wrapper tags for a space (calls can be chained,
# since each replace returns a new string).
s3 = s3.replace('<ul>', ' ').replace('</ul>', ' ')

# Notice how there are still spaces in the beginning and end of the string
# (the '#' markers make the string boundaries visible).
print('#{}#'.format(s3))
print()

# Second pass: swap the list item tags for a space as well.
s3 = s3.replace('<li>', ' ').replace('</li>', ' ')
print('#{}#'.format(s3))

<ul> <li>top korean cosmetic brand</li> <li>official nature republic products</li> <li>150ml&nbsp;</li> <li>all skin type</li> <li>moisturizing</li> <li>made in korea</li> </ul> 

#  <li>top korean cosmetic brand</li> <li>official nature republic products</li> <li>150ml&nbsp;</li> <li>all skin type</li> <li>moisturizing</li> <li>made in korea</li>   #

#   top korean cosmetic brand   official nature republic products   150ml&nbsp;   all skin type   moisturizing   made in korea    #


In [12]:
print('#{}#'.format(s3))
# str.strip() returns a copy of str with both leading and trailing whitespace removed
s4 = s3.strip()
# Notice now that the spaces in the beginning and end of str are removed
# (the '#' markers make the string boundaries visible); the spaces inside the string are untouched.
print('#{}#'.format(s4))

#   top korean cosmetic brand   official nature republic products   150ml&nbsp;   all skin type   moisturizing   made in korea    #
#top korean cosmetic brand   official nature republic products   150ml&nbsp;   all skin type   moisturizing   made in korea#


In [13]:
# Collapse every run of two or more spaces into a single space.
# One s.replace('  ', ' ') pass only shortens each run (roughly halving it), so a fixed
# number of passes fails on longer runs; repeat until no double space is left.
# (A regex such as re.sub(' +', ' ', s) would do the same in one call.)
print(s4)
s5 = s4
while '  ' in s5:
    s5 = s5.replace('  ', ' ')
print(s5)

top korean cosmetic brand   official nature republic products   150ml&nbsp;   all skin type   moisturizing   made in korea
top korean cosmetic brand official nature republic products 150ml&nbsp; all skin type moisturizing made in korea


In [14]:
# str.split(sep) breaks one string into a list of substrings, cutting at every occurrence of the delimiter sep.
# Here the delimiter is a single space (' '); other delimiters can be passed, e.g. ',' or '#'.
# Called with no argument, str.split() instead splits on runs of any whitespace.
l5 = s5.split(' ')
print(l5)

['top', 'korean', 'cosmetic', 'brand', 'official', 'nature', 'republic', 'products', '150ml&nbsp;', 'all', 'skin', 'type', 'moisturizing', 'made', 'in', 'korea']


In [15]:
# sep.join(iterable) concatenates a sequence of strings, inserting the delimiter sep
# between consecutive items -- here ','. It is the counterpart of str.split(sep).
l6 = l5[:4]  # first four words only, to keep the example short
print(l6)
print(','.join(l6))

['top', 'korean', 'cosmetic', 'brand']
top,korean,cosmetic,brand


**References**

- [GitHub / minhcp](https://github.com/minhcp/CIKMCup17) for the dataset
- Python for Data Analysis, 2nd Edition, McKinney (2017)