# Chapter 7 - Data Cleaning and Preparation

## 7.3 String Manipulation

In [2]:
import re

import pandas as pd
import numpy as np

### String Manipulation Cheat Sheet

In [24]:
def get_split_vals(x):
    """Normalise a '|'-separated tag string.

    Each tag is lower-cased and stripped of every character that is not an
    ASCII lowercase letter or whitespace, so quotes, punctuation and digits
    are all removed. The cleaned tags are re-joined with '|'.

    Parameters
    ----------
    x : str
        Raw tag string, e.g. 'camila cabello|"real friends"'. Non-string
        values (such as a float NaN standing in for a missing cell) are
        returned unchanged instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned tags joined by '|' (or ``x`` itself when it is not a str).
    """
    # Pass missing / non-text values through untouched so Series.apply does not crash
    if not isinstance(x, str):
        return x
    # Raw string literal: '\s' inside a normal string is an invalid escape sequence
    cleaned_tags = (re.sub(r'[^\sa-z]', '', tag.lower()) for tag in x.split('|'))
    return '|'.join(cleaned_tags)

In [25]:
# Load the '#'-delimited video sample and add a column of normalised tags
df = pd.read_csv('dataset-H4-videos.csv', sep='#')
df = df.assign(tags_split=df['tags'].apply(get_split_vals))
display(df)

# Pull out the cleaned tag string of the row labelled 2 and print it in full
t = df.at[2, 'tags_split']
print(t)

Unnamed: 0,video_id,title,tags,tags_split
0,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
1,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
2,bg7RjxsghNY,Camila Cabello - Real Friends (Audio),"camila cabello|""real friends""|""camila""|""camili...",camila cabello|real friends|camila|camilizers|...
3,qooQd8AA7_M,"Camila Cabello, Daddy Yankee - Havana (Remix -...","camila cabello|""camila""|""daddy yankee""|""havana...",camila cabello|camila|daddy yankee|havana|fift...
4,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
5,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
6,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
7,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
8,Ph54wQG8ynk,Camila Cabello - Never Be the Same,"camila cabello|""camila""|""camila full album""|""h...",camila cabello|camila|camila full album|havana...
9,bg7RjxsghNY,Camila Cabello - Real Friends (Audio),"camila cabello|""real friends""|""camila""|""camili...",camila cabello|real friends|camila|camilizers|...


camila cabello|real friends|camila|camilizers|fifth harmony|harmonizers|havana|omg|crying in the club|i have questions|know no better|h|never be the same|all these years|she loves control|young thug|inside out|consequences|somethings gotta give|in the dark|into it|havana feat young thug|camila cabello|pop|real friends|syco musicepic


In [26]:
# Bare expression: the notebook echoes the string's repr (quoted), unlike print(t)
t

'camila cabello|real friends|camila|camilizers|fifth harmony|harmonizers|havana|omg|crying in the club|i have questions|know no better|h|never be the same|all these years|she loves control|young thug|inside out|consequences|somethings gotta give|in the dark|into it|havana feat young thug|camila cabello|pop|real friends|syco musicepic'

In [17]:
# Load the views sample and bucket 'views' into 4 quantile-based bins
df = pd.read_csv('dataset-H3-videos.csv')
df['views_bins'] = pd.qcut(df['views'], q=4)
display(df.head(10))

# Build one indicator (dummy) column per bin with pd.get_dummies()
bins_indicators = pd.get_dummies(df['views_bins'])
# Turn the bin labels into plain strings for the column names
bins_indicators.columns = list(map(str, bins_indicators.columns))
display(bins_indicators.head(10))

# Join the indicators back onto df by index; as the cell's last
# expression the joined frame is what gets rendered
df.join(bins_indicators)

Unnamed: 0,video_id,views,views_bins
0,XAzqBDFs418,1375421,"(732104.0, 1558396.25]"
1,oRSVrtKph_k,1007920,"(732104.0, 1558396.25]"
2,aFuA50H9uek,3643003,"(1558396.25, 18574625.0]"
3,GhHBfDK4lE8,248880,"(33446.999, 277719.5]"
4,CPjWgk0UXps,1405034,"(732104.0, 1558396.25]"
5,8EK-QMtHhMI,1503192,"(732104.0, 1558396.25]"
6,a30K69hUJyo,1139752,"(732104.0, 1558396.25]"
7,dLRMA_lWsDY,1090128,"(732104.0, 1558396.25]"
8,rqTpMCq8uhk,606312,"(277719.5, 732104.0]"
9,3gTyF-wLa-E,264956,"(33446.999, 277719.5]"


Unnamed: 0,"(33446.999, 277719.5]","(277719.5, 732104.0]","(732104.0, 1558396.25]","(1558396.25, 18574625.0]"
0,0,0,1,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,0,0,1,0
5,0,0,1,0
6,0,0,1,0
7,0,0,1,0
8,0,1,0,0
9,1,0,0,0


Unnamed: 0,video_id,views,views_bins,"(33446.999, 277719.5]","(277719.5, 732104.0]","(732104.0, 1558396.25]","(1558396.25, 18574625.0]"
0,XAzqBDFs418,1375421,"(732104.0, 1558396.25]",0,0,1,0
1,oRSVrtKph_k,1007920,"(732104.0, 1558396.25]",0,0,1,0
2,aFuA50H9uek,3643003,"(1558396.25, 18574625.0]",0,0,0,1
3,GhHBfDK4lE8,248880,"(33446.999, 277719.5]",1,0,0,0
4,CPjWgk0UXps,1405034,"(732104.0, 1558396.25]",0,0,1,0
5,8EK-QMtHhMI,1503192,"(732104.0, 1558396.25]",0,0,1,0
6,a30K69hUJyo,1139752,"(732104.0, 1558396.25]",0,0,1,0
7,dLRMA_lWsDY,1090128,"(732104.0, 1558396.25]",0,0,1,0
8,rqTpMCq8uhk,606312,"(277719.5, 732104.0]",0,1,0,0
9,3gTyF-wLa-E,264956,"(33446.999, 277719.5]",1,0,0,0


In [6]:
# Load the sample that carries the full video metadata and show it
df = pd.read_csv('dataset-H2-videos.csv')
display(df)

# Print every description in full, each one followed by a blank line
descriptions = df['description']
for s in descriptions:
    print(s, end='\n\n')

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,1qIj0m7-sHI,18.06.01,Top 10 NFL Rookies of the 2017 Season | NFL Hi...,NFL,17,2018-01-02T23:00:00.000Z,"NFL|""Football""|""offense""|""defense""|""afc""|""nfc""...",307093,4334,972,3054,https://i.ytimg.com/vi/1qIj0m7-sHI/default.jpg,False,False,False,NFL Network's Brian Baldinger ranks and breaks...
1,k1xvol1SCx8,18.23.02,"Dua Lipa - IDGAF ft. Charli XCX, Zara Larsson,...",BBC Radio 1,10,2018-02-21T08:00:03.000Z,"dua lipa|""IDGAF""|""i dont give a fuck""|""new rul...",2559730,139744,1296,4861,https://i.ytimg.com/vi/k1xvol1SCx8/default.jpg,False,False,False,Dua Lipa performs bonus track IDGAF for the BB...
2,8sg8lY-leE8,18.14.01,Vermilion Parish teacher gets arrested at Verm...,Chris Rosa,23,2018-01-09T01:36:00.000Z,[none],2903126,29269,2343,19675,https://i.ytimg.com/vi/8sg8lY-leE8/default.jpg,False,False,False,Teacher Deyshia Hargrave was questioning the s...
3,Y6zucdAzNi4,17.27.12,Thank You Peter Capaldi | Doctor Who Christmas...,BBC America,24,2017-12-19T20:00:06.000Z,"BBC America|""Television""|""BBC""|""British""|""doct...",69249,2455,48,224,https://i.ytimg.com/vi/Y6zucdAzNi4/default.jpg,False,False,False,"Thank you, Peter Capaldi. \n\nThe magical fina..."
4,BmMuLuG1yW8,18.18.01,"G-Eazy On Stepping Away From H&M, Being A Craz...",Breakfast Club Power 105.1 FM,24,2018-01-12T13:20:55.000Z,"the breakfast club|""power1051""|""celebrity news...",781858,12207,1162,4069,https://i.ytimg.com/vi/BmMuLuG1yW8/default.jpg,False,False,False,► Listen LIVE: http://power1051fm.com/\n► Face...


NFL Network's Brian Baldinger ranks and breaks down the Top 10 Rookies from the 2017 Season.\n\nWatch full games with NFL Game Pass: https://www.nfl.com/gamepass?campaign=sp-nf-gd-ot-yt-3000342\n\nSign up for Fantasy Football! http://www.nfl.com/fantasyfootball\n\nSubscribe to NFL: http://j.mp/1L0bVBu\n\nThe NFL YouTube channel is your home for immediate in-game highlights from your favorite teams and players, full NFL games, behind the scenes access and more!\n\nCheck out our other channels:\nNFL Network http://www.youtube.com/nflnetwork\nNFL Films http://www.youtube.com/nflfilms\n\nFor all things NFL, visit the league's official website at http://www.nfl.com/\n\nWatch NFL Now: https://www.nfl.com/now\nListen to NFL podcasts: http://www.nfl.com/podcasts\nWatch the NFL network: http://nflnonline.nfl.com/\nDownload the NFL mobile app: https://www.nfl.com/apps\n2017 NFL Schedule: http://www.nfl.com/schedules\nBuy tickets to watch your favorite team:  http://www.nfl.com/tickets\nShop NFL:

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)