# Data Preprocessing

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
sds = pd.read_csv('../data/superdatascience.csv')

In [3]:
sds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   episode_name     682 non-null    object
 1   length_episode   682 non-null    object
 2   context_episode  682 non-null    object
 3   guest_name       682 non-null    object
 4   guest_info       682 non-null    object
 5   text_episode     680 non-null    object
dtypes: object(6)
memory usage: 32.1+ KB


In [4]:
sds.head(5)

Unnamed: 0,episode_name,length_episode,context_episode,guest_name,guest_info,text_episode
0,SDS 381: How to Avoid Failing at Digital Trans...,60 minutes,BusinessData Science,"Podcast Guest: Tony SaldanhaWednesday Jul 08, ...","Subscribe on Website, Apple Podcasts, Spotify,...","Kirill Eremenko:\tThis is episode number 381, ..."
1,SDS 061: Discovering Data Science workflows an...,62 minutes,Machine LearningData SciencePython,Podcast Guest: Daniel WhitenackThursday Jun 15...,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill:\tThis is episode number 61 with data s...
2,SDS 049: Great tips on building a successful A...,65 minutes,BusinessDatabase,"Podcast Guest: Jim HadleyThursday May 04, 2017","Subscribe on Website, Apple Podcasts, Spotify,...",Kirill:\tThis is episode number 49 with Founde...
3,"SDS 029: AI in Recruitment, Machine Learning, ...",66 minutes,BusinessMachine LearningData ScienceArtificial...,"Podcast Guest: Ben TaylorFriday Feb 24, 2017","Subscribe on Website, Apple Podcasts, Spotify,...","Kirill:\tThis is episode number 29, with Chief..."
4,SDS 254: Two Wolves,6 minutes,Data Science,"Podcast Guest: Kirill EremenkoFriday Apr 19, 2019","Subscribe on Website, Apple Podcasts, Spotify,...","This is FiveMinuteFriday, episode number 254, ..."


In [5]:
# Extracting episode_number and episode_name from episode_name column
sds['episode_number'] = sds['episode_name'].str.split(':', expand = True)[0]
sds['episode_name'] = sds['episode_name'].str.split(':', expand = True)[1]
sds['episode_number'] = sds['episode_number'].str.replace('SDS', '')

# Removing Podcast Guest text from guest_name column
sds['guest_name'] = sds['guest_name'].str.split(':', expand = True)[1]

# Removing minutes from length_episode column
sds['length_episode'] = sds['length_episode'].str.split(' ', expand = True)[0]

# Extracting date from guest_name column and making a new column episode_date
sds['episode_date'] = sds['guest_name'].str.extract(r'(\b[A-Za-z]{3}\s\d{2},\s\d{4}\b)', expand = False).str.strip()
sds['episode_year'] = sds['episode_date'].str.split(',', expand = True)[1]

# Replacing/removing date in guest_name column 
sds['guest_name'] = sds['guest_name'].str.replace(r'(\b[A-Za-z]{3}\s\d{2},\s\d{4}\b)', '')

# Extracting day from guest_name column and making a new column episode_day
sds['episode_day'] = sds['guest_name'].str.extract(r'(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)', expand = False).str.strip()

#Replacing/removing day in guest_name column 
sds['guest_name'] = sds['guest_name'].str.replace('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday', '')

#Adding spaces between some words in context_episode column
sds['context_episode'] = sds['context_episode'].str.replace('([a-z])([A-Z])', r'\1 \2')

# changing the data types for episode_number and length_episode
sds['episode_number'] = sds['episode_number'].astype('int')
sds['length_episode'] = sds['length_episode'].astype('int')

  sds['guest_name'] = sds['guest_name'].str.replace(r'(\b[A-Za-z]{3}\s\d{2},\s\d{4}\b)', '')
  sds['guest_name'] = sds['guest_name'].str.replace('Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday', '')
  sds['context_episode'] = sds['context_episode'].str.replace('([a-z])([A-Z])', r'\1 \2')


In [6]:
sds.head(2)

Unnamed: 0,episode_name,length_episode,context_episode,guest_name,guest_info,text_episode,episode_number,episode_date,episode_year,episode_day
0,How to Avoid Failing at Digital Transformation,60,Business Data Science,Tony Saldanha,"Subscribe on Website, Apple Podcasts, Spotify,...","Kirill Eremenko:\tThis is episode number 381, ...",381,"Jul 08, 2020",2020,Wednesday
1,Discovering Data Science workflows and the im...,62,Machine Learning Data Science Python,Daniel Whitenack,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill:\tThis is episode number 61 with data s...,61,"Jun 15, 2017",2017,Thursday


In [7]:
# Sorting dataframe on episode number
sds = sds.sort_values('episode_number').reset_index(drop = True)

In [9]:
sds['text_episode'].loc[52] 

"Kirill:\tThis is episode number 53 with Aspiring Data Scientist Virginia Mendonca.(background music plays)Welcome to the SuperDataScience podcast. My name is Kirill Eremenko, data science coach and lifestyle entrepreneur. And each week we bring you inspiring people and ideas to help you build your successful career in data science. Thanks for being here today and now let’s make the complex simple.(background music plays)Hello, hello, hello. Hope you're having a great week, a very exciting and interesting week, and today we've got an inspiring guest. Virginia is an aspiring data scientist. So Virginia came from a background in databases and now she's decided to transition her career into data science. And the reason for that is because she has a greater vision for her future. She has a vision of doing good for the world. And she can see that it will be much easier to do that by knowing data science. How cool is that.In this podcast, we talked about quite a few things. We talked about h

In [10]:
# Add a new column host_episode to sds dataframe
sds['host_episode'] = 'Kirril Eremenko'  # Set initial value to 'Kirril Eremenko '

# Set value to 'Jon Krohn' from column 430 to 681
sds.loc[430:681, 'host_episode'] = 'Jon Krohn'

In [None]:
sds.info()

In [None]:
sds.to_csv('../data/sds_cleaned.csv', index = False)

In [None]:
sds['guest_info'].loc[0]

In [None]:
sds['text_episode'].isna().value_counts()

In [None]:
sds['guest_name'].nunique()

In [None]:
sds['guest_name'].value_counts()

In [48]:
sds['text_episode'].loc[30]

"Kirill:\tThis is episode number 31, with my good friend David Tanaskovic.(background music plays)Welcome to the SuperDataScience podcast. My name is Kirill Eremenko, data science coach and lifestyle entrepreneur. And each week we bring you inspiring people and ideas to help you build your successful career in data science. Thanks for being here today and now let’s make the complex simple.(background music plays)Hello and welcome to the SuperDataScience podcast. Super excited to have you on board, and today I have invited my good friend David Tanaskovic onto the show. So something you need to know about David is that he's the happiest person I know on this planet. He's always super excited about life, nothing can bring him down, and he's just generally having fun going through life. Like you know sometimes you wish you would enjoy the things you do and your day to day activities, more like sometimes a day passes by and you feel like oh, wow, like I didn't really enjoy this day as much 

In [None]:
sds

In [15]:
# Function to split the text over the rows

def text_split(df, column):
    
    sds_index = column.str.findall('(?:^|\\xa0|(?:\\n)+)([A-Za-z\s]+):').explode().to_frame()
    sds_index['episode_index'] = 1 
    sds_index['episode_index'] = sds_index.groupby(sds_index.index)['episode_index'].cumsum()
    sds_index = sds_index.reset_index().set_index(['index','episode_index']).rename(columns = {column.name :'speaker'})
    #sds_index
    split_text = column.str.split('(?:^|\\xa0|(?:\\n)+)[A-Za-z\s]+:', expand = True).stack().to_frame()#.droplevel(-1)
    
    split_text.index = split_text.index.rename(sds_index.index.names)
    #return sds_index,split_text 
    sds_index = sds_index.merge(split_text, left_index = True, right_index=True, how = 'right').droplevel(-1)
    return df.merge(sds_index,left_index=True, right_index=True)


In [16]:

sds_text = text_split(sds, sds['text_episode'])

In [17]:
sds_text

Unnamed: 0,episode_name,length_episode,context_episode,guest_name,guest_info,text_episode,episode_number,episode_date,episode_year,episode_day,host_episode,speaker,0
0,"Ruben Kogel on Self-Serve Analytics, R vs Pyt...",42,Business Data Science Database,Ruben Kogel,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill: This is episode number one with ex-che...,1,"Sep 10, 2016",2016,Saturday,Kirril Eremenko,,
0,"Ruben Kogel on Self-Serve Analytics, R vs Pyt...",42,Business Data Science Database,Ruben Kogel,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill: This is episode number one with ex-che...,1,"Sep 10, 2016",2016,Saturday,Kirril Eremenko,Kirill,This is episode number one with ex-chemical e...
0,"Ruben Kogel on Self-Serve Analytics, R vs Pyt...",42,Business Data Science Database,Ruben Kogel,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill: This is episode number one with ex-che...,1,"Sep 10, 2016",2016,Saturday,Kirril Eremenko,Kirill,"Hey guys, welcome to the Podcast. I’ve got Ru..."
0,"Ruben Kogel on Self-Serve Analytics, R vs Pyt...",42,Business Data Science Database,Ruben Kogel,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill: This is episode number one with ex-che...,1,"Sep 10, 2016",2016,Saturday,Kirril Eremenko,Ruben,Thank you! Thanks for having me over. I’m doi...
0,"Ruben Kogel on Self-Serve Analytics, R vs Pyt...",42,Business Data Science Database,Ruben Kogel,"Subscribe on Website, Apple Podcasts, Spotify,...",Kirill: This is episode number one with ex-che...,1,"Sep 10, 2016",2016,Saturday,Kirril Eremenko,Kirill,Awesome. It’s great to hear you and for those...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,"Contextual A.I. for Adapting to Adversaries, ...",81,Data Science Artificial Intelligence,Matar Haller,"Subscribe on Website, Apple Podcasts, Spotify,...",Jon Krohn:\t00:00:05\tThis is episode number 6...,683,"May 30, 2023",2023,Tuesday,Jon Krohn,Jon Krohn,"\t01:17:22Yeah, right. Yeah, as I mean, it act..."
681,"Contextual A.I. for Adapting to Adversaries, ...",81,Data Science Artificial Intelligence,Matar Haller,"Subscribe on Website, Apple Podcasts, Spotify,...",Jon Krohn:\t00:00:05\tThis is episode number 6...,683,"May 30, 2023",2023,Tuesday,Jon Krohn,Matar Haller,\t01:17:45\tHappy to.
681,"Contextual A.I. for Adapting to Adversaries, ...",81,Data Science Artificial Intelligence,Matar Haller,"Subscribe on Website, Apple Podcasts, Spotify,...",Jon Krohn:\t00:00:05\tThis is episode number 6...,683,"May 30, 2023",2023,Tuesday,Jon Krohn,Jon Krohn,"\t01:17:46\tNice. Well, yeah, so you mentioned..."
681,"Contextual A.I. for Adapting to Adversaries, ...",81,Data Science Artificial Intelligence,Matar Haller,"Subscribe on Website, Apple Podcasts, Spotify,...",Jon Krohn:\t00:00:05\tThis is episode number 6...,683,"May 30, 2023",2023,Tuesday,Jon Krohn,Matar Haller,\t01:18:04\tThank you for having me. This was ...


In [18]:
sds_text = sds_text.rename(columns = {0 : 'episode_split_text' })

In [46]:
sds_text.groupby(['episode_number','guest_name'])['episode_split_text'].count().sort_values().head(270)

episode_number  guest_name        
342              Kirill Eremenko      1
514              Jon Krohn            1
244              Kirill Eremenko      1
242              Kirill Eremenko      1
518              Jon Krohn            1
                                     ..
30               Kirill Eremenko      2
31               David Tanaskovic     2
32               Kirill Eremenko      2
34               Kirill Eremenko      2
35               David Venturi        2
Name: episode_split_text, Length: 270, dtype: int64

In [47]:
sds_text[sds_text['guest_name'] == ' David Tanaskovic ']

Unnamed: 0,episode_name,length_episode,context_episode,guest_name,guest_info,text_episode,episode_number,episode_date,episode_year,episode_day,host_episode,speaker,episode_split_text
30,"AB Testing, Kissmetrics and ways to a better ...",64,Data Science Data Visualization,David Tanaskovic,"Subscribe on Website, Apple Podcasts, Spotify,...","Kirill:\tThis is episode number 31, with my go...",31,"Mar 02, 2017",2017,Thursday,Kirril Eremenko,,
30,"AB Testing, Kissmetrics and ways to a better ...",64,Data Science Data Visualization,David Tanaskovic,"Subscribe on Website, Apple Podcasts, Spotify,...","Kirill:\tThis is episode number 31, with my go...",31,"Mar 02, 2017",2017,Thursday,Kirril Eremenko,Kirill,"\tThis is episode number 31, with my good frie..."


In [44]:
print(sds_text['episode_split_text'].loc[38])

38                                                     
38    \tThis is episode number 39 with Director of D...
Name: episode_split_text, dtype: object


In [None]:
sds_text['episode_split_text'] = sds_text['episode_split_text'].str.replace('\\t|\\n|\\xa0', '')

In [None]:
sds_text['episode_split_text'] = sds_text['episode_split_text'].str.replace('\(?\d{2}:\d{2}\):| ?(\d{2}:)+\d{2}', '')

In [None]:
sds_text['episode_split_text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0).value_counts()

In [None]:
sds_text = sds_text[sds_text['episode_split_text'] != '']

In [None]:
sds_text

In [None]:
sds_text.to_csv('../data/sds_text.csv')

In [None]:
#split_text.to_frame().rename()

In [None]:
#sds['text_episode'].str.split('(?:^|\\xa0|(?:\\n)+)[A-Za-z\s]+:', expand = True).stack()#.droplevel(-1)

In [None]:
#sds['text_episode'].str.split('(?:^|\\xa0|\\n)[A-Za-z\s]+:', expand = True).stack().loc[677]

In [None]:
#sds_index = sds['text_episode'].str.findall('(?:^|\\xa0|(?:\\n)+)([A-Za-z\s]+:)').explode().to_frame()#.shape#.stack().loc[0]

In [None]:
# Extracting the speakers from the text and creating index

#sds_index = sds['text_episode'].str.findall('(?:^|\\xa0|(?:\\n)+)([A-Za-z\s]+:)').explode().to_frame()
#sds_index['episode_index'] = 1 
#sds_index['episode_index'] = sds_index.groupby(sds_index.index)['episode_index'].cumsum()
#sds_index = sds_index.reset_index().set_index(['index','episode_index']).rename(columns = {'text_episode':'speaker'})
#sds_index

In [None]:
#sds_index['episode_index'] = sds_index.groupby(sds_index.index)['episode_index'].cumsum()

In [None]:
#sds_index

In [None]:
#sds_index = sds_index.reset_index().set_index(['index','episode_index']).rename(columns = {'text_episode':'speaker'})

In [None]:
#sds_index

In [None]:
#len(set(sds['text_episode'].str.findall('(^|\\xa0|\\n)[A-Za-z\s]+:').explode().index)) #.stack().droplevel(-1)

In [None]:
#sds['text_episode'].str.findall('(^|\\xa0|\\n)[A-Za-z\s]+:').explode()

In [None]:
#sds_guest = sds[~((sds['guest_name'] == ' Kirill Eremenko ') | (sds['guest_name'] == ' Jon Krohn '))].copy().reset_index(drop = True)

In [None]:
#sds_host = sds[(sds['guest_name'] == ' Kirill Eremenko ') | (sds['guest_name'] == ' Jon Krohn ')].reset_index(drop = True)

In [None]:
sds_host['context_episode'].unique()

In [None]:
sds_host[sds_host['context_episode'] == 'Data Science']

In [None]:
sds_guest['context_episode'].unique()

In [None]:
sds['context_episode'].value_counts()

In [None]:
sds_guest['text_episode'].loc[7]

In [None]:
sds['text_episode'] = sds['text_episode'].astype(str)

In [None]:
sds['text_episode'].apply(lambda x: pd.Series(x.split('[A-Z][a-z]+:')))

In [None]:
sds['text_episode'].str.split('[A-Z][a-z]+:', expand = True).stack()

In [None]:
sds_split

In [None]:
sds_split[sds_split['guest_name'] == ' Jon Krohn ']

In [None]:
sds_split.info()

In [None]:
#sds = sds.merge(sds_split.to_frame(),left_index=True, right_index=True)


In [None]:
sds.head(30)

In [None]:
sds_guest.to_csv('../data/sds_guest.csv', index = False)

In [None]:
sds[sds['context_episode'] == 'Data Science']

In [None]:
sds['episode_name'].unique()

In [None]:
sds['text_episode'].loc[13]

In [None]:
#text = BeautifulSoup(text, 'lxml').get_text(separator=' ', strip=True)

#text = re.findall(r'[a-z]+', text.lower())

In [None]:
sds['guest_info'].loc[1]