In [1]:
# Text Data Analysis for CoMeDiAnS- NLP Project
# https://github.com/NishthaChaudhary/Text-Data-Analysis-for-CoMeDiAnS--NLP-Project

# Web scraping, pickle imports
# requests is used to fetch the content of a given URL
import requests
from bs4 import BeautifulSoup
# BeautifulSoup parses the fetched HTML so that a specific part of a web page can be extracted
import pickle

In [2]:
# Comedian names
# comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

comedians = ['ali','anthony','bill']

# Load pickled files
# Each transcript is read from transcripts/<name>.txt. The file is opened in
# binary mode because its content is a pickle, despite the .txt extension.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell against transcript files you produced yourself.
data = {}
for comedian in comedians:
    with open("transcripts/" + comedian + ".txt", "rb") as transcript_file:
        data[comedian] = pickle.load(transcript_file)

In [4]:
# Print a section marker only; the full dump of `data` on the next line is
# commented out, so nothing else is shown by this cell.
print("\n--- result data")
# print(data)



--- result data


In [5]:
# Sanity check: list the dictionary keys to confirm that one entry was loaded
# for every name in `comedians`.
data.keys()

dict_keys(['ali', 'anthony', 'bill'])

In [7]:

# More checks
# data['ali'][:2]

In [8]:
# Peek at the first key of the dictionary (iterating a dict yields its keys)
next(iter(data))

'ali'

In [10]:
# Notice that our dictionary is currently in key: comedian, value: list of text format
# next(iter(data.values()))

In [13]:
# data.items()

In [14]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Join every string in list_of_text into a single string.

    The pieces are separated by one space each; an empty list yields ''.
    '''
    return ' '.join(list_of_text)

In [15]:
# Collapse each comedian's list of text chunks into a one-element list holding
# the combined transcript (the list wrapper is what the DataFrame step expects).
data_combined = dict(
    (comedian, [combine_text(chunks)])
    for comedian, chunks in data.items()
)

In [17]:
# print(data_combined)

In [19]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Use the fully qualified option name: abbreviated keys such as 'max_colwidth'
# are resolved by pattern matching and may become ambiguous in later pandas
# versions.
pd.set_option('display.max_colwidth', 150)

# Build one row per comedian (dict keys become the index) with a single
# 'transcript' column, then order the rows alphabetically by comedian key.
# orient='index' replaces the former transpose-then-rename sequence.
data_df = pd.DataFrame.from_dict(
    data_combined, orient='index', columns=['transcript']
).sort_index()

print("\n--- data_df")
# data_df


--- data_df


In [21]:

# Let's take a look at the transcript for Ali Wong
# data_df.transcript.loc['ali']

In [22]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Apply the first cleaning pass to a single transcript string.

    Steps, in order:
      1. lowercase the text;
      2. remove bracketed spans such as "[laughs]" (non-greedy, so each
         bracket pair is removed separately; a span interrupted by a newline
         is not matched because "." does not match newlines);
      3. remove every character listed in string.punctuation;
      4. remove any token that contains a digit.

    Only the matched text is removed; surrounding whitespace is kept, so the
    result may contain doubled or trailing spaces.

    Args:
        text: the transcript to clean (str).

    Returns:
        The cleaned string.
    '''
    text = text.lower()
    # Raw string literals are required here: "\[", "\w" and "\d" are not valid
    # Python string escapes and trigger invalid-escape warnings otherwise.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def round1(x):
    '''Thin wrapper that runs clean_text_round1 on one transcript string.'''
    return clean_text_round1(x)

In [24]:
# First cleaning pass: run round1 over every transcript and keep the result
# as a one-column DataFrame that shares data_df's index.
data_clean = pd.DataFrame({'transcript': data_df['transcript'].apply(round1)})
# OUTPUT
# data_clean

In [25]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: strip typographic leftovers from a transcript.

    Removes curly single/double quotes and the ellipsis character, then
    removes newline characters. Nothing is inserted in their place, so text
    on either side of a removed character is joined directly.
    '''
    # Patterns are applied one after the other, in this order.
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text

def round2(x):
    '''Thin wrapper that runs clean_text_round2 on one transcript string.'''
    return clean_text_round2(x)

In [26]:
# Second cleaning pass: run round2 over the current contents of data_clean
# and store the result back as a one-column DataFrame.
data_clean = pd.DataFrame({'transcript': data_clean['transcript'].apply(round2)})
# OUTPUT
# data_clean

In [32]:
# Display data_df. Note that this is the original-text frame: the cleaning
# cells store their results in data_clean, so the transcripts shown here are
# still uncleaned (capital letters, punctuation and bracketed cues remain).
# OUTPUT
data_df

Unnamed: 0,transcript
ali,"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have ..."
anthony,"Thank you. Thank you. Thank you, San Francisco. Thank you so much. So good to be here. People were surprised when I told ’em I was gonna tape my s..."
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a ..."


In [33]:
# Optional step (currently disabled): attach each comedian's full name as a
# 'full_name' column. The list is assigned by position, so its order must
# match the row order of data_df.
# full_names = ['Ali Wong', 'Anthony Jeselnik', 'Bill Burr']
# data_df['full_name'] = full_names

# OUTPUT
# With the two lines above commented out, this displays data_df unchanged.
data_df


Unnamed: 0,transcript
ali,"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have ..."
anthony,"Thank you. Thank you. Thank you, San Francisco. Thank you so much. So good to be here. People were surprised when I told ’em I was gonna tape my s..."
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a ..."
