In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **ASSIGNMENT 2**

# **DATA CLEANING**

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import string
import pickle

def clean_text(text):
    # Remove HTML tags and special characters
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, "html.parser")
    para = soup.find_all('p')
    text = [clean_text(p.text) for p in para]
    return ' '.join(text)

# URLs of StandUp transcripts in scope
urls = [
    ('Matt Rife', 'https://scrapsfromtheloft.com/comedy/matt-rife-matthew-steven-rife-transcript/'),
    ('Trevor Noah', 'https://scrapsfromtheloft.com/comedy/trevor-noah-where-was-i-transcript/'),
    ('Ali Wong', 'https://scrapsfromtheloft.com/comedy/ali-wong-don-wong-transcript/'),
    ('Kevin James', 'https://scrapsfromtheloft.com/comedy/kevin-james-irregardless-transcript/'),
    ('Pete Davidson', 'https://scrapsfromtheloft.com/comedy/pete-davidson-turbo-fonzarelli-transcript/'),
    ('Bassem Youssef', 'https://scrapsfromtheloft.com/comedy/bassem-youssef-the-dark-side-of-making-it-on-social-media-transcript/'),
    ('Tig Notaro', 'https://scrapsfromtheloft.com/comedy/tig-notaro-hello-again-transcript/'),
    ('Dave Attell', 'https://scrapsfromtheloft.com/comedy/dave-attell-hot-cross-buns-transcript/'),
    ('Jeff Dunham', 'https://scrapsfromtheloft.com/comedy/jeff-dunham-im-with-cupid-transcript/'),
    ('Dylan Moran', 'https://scrapsfromtheloft.com/comedy/dylan-moran-yeah-yeah-2011-transcript/'),

]

# Clean and preprocess the data
cleaned_transcripts = [(comedian, url_to_transcript(url)) for comedian, url in urls]

# Create a DataFrame from the cleaned data
cleaned_data_df = pd.DataFrame(cleaned_transcripts, columns=['Comedian', 'Transcript'])
cleaned_data_df.set_index('Comedian', inplace=True)

# Save the cleaned data to a .pkl file
cleaned_data_df.to_pickle('cleaned_data.pkl')


# **ASSIGNMENT 3**

In [3]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd
import string
import re

def clean_text(text):
    # Remove HTML tags and special characters
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    
    # Remove numeric characters
    text = ''.join([char for char in text if char not in (string.punctuation + string.digits)])
    
    return text

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, "html.parser")
    para = soup.find_all('p')
    text = [clean_text(p.text) for p in para]
    return ' '.join(text)

# URLs of StandUp transcripts in scope
urls = [
    ('Matt Rife', 'https://scrapsfromtheloft.com/comedy/matt-rife-matthew-steven-rife-transcript/'),
    ('Trevor Noah', 'https://scrapsfromtheloft.com/comedy/trevor-noah-where-was-i-transcript/'),
    ('Ali Wong', 'https://scrapsfromtheloft.com/comedy/ali-wong-don-wong-transcript/'),
    ('Kevin James', 'https://scrapsfromtheloft.com/comedy/kevin-james-irregardless-transcript/'),
    ('Pete Davidson', 'https://scrapsfromtheloft.com/comedy/pete-davidson-turbo-fonzarelli-transcript/')
]

# Clean and preprocess the data
cleaned_transcripts = [(comedian, url_to_transcript(url)) for comedian, url in urls]

# Create a Document-Term Matrix (DTM)
vectorizer = CountVectorizer(stop_words='english')
dtm_sparse = vectorizer.fit_transform([text for comedian, text in cleaned_transcripts])

# Convert the sparse matrix to a dense matrix
dtm_dense = dtm_sparse.toarray()

# Create a DataFrame from the dense matrix
words = vectorizer.get_feature_names_out()
comedian_names = [comedian for comedian, text in cleaned_transcripts]
dtm_df = pd.DataFrame(dtm_dense, columns=words, index=comedian_names)

# Save the DTM to a .pkl file
dtm_df.to_pickle('dtm_comedians.pkl')


In [4]:
#printing most common word 
with open('cleaned_data.pkl', 'rb') as file:
    dtm = pickle.load(file)

# Now 'loaded_dtm' is a sparse matrix representing the Document-Term Matrix
# You can print or analyze it as needed
print(dtm)


                                                       Transcript
Comedian                                                         
Matt Rife       in his second hourlong comedy special “matthew...
Trevor Noah       detroit give it up… for trevor noah  what’s ...
Ali Wong        ladies gentlemen and everybody put your hands ...
Kevin James     kevin james irregardless  in kevin james irreg...
Pete Davidson   pete davidson turbo fonzarelli released date j...
Bassem Youssef  bassem youssef illuminates the complex realiti...
Tig Notaro      tig notaro hello again  released on march 26 2...
Dave Attell                                                      
Jeff Dunham     i’m funnier than he is but they told me to int...
Dylan Moran                                                      
