In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import data_processing
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords



In [2]:
# Parse the raw transcript file into a dataframe of phrases.
transcript_path = 'total_dataset.txt'
df = data_processing.transcript_to_dataframe(transcript_path)

# Expected shape is (N, 1), where N is the number of phrases and varies by dataset
# (for example, HLG has roughly 175,000 phrases).
print(f'The dataframe shape is {df.shape}')

The dataframe shape is (174549, 1)


In [3]:
# Check distribution of phrase length (whitespace-delimited words per phrase).
# `.str.len()` counts the tokens in each split list without a Python-level lambda
# and yields NaN for a missing phrase instead of raising a TypeError.
phrase_length = df.phrase.str.split().str.len()
print(phrase_length.describe())

count    174549.000000
mean         12.072301
std          13.286237
min           0.000000
25%           4.000000
50%           8.000000
75%          16.000000
max         449.000000
Name: phrase, dtype: float64


In [6]:
# Preview the first five rows to sanity-check the parsed transcript.
df.head(n=5)

Unnamed: 0,phrase
0,okay chris good afternoon to you how are you
1,oh i've had a packed morning today you know i ...
2,why are you so packed on a dang sunday
3,well jason i like to move my body and also exe...
4,so you were were you reading studying some aca...


In [None]:
# The majority of our phrases should be significantly less than 50 words in length,
# so let's view that range.
# Word counts are integers, so use one unit-wide bin centred on each value 0..50.
# Bins that are 2.5 wide (20 bins over 0-50) alternately cover three and two integer
# lengths, which puts a sawtooth artifact into the histogram.
ax = phrase_length.hist(range=(-0.5, 50.5), bins=51)
ax.set(
    title='Phrase length distribution (0-50 words)',
    xlabel='Words per phrase',
    ylabel='Number of phrases',
);


In [14]:
# Quick look at term frequencies

# Load the English stopword list, fetching the NLTK corpus on first use so the
# notebook also runs on a machine where it has not been downloaded yet.
try:
    english_stops = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stops = stopwords.words('english')

# We can also define data specific stops to include
hlg_stops = ['like', 'know', 'yeah', 'mean', 'really', 'would']

stops = set(english_stops + hlg_stops)

# Current scikit-learn validates `stop_words` and accepts only 'english', a list
# or None, so hand it a list rather than the set itself.
cv = CountVectorizer(stop_words=sorted(stops))
counts = cv.fit_transform(df.phrase)

# Sum each term's column to get corpus-wide totals, then show the 50 most frequent.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
term_totals = pd.DataFrame(counts.sum(axis=0), columns=cv.get_feature_names_out())
term_totals.T.sort_values(0, ascending=False).head(50)


Unnamed: 0,0
think,15207
people,7789
get,6580
well,6197
right,5845
one,5761
kind,5648
good,5094
oh,4861
gonna,4739


In [15]:
# Check for common bi-grams
# `stop_words` is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces get_feature_names(), removed in 1.2.
cv = CountVectorizer(ngram_range=(2, 2), stop_words=sorted(stops))
counts = cv.fit_transform(df.phrase)

# Corpus-wide total per bi-gram, 50 most frequent first.
bigram_totals = pd.DataFrame(counts.sum(axis=0), columns=cv.get_feature_names_out())
bigram_totals.T.sort_values(0, ascending=False).head(50)


Unnamed: 0,0
new york,1315
think think,1188
little bit,1117
lot people,545
long gone,521
oh god,470
even though,440
think people,408
every day,373
pretty good,319


In [16]:
# Check for common tri-grams
# `stop_words` is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces get_feature_names(), removed in 1.2.
cv = CountVectorizer(ngram_range=(3, 3), stop_words=sorted(stops))
counts = cv.fit_transform(df.phrase)

# Corpus-wide total per tri-gram, 100 most frequent first.
# NOTE(review): in the recorded output the top tri-grams all occur exactly 144
# times and read like one repeated message ('make sure download', 'heard spotify
# apple') - presumably boilerplate ad copy; confirm and consider filtering it out.
trigram_totals = pd.DataFrame(counts.sum(axis=0), columns=cv.get_feature_names_out())
trigram_totals.T.sort_values(0, ascending=False).head(100)


Unnamed: 0,0
creation tools allow,144
need make podcast,144
heard spotify apple,144
make sure download,144
way make podcast,144
...,...
want make sure,23
two years ago,22
get little bit,22
think think gonna,22


In [17]:
# Check for common 4-grams
# `stop_words` is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces get_feature_names(), removed in 1.2.
cv = CountVectorizer(ngram_range=(4, 4), stop_words=sorted(stops))
counts = cv.fit_transform(df.phrase)

# Corpus-wide total per 4-gram, 100 most frequent first.
# NOTE(review): in the recorded output the top 4-grams occur 143-144 times each
# and read like one repeated message ('easiest way make podcast', 'sure download
# free anchor') - presumably boilerplate ad copy; confirm and consider filtering.
fourgram_totals = pd.DataFrame(counts.sum(axis=0), columns=cv.get_feature_names_out())
fourgram_totals.T.sort_values(0, ascending=False).head(100)



Unnamed: 0,0
tools allow record edit,144
easiest way make podcast,144
need make podcast one,144
creation tools allow record,144
sure download free anchor,143
...,...
heard spotify apple podcasts,7
even though even though,7
love ankur use time,7
podcast love ankur use,7
