In [0]:
import numpy as np
import pandas as pd
import spacy
from pathlib import Path
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
"""First we would like to run spacy NLP on each of the documents
that we have randomly sampled from the larger dataset. Because they
have been sampled at random, we expect that they will be representative
of the entire dataset.
"""
from spacyOps import edaLabeler

# Load the spaCy pipeline registered under the name 'en'.
nlp = spacy.load('en')

# Extend the pipeline: the sentencizer is appended first, then the custom
# labeler, so the labeler runs after sentence boundaries have been set.
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)
nlp.add_pipe(edaLabeler)

# Gather every regular *.txt file found anywhere below the wiki sample directory.
wikiDataPath = Path('./Wiki-sample')
wikiFiles = list(filter(Path.is_file, wikiDataPath.rglob('*.txt')))

In [0]:
# Report how many wiki files were found. The count is taken from `wikiFiles`,
# the list globbed from the wiki directory; no variable called `files` is
# defined in the visible cells, so referencing it fails on a fresh kernel.
print('There are {} files in the wiki directory'.format(len(wikiFiles)))
print('Running SpaCy NLP on the files (this will take a few minutes)')
# Read each file as UTF-8 text and run it through the pipeline. wikiDocs is in
# the same order as wikiFiles, so the two lists can be matched by position.
wikiDocs = [nlp(fo.read_text(encoding='utf-8')) for fo in wikiFiles]

In [0]:

# Gather every regular *.txt file found anywhere below the podcast directory.
podcastDataPath = Path('./data/podcasts')
podcastFiles = list(filter(Path.is_file, podcastDataPath.rglob('*.txt')))
print('There are {} files in the podcast directory'.format(len(podcastFiles)))
print('Running SpaCy NLP on the files (this will take a few minutes)')

# Read each transcript as UTF-8 and feed it to the pipeline one file at a time;
# podcastDocs is in the same order as podcastFiles.
podcastDocs = list(
    map(nlp, (transcript.read_text(encoding='utf-8') for transcript in podcastFiles))
)


In [0]:
"""First let's provide some structure by organizing our documents into a
pandas dataframe"""

In [0]:

# One row per document, identified by the file name without its extension.
wiki_df = pd.DataFrame({'doc_name': [str(path.stem) for path in wikiFiles]})
print(wiki_df.head())

pod_df = pd.DataFrame({'doc_name': [str(path.stem) for path in podcastFiles]})
print(pod_df.head())


In [0]:
# Next let's compare some key characteristics about the 2 datasets

# Number of words - Right away, we can see that the podcasts are much longer
# on average than the wiki articles.

In [0]:
# Length of every processed document. The wiki documents live in `wikiDocs`;
# no variable called `docs` is defined in the visible cells.
# NOTE: len() of a spaCy Doc is its token count, so punctuation tokens are
# included in what is stored as 'num_words'.
wiki_num_words = [len(x) for x in wikiDocs]
wiki_df['num_words'] = wiki_num_words

pod_num_words = [len(x) for x in podcastDocs]
pod_df['num_words'] = pod_num_words

In [0]:
# Let's add our spacy tokenized sentences and corresponding labels to the dataframe
# number of sentences
def _sents_and_labels(spacy_docs):
    """Collect the sentence and label arrays stored in each document's user_data.

    Parameters
    ----------
    spacy_docs : iterable
        Processed documents whose ``user_data`` dict holds 'labels' and 'sents'
        entries (presumably written by the custom labeler pipe -- confirm
        against spacyOps).

    Returns
    -------
    tuple of (list, list)
        ``(sent_arr, label_arr)``: for every document, in input order, an
        object-dtype array of its sentences and an array of its labels.

    Raises
    ------
    KeyError
        If a document's ``user_data`` lacks the 'labels' or 'sents' entry.
    """
    label_arr = []
    sent_arr = []
    for doc in spacy_docs:
        label_arr.append(np.array(doc.user_data['labels']))
        # dtype=object keeps each sentence as a single element of the array.
        sent_arr.append(np.array(doc.user_data['sents'], dtype=object))
    return sent_arr, label_arr


# Wiki documents are held in `wikiDocs`; no variable called `docs` is defined
# in the visible cells.
sent_arr, label_arr = _sents_and_labels(wikiDocs)
wiki_df['sents'] = sent_arr
wiki_df['labels'] = label_arr

sent_arr, label_arr = _sents_and_labels(podcastDocs)
pod_df['sents'] = sent_arr
pod_df['labels'] = label_arr

In [0]:

# Number of sentences per document: the length of each row's sentence array.
wiki_df['num_sent'] = wiki_df['sents'].map(len)
pod_df['num_sent'] = pod_df['sents'].map(len)

# Overlay both distributions on the same axes for a direct comparison.
for frame, shade, name in ((wiki_df, "skyblue", 'Wiki'), (pod_df, "red", 'Podcast')):
    sns.distplot(frame['num_sent'], color=shade, label=name)
plt.legend()
plt.show()


In [0]:
# number of segments
wiki_df['num_seg'] = wiki_df.apply(lambda row: sum(row.labels), axis=1)
pod_df['num_seg'] = pod_df.apply(lambda row: sum(row.labels), axis=1)

sns.distplot( wiki_df['num_seg'] , color="skyblue", label='Wiki')
sns.distplot( pod_df['num_seg'] , color="red", label='Podcast')
plt.legend()
plt.show()


In [0]:
# average segment length
wiki_df['avg_seg_len'] = wiki_df.apply(lambda row: row.num_sent/row.num_seg, axis=1)
pod_df['avg_seg_len'] = pod_df.apply(lambda row: row.num_sent/row.num_seg, axis=1)

sns.distplot( wiki_df['avg_seg_len'] , color="skyblue", label='Wiki')
sns.distplot( pod_df['avg_seg_len'] , color="red", label='Podcast')
plt.legend()
plt.show()


In [0]:
# parts of speech

In [0]:
# polarity

In [0]:
"""One of the major challenges we would like to address is whether our neural network
can be trained on the Wiki dataset, but also generalize to the podcast data. Central
to this question is how similar the datasets are. Wikipedia articles are typically
dispassionate, informative, and well structured. Conversely, podcasts are conversational
in nature. They assume all of the nuances of human speech such as stutters, runaway thoughts,
interruptions, etc."""

# Actionable insights - From this exploratory data analysis, we can use some of the
# insights gained to inform the construction of our network and preprocessing decisions.
# Specifically, we note the large discrepancy in average segment length between the podcasts
# and wiki datasets. To combat this effect, we decided to eliminate segments shorter than 5
# sentences in the wiki dataset during preprocessing. Additionally, we eliminated all Wiki 
# articles with fewer than 3 segments. Our goal is to extract from the wiki dataset articles
# that will be more structurally similar to the podcasts. 