In [9]:
from unsupervised_topic_segmentation import core, eval, types, dataset
import create_test_data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [10]:
# Load the pickled transcripts from the working directory; the object is
# indexed by meeting id further down in the notebook.
# NOTE(review): pickle.load can execute arbitrary code, so only point this at
# a file you trust.
with open('transcripts.pickle', 'rb') as pickle_file:
    transcripts = pickle.load(pickle_file)

Options:
- datasets: `types.TopicSegmentationDatasets.AMI`, `types.TopicSegmentationDatasets.ICSI` _(need to download and implement in unsupervised_topic_segmentation/dataset.py)_
- algorithms: `types.BERTSegmentation`, `types.RandomSegmentation`, `types.EvenSegmentation` _(SBERT not implemented)_

Hyperparams to consider for `BERTSegmentation`:
- `PARALLEL_INFERENCE_INSTANCES = 20` in `core.py`: infer multiple meetings at once
- `MAX_SEGMENTS_CAP`: True or False, default True. "Add a max segment limit so there are not too many segments"; in this implementation, "local maxima are sorted by depth_score value and we take only the first K where the K+1th local maxima is lower then the threshold". False "is the vanilla TextTiling used for Pk optimization". 
- `MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH`: used as cap if above is True, int default 60. This was originally supposed to be in seconds (see explanation below) but we can easily make it refer to words or sentences.
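- `TEXT_TILING`: additional tiling hyperparams. Set to `types.TextTilingHyperparameters` with the additional hyperparams below; otherwise it defaults to None (which raises an error). Note that the example cell below passes `types.OriginalSegmentation` or `types.NewSegmentation` here.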

Additional hyperparams in `TextTilingHyperparameters`:
- `SENTENCE_COMPARISON_WINDOW`: int default 15. Number of sentences in each non-overlapping window to consider as chunk.
- `SMOOTHING_PASSES`: int default 2. Number of smoothing iterations on similarity scores.
- `SMOOTHING_WINDOW`: int default 1. Neighborhood considered in smoothing similarity scores (unit is chunks).
- `TOPIC_CHANGE_THRESHOLD`: float default 0.6.

Hyperparams for `RandomSegmentation`:
- `random_threshold`

Hyperparams for `EvenSegmentation`:
- `k`: Number of sentences per segment

Note: when `MAX_SEGMENTS_CAP` is True, the original code uses the meeting duration as follows: the total duration of each meeting is divided by `MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH` to determine the maximum number of segments.

In [11]:
# Both configurations use the same sentence comparison window.
comparison_window = 50

# Original text-tiling variant, with the max-segments cap switched on.
original_tiling = types.OriginalSegmentation(
    smoothing_passes=2,
    smoothing_window=1,
    topic_change_threshold=0.6,
    max_segments_cap=True,
    max_segments_cap__average_segment_length=120,
)
old_algorithm = types.BERTSegmentation(
    sentence_comparison_window=comparison_window,
    text_tiling=original_tiling,
)

# New text-tiling variant, configured through `stdevs` only.
stdev_tiling = types.NewSegmentation(stdevs=1)
new_algorithm = types.BERTSegmentation(
    sentence_comparison_window=comparison_window,
    text_tiling=stdev_tiling,
)

# Display the configuration used by the cells below.
new_algorithm

BERTSegmentation(SENTENCE_COMPARISON_WINDOW=50, TEXT_TILING=NewSegmentation(STDEVS=1))

Just to run inference (no eval), use `core.topic_segmentation` with arguments:
- `topic_segmentation_algorithm`: choose from above
- `df`: transcript pandas dataframe with columns below
- `meeting_id_col_name`: str for col of meetings to produce splits within
- `start_col_name`: str for col with start time of caption (barely used, could cut)
- `end_col_name`: str for col with end time of caption (barely used, could cut)
- `caption_col_name`: str for col of sentence strings

In [12]:
# Build a single-meeting frame from the loaded transcripts, then run it through
# the dataset helpers before segmentation.
meeting_id = 'd0a7e5864959'
captions_raw = pd.DataFrame(data={'meeting_id': meeting_id, 'caption': transcripts[meeting_id]})
# "duration" column is just number of words
captions_timed = dataset.add_durations(captions_raw)
# note that this adds (old) `index` column, but topic_segmentation uses actual index
test_data = dataset.preprocessing(captions_timed, min_caption_len=5)
test_data

Unnamed: 0,index,meeting_id,start_time,end_time,caption
0,0,d0a7e5864959,0,5,And older woman Jocasta Zamarripa
1,1,d0a7e5864959,5,15,"Shortly, Alderman Rainey and Alderman Hamilton..."
2,2,d0a7e5864959,15,19,So let's get started
3,3,d0a7e5864959,19,36,"Item nber 211814, reappointment of Brian Rott ..."
4,4,d0a7e5864959,36,42,"Brian Rott, I see you"
...,...,...,...,...,...
483,454,d0a7e5864959,4342,4349,And that concludes our committee meeting today
484,455,d0a7e5864959,4349,4351,Thank you
485,456,d0a7e5864959,4351,4357,See you all in the community
486,457,d0a7e5864959,4357,4360,Thanks to additionally


In [13]:
# Column names of `test_data` that the segmentation entry point is told to use.
column_names = {
    'meeting_id_col_name': 'meeting_id',
    'start_col_name': 'start_time',
    'end_col_name': 'end_time',
    'caption_col_name': 'caption',
}

# Inference only (no evaluation); the result is displayed as the cell output.
segment_boundaries = core.topic_segmentation(
    topic_segmentation_algorithm=new_algorithm,
    df=test_data,
    **column_names,
)
segment_boundaries

[112, 158, 175, 226, 348, 405, 415]


{'d0a7e5864959': [112, 158, 175, 226, 348, 405, 415]}

In [14]:
# Show the captions surrounding one predicted boundary.
i = 112  # first boundary in the segmentation output above
context_rows = 10  # rows to show on each side of the boundary
# Clamp the lower bound at 0: for a boundary smaller than `context_rows` the
# start would be negative, which iloc interprets as an offset from the end of
# the frame and would therefore select the wrong rows.
test_data.iloc[max(i - context_rows, 0):(i + context_rows)]

Unnamed: 0,index,meeting_id,start_time,end_time,caption
102,103,d0a7e5864959,988,997,So we will do our grand opening in September
103,104,d0a7e5864959,997,1016,And I can let when that actual date will be o...
104,105,d0a7e5864959,1016,1018,"Excellent, excellent"
105,106,d0a7e5864959,1018,1034,Do me a favor and send an invitation to the co...
106,107,d0a7e5864959,1034,1038,We'll definitely do that
107,108,d0a7e5864959,1038,1044,"All right, anybody have any questions?"
108,109,d0a7e5864959,1044,1051,Any of my colleagues have any questions?
109,110,d0a7e5864959,1051,1058,We have been joined by Alderman Hamilton
110,111,d0a7e5864959,1058,1064,"Good to see you, Alderman Hamilton"
111,112,d0a7e5864959,1064,1070,"And I'll move approval, Mr"


In [15]:
# Build a labelled synthetic dataset and give it the same columns as the
# transcript frame, so it can be passed to the evaluation cells.
# FIXME(review): the recorded run of this cell fails with
# "generate_segment() missing 1 required positional argument: 't'". The meaning
# of `t` is not visible from this notebook, so the call is left as written;
# supply `t` according to create_test_data.generate_segment.
results, labels, doc_count = create_test_data.generate_segment()
# The sentences must live in a 'caption' column: every later line of this cell
# reads `caption`, so the previous 'sentences' key made the cell fail.
test_data = pd.DataFrame(data={'caption': results, 'label': labels, 'meeting_id': 1})
test_data['duration'] = test_data['caption'].apply(lambda x: len(x.split(' ')))  # 1 word/s
# Each caption ends at the running total of durations and starts one duration earlier.
test_data['end_time'] = test_data['duration'].cumsum()
test_data['start_time'] = test_data['end_time'] - test_data['duration']
test_data = test_data[['meeting_id', 'start_time', 'end_time', 'caption', 'label']]
# note that this adds (old) `index` column, but topic_segmentation uses actual index
# NOTE(review): the earlier preprocessing call passes `min_caption_len` by keyword;
# confirm that the positional 'caption' lands on the intended parameter.
test_data = dataset.preprocessing(test_data, 'caption')
test_data

TypeError: generate_segment() missing 1 required positional argument: 't'

In [16]:
# Evaluate the new algorithm on `test_data`.
# NOTE(review): the recorded run raised KeyError: 'label', so `input_df`
# apparently needs a 'label' column; make sure the labelled frame was built first.
new_algorithm_scores = eval.eval_topic_segmentation(
    topic_segmentation_algorithm=new_algorithm,
    input_df=test_data,
)
new_algorithm_scores

[112, 158, 175, 226, 348, 405, 415]


KeyError: 'label'

In [None]:
# Baseline: a boundary every `k` sentences.
sentences_per_segment = 100
even_algorithm = types.EvenSegmentation(k=sentences_per_segment)

even_scores = eval.eval_topic_segmentation(
    topic_segmentation_algorithm=even_algorithm,
    input_df=test_data,
)
even_scores

Even segmentation: [0, 100, 200, 300, 400, 500, 600]
Pk on 1 meetings: 0.5935483870967742
WinDiff on 1 meetings: 0.7913978494623656


{'average_Pk_': 0.5935483870967742, 'average_windiff_': 0.7913978494623656}

In [None]:
# Baseline: randomly placed boundaries.
# NOTE(review): presumably non-deterministic between runs unless the library's
# random generator is seeded; confirm before comparing scores.
boundary_threshold = 0.99
random_algorithm = types.RandomSegmentation(random_threshold=boundary_threshold)

random_scores = eval.eval_topic_segmentation(
    topic_segmentation_algorithm=random_algorithm,
    input_df=test_data,
)
random_scores

Random segmentation: [89, 236, 370, 518]
Pk on 1 meetings: 0.5935483870967742
WinDiff on 1 meetings: 0.6064516129032258


{'average_Pk_': 0.5935483870967742, 'average_windiff_': 0.6064516129032258}