In [2]:
from unsupervised_topic_segmentation import core, eval, types, dataset
import create_test_data
import pandas as pd
import numpy as np
import pickle

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the pickled transcripts object; a later cell indexes it by meeting id.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# open a transcripts.pickle that comes from a trusted source.
with open('transcripts.pickle', 'rb') as pickle_file:
    transcripts = pickle.load(pickle_file)

Options:
- datasets: `types.TopicSegmentationDatasets.AMI`, `types.TopicSegmentationDatasets.ICSI` _(need to download and implement in unsupervised_topic_segmentation/dataset.py)_
- algorithms: `types.BERTSegmentation`, `types.RandomSegmentation`, `types.EvenSegmentation` _(SBERT not implemented)_

Hyperparams to consider for `BERTSegmentation`:
- `PARALLEL_INFERENCE_INSTANCES = 20` in `core.py`: infer multiple meetings at once
- `MAX_SEGMENTS_CAP`: True or False, default True. "Add a max segment limit so there are not too many segments"; in this implementation, "local maxima are sorted by depth_score value and we take only the first K where the K+1th local maxima is lower than the threshold". False "is the vanilla TextTiling used for Pk optimization".
- `MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH`: used as cap if above is True, int default 60. This was originally supposed to be in seconds (see explanation below) but we can easily make it refer to words or sentences.
- `TEXT_TILING`: additional tiling hyperparams. Set this to a `types.TextTilingHyperparameters` instance built from the additional hyperparams listed below; otherwise it defaults to None, which raises an error.

Additional hyperparams in `TextTilingHyperparameters`:
- `SENTENCE_COMPARISON_WINDOW`: int default 15. Number of sentences in each non-overlapping window to treat as one chunk.
- `SMOOTHING_PASSES`: int default 2. Number of smoothing iterations on similarity scores.
- `SMOOTHING_WINDOW`: int default 1. Neighborhood considered in smoothing similarity scores (unit is chunks).
- `TOPIC_CHANGE_THRESHOLD`: float default 0.6.

Hyperparams for `RandomSegmentation`:
- `random_threshold`

Hyperparams for `EvenSegmentation`:
- `k`: Number of sentences per segment

Note: when `MAX_SEGMENTS_CAP` is True, the original code uses the meeting duration as follows: the total duration of each meeting is divided by `MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH` to determine the maximum number of segments.

In [3]:
# TextTiling settings: 50-sentence comparison windows, two smoothing passes over
# a 1-chunk neighbourhood, and a topic-change threshold of 0.6.
tiling_hyperparameters = types.TextTilingHyperparameters(
    sentence_comparison_window=50,
    smoothing_passes=2,
    smoothing_window=1,
    topic_change_threshold=0.6,
)

# BERT-based segmenter with the segment cap enabled (average segment length 120).
algorithm = types.BERTSegmentation(
    text_tiling=tiling_hyperparameters,
    max_segments_cap=True,
    max_segments_cap__average_segment_length=120,
)
algorithm

BERTSegmentation(TEXT_TILING=TextTilingHyperparameters(SENTENCE_COMPARISON_WINDOW=50, SMOOTHING_PASSES=2, SMOOTHING_WINDOW=1, TOPIC_CHANGE_THRESHOLD=0.6), MAX_SEGMENTS_CAP=True, MAX_SEGMENTS_CAP__AVERAGE_SEGMENT_LENGTH=120)

Just to run inference (no eval), use `core.topic_segmentation` with arguments:
- `topic_segmentation_algorithm`: choose from above
- `df`: transcript pandas dataframe with columns below
- `meeting_id_col_name`: str for col of meetings to produce splits within
- `start_col_name`: str for col with start time of caption (barely used, could cut)
- `end_col_name`: str for col with end time of caption (barely used, could cut)
- `caption_col_name`: str for col of sentence strings

In [4]:
# Build a single-meeting frame with one row per caption of the chosen transcript.
meeting_id = 'd0a7e5864959'
test_data = pd.DataFrame(data={'meeting_id': meeting_id, 'caption': transcripts[meeting_id]})

# Synthetic timestamps at 1 word/s. split() with no separator splits on runs of
# whitespace, so doubled, leading or trailing spaces are not counted as extra
# "words" (split(' ') would count the empty strings between them).
test_data['duration'] = test_data['caption'].apply(lambda caption: len(caption.split()))
test_data['end_time'] = test_data['duration'].cumsum()
test_data['start_time'] = test_data['end_time'] - test_data['duration']

test_data = test_data[['meeting_id', 'start_time', 'end_time', 'caption']]
# note that preprocessing adds the (old) `index` column, but topic_segmentation uses the actual index
test_data = dataset.preprocessing(test_data, 'caption')
test_data

Unnamed: 0,index,meeting_id,start_time,end_time,caption
0,0,d0a7e5864959,0,5,And older woman Jocasta Zamarripa.
1,1,d0a7e5864959,5,15,"Shortly, Alderman Rainey and Alderman Hamilton..."
2,2,d0a7e5864959,15,19,So let's get started.
3,3,d0a7e5864959,19,36,"Item nber 211814, reappointment of Brian Rott ..."
4,4,d0a7e5864959,36,42,"Mr. Brian Rott, I see you."
...,...,...,...,...,...
376,452,d0a7e5864959,4328,4332,"With that, thank you."
377,453,d0a7e5864959,4332,4342,"And with that, everyone, thank you for showing..."
378,454,d0a7e5864959,4342,4349,And that concludes our committee meeting today.
379,456,d0a7e5864959,4351,4357,See you all in the community.


In [5]:
# Inference only (no evaluation). The column-name arguments tell
# topic_segmentation which columns of test_data hold the meeting id, the
# start/end timestamps and the caption text.
column_names = {
    'meeting_id_col_name': 'meeting_id',
    'start_col_name': 'start_time',
    'end_col_name': 'end_time',
    'caption_col_name': 'caption',
}
core.topic_segmentation(topic_segmentation_algorithm=algorithm, df=test_data, **column_names)

[249 205 140  35]


{'d0a7e5864959': array([249, 205, 140,  35])}

In [6]:
# Look at the 20 rows around position 249, the first boundary listed in the
# topic_segmentation output for this meeting.
i = 249
nearby_rows = slice(i - 10, i + 10)
test_data.iloc[nearby_rows]

Unnamed: 0,index,meeting_id,start_time,end_time,caption
239,282,d0a7e5864959,2753,2761,Do you have any questions by committee members?
240,284,d0a7e5864959,2763,2772,"With that, owner woman Zamparipa moves for you..."
241,285,d0a7e5864959,2772,2777,"Hearing no objections, so ordered."
242,286,d0a7e5864959,2777,2781,"Thank you, Mr. Kennedy."
243,287,d0a7e5864959,2781,2788,I will see you in the community.
244,288,d0a7e5864959,2788,2798,"All right, nber eight, I mean, nber nine, excu..."
245,289,d0a7e5864959,2798,2821,"File nber 220049, reappointment of Jeff Welk t..."
246,290,d0a7e5864959,2821,2829,"All right, Ms. Elmer, what's his attendance re..."
247,292,d0a7e5864959,2832,2839,"All right, Mr. Welk, are you available?"
248,293,d0a7e5864959,2839,2846,"With some, does anybody know Mr. Welk?"


In [7]:
# TODO(review): open question — do the reported boundaries need to be shifted by
# sentence_comparison_window + 2 to get the true row positions? The author did
# not see where the library corrects for this, so this cell shows the 20 rows
# around the shifted position for comparison with the unshifted view.
k = algorithm.TEXT_TILING.SENTENCE_COMPARISON_WINDOW
shifted = i + k + 2
test_data.iloc[(shifted - 10):(shifted + 10)]

Unnamed: 0,index,meeting_id,start_time,end_time,caption
291,342,d0a7e5864959,3362,3369,I'm sure we would to attend.
292,344,d0a7e5864959,3370,3409,"We also, just as a side note, our beer garden ..."
293,345,d0a7e5864959,3409,3414,", that's wonderful, that's wonderful."
294,346,d0a7e5864959,3414,3419,"Okay, thank you for that."
295,347,d0a7e5864959,3419,3425,Alderman Hamilton moves for your appointment.
296,348,d0a7e5864959,3425,3433,"You are confirmed, hearing no objections, so o..."
297,349,d0a7e5864959,3433,3440,"Thank you, Melissa, have a wonderful day."
298,351,d0a7e5864959,3444,3465,"All right, next we have nber 13, file nber 220..."
299,352,d0a7e5864959,3465,3470,"Mr. Hennessey, are you available?"
300,353,d0a7e5864959,3470,3475,"He is, for 10 minutes."


In [8]:
# Generate a labelled test transcript: `results` becomes the caption column and
# `labels` the label column of a single pseudo-meeting (meeting_id 1).
results, labels, doc_count = create_test_data.generate_segment()
test_data = pd.DataFrame(data={'caption': results, 'label': labels, 'meeting_id': 1})

# Synthetic timestamps at 1 word/s. split() with no separator splits on runs of
# whitespace, so doubled, leading or trailing spaces are not counted as extra
# "words" (split(' ') would count the empty strings between them).
test_data['duration'] = test_data['caption'].apply(lambda caption: len(caption.split()))
test_data['end_time'] = test_data['duration'].cumsum()
test_data['start_time'] = test_data['end_time'] - test_data['duration']

test_data = test_data[['meeting_id', 'start_time', 'end_time', 'caption', 'label']]
# note that preprocessing adds the (old) `index` column, but topic_segmentation uses the actual index
test_data = dataset.preprocessing(test_data, 'caption')
test_data

Unnamed: 0,index,meeting_id,start_time,end_time,caption,label
0,0,1,0,11,The first item on our agenda is the call to or...,0
1,1,1,11,24,"This April 11th, 2023 regular meeting of the c...",0
2,2,1,24,31,The next item is the roll call.,0
3,3,1,31,39,Will the executive secretary please call the r...,0
4,7,1,44,47,Vice President Miller?,0
...,...,...,...,...,...,...
587,725,1,11942,11995,"I thought yeah, okay That's okay Item 14 22018...",7
588,726,1,11995,12000,Any questions on that matter?,7
589,727,1,12000,12004,Any objections Hearing none.,7
590,728,1,12004,12048,So ordered item 15 220185 an ordinance to furt...,7


In [9]:
# Evaluate the BERT-based segmenter on the labelled test data; the displayed
# result holds the average Pk and WinDiff scores.
eval.eval_topic_segmentation(input_df=test_data, topic_segmentation_algorithm=algorithm)

[456 433 428 420 234 174 167 161 159]
Pk on 1 meetings: 0.6116152450090744
WinDiff on 1 meetings: 0.6715063520871143


{'average_Pk_': 0.6116152450090744, 'average_windiff_': 0.6715063520871143}

In [12]:
# Baseline: even segmentation, where `k` is the number of sentences per segment.
even_algorithm = types.EvenSegmentation(k=100)
eval.eval_topic_segmentation(input_df=test_data, topic_segmentation_algorithm=even_algorithm)

[0, 100, 200, 300, 400, 500]
Pk on 1 meetings: 0.41197822141560797
WinDiff on 1 meetings: 0.41197822141560797


{'average_Pk_': 0.41197822141560797, 'average_windiff_': 0.41197822141560797}

In [15]:
# Baseline: random segmentation with random_threshold=0.99.
# NOTE(review): no random seed is set in this cell, so the boundaries and scores
# may differ between runs — confirm whether the library seeds its own RNG.
random_algorithm = types.RandomSegmentation(random_threshold=0.99)
eval.eval_topic_segmentation(input_df=test_data, topic_segmentation_algorithm=random_algorithm)

[54, 119, 332, 427, 451, 476, 557]
Pk on 1 meetings: 0.5807622504537205
WinDiff on 1 meetings: 0.5807622504537205


{'average_Pk_': 0.5807622504537205, 'average_windiff_': 0.5807622504537205}