Setup for upload to Hugging Face

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the trained
# model is pushed to the Hub later via trainer.push_to_hub().
from huggingface_hub import notebook_login

notebook_login()

Import resources

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

import os 
from pydub import AudioSegment
import moviepy.editor as mp 
import json
import sys

from datasets import load_dataset

# Load the "huuuyeah/meetingbank" dataset; the "train" and "test" splits
# and the "transcript"/"summary" columns are used further down.
dataset = load_dataset("huuuyeah/meetingbank")

In [None]:
# Display the first training example to inspect its fields.
dataset['train'][0]

Set up the tokenizer

In [None]:
# Tokenizer for the base checkpoint; the same `checkpoint` name is reused
# later to load the model that gets fine-tuned.
from transformers import AutoTokenizer
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Task prefix that preprocess_function prepends to every transcript.
prefix = "summarize: "

Preprocess

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Each entry of ``examples["transcript"]`` gets the module-level ``prefix``
    prepended and is tokenized with truncation at 1024 tokens.  The entries of
    ``examples["summary"]`` are tokenized as targets with truncation at 128
    tokens, and their ``input_ids`` are stored under ``"labels"``.

    Returns the tokenizer output for the inputs, extended with ``"labels"``.
    """
    prompts = []
    for transcript in examples["transcript"]:
        prompts.append(prefix + transcript)

    encoded = tokenizer(prompts, max_length=1024, truncation=True)
    targets = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )

    encoded["labels"] = targets["input_ids"]
    return encoded

# Apply preprocess_function to the dataset in batches (batched=True means
# the function receives lists of transcripts/summaries, not single rows).
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the trainer.
# NOTE(review): `checkpoint` is the checkpoint *name* string here, not the
# model object (the model is only loaded later in the Train section) —
# confirm whether the collator is meant to receive the loaded model instead.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Evaluation

In [None]:
import evaluate

# ROUGE metric object; compute_metrics calls rouge.compute(...) on decoded text.
rouge = evaluate.load("rouge")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.  If
            ``predictions`` is itself a tuple, its first element is taken as
            the generated ids.

    Returns:
        dict: every entry returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-pad tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a masking/padding sentinel, not a vocabulary id, so it must be
    # replaced before decoding.  The original only did this for `labels`;
    # predictions can carry the same sentinel, which breaks batch_decode and
    # would be counted as real tokens in gen_len.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-pad tokens in each prediction row.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Train

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same checkpoint name used for the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Training configuration: checkpoints go to "meeting_summarizer_model",
# evaluation runs once per epoch, and predict_with_generate=True makes
# evaluation use generated sequences (which compute_metrics then decodes).
# NOTE(review): some newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="meeting_summarizer_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
)

# Wire together the model, tokenized splits, collator and metric function.
# Note that the "test" split is what gets evaluated each epoch.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

Save model (push to the Hugging Face Hub)

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login
# performed in the first cell).
trainer.push_to_hub()

# Inference

In [None]:
# Sample meeting transcript used to smoke-test the fine-tuned summarizer.
# The text is written as two adjacent string literals inside parentheses:
# a plain double-quoted literal cannot span two physical lines, so the
# original form was an unterminated string literal.
text = (
    "I am secretary. Please close the voting and announce the results. It is eight is final consideration of Council Bill 1013 with its public hearing has been postponed until Monday, December 10th. Madam Secretary, if you please put the next item up on our screens and Councilwoman Black, will you please be accountable? 1006 on the floor for passage? Yes. I move that council bill 1006 be placed upon final consideration and do pass. It has been moved and seconded. Councilwoman Black, your motion to postpone. I move that final consideration of Council Bill 18, dash 1006 with its public hearing be postponed to Monday, November 19th, 2018. And it looks like that has been moved and seconded. Questions or comments from members of Council Councilman Black. This postponement was requested by the applicant and is not a reflection on the merits of the application. All right. See no other questions or comments. Madam Secretary, Raquel. Black Eye. Espinosa. Hi. Flynn. I. Cashman. Hi. Lopez. I knew Ortega. I. Mr. President. Madam Secretary, please close voting and announce results. Eight Eyes Final Consideration of Council Bill 1006 with its public hearing has been postponed until Monday, November 19th. All right. Wraps up everything that was called out, all other bills for introduction or published. And we are now ready for the block vote on resolutions and bills on final consideration. Council members remember that this is a consent or block vote and you will need to vote I. Otherwise, this is your last chance to call it an item for a separate vote. Councilman Black, would you please put the resolutions for adoption and the bills on final consideration for final passage on the floor? Yes, I move that resolutions be adopted and bills and final, final consideration be placed upon final consideration and do pass in a block for the following items. All Series 18 1180 1224 1230. 1097 1220 1221 1228 zero 936 1188 1332 1198 1199 1200 1201 1196. That's it. All right. "
    "Thank you, Councilman Black. It has been moved and seconded. Madam Secretary, roll call. Black eye. Espinosa. Hi. Flynn. Hi. Cashman. Hi. Lopez. All right. Ortega. Hi, Mr. President. I am secretary. Please close the voting. Announce the results. 88 ayes. The resolutions have been adopted and bills have been placed upon final consideration and do pass. Tonight there will be a required public hearing on Council Bill 996 changing the zoning classification of 374023850 York Street in the Clayton neighborhood."
)
from transformers import pipeline

# Load the fine-tuned model from the Hub and summarize the sample text.
summarizer = pipeline("summarization", model="cameronslee/meeting_summarizer_model")
summarizer(text)

# Retrieve Transcript

In [None]:
import whisper_timestamped as whisper
import json
def get_transcript(input_file):
    """Transcribe ``input_file`` with Whisper and save the result as JSON.

    The audio is loaded from ``input_file``, transcribed in English with the
    "base" Whisper model, and the full result is written next to the input
    using the same path with the extension replaced by ``.json``.

    Returns the transcription result object that was written to disk.
    """
    base_path = os.path.splitext(input_file)[0]

    waveform = whisper.load_audio(input_file)
    asr_model = whisper.load_model("base")
    transcription = whisper.transcribe(asr_model, waveform, language="en")

    # Persist the complete result so it can be reloaded without re-running
    # the transcription.
    json_path = base_path + ".json"
    with open(json_path, 'w', encoding='utf-8') as out_handle:
        json.dump(transcription, out_handle, indent=2, ensure_ascii=False)

    return transcription

# Transcribe the sample video; get_transcript also writes "test1.json"
# alongside it.
input_file = "test1.mp4"
transcript = get_transcript(input_file)

In [None]:
# Reload the transcription result that get_transcript saved to disk.
# A context manager closes the file handle (the original left it open),
# and the encoding matches how the file was written (UTF-8 with
# ensure_ascii=False), so non-ASCII text reads back correctly on any platform.
with open('test1.json', "r", encoding="utf-8") as f:
    data = json.load(f)

# Only the plain transcript text is needed for summarization.
transcript = data['text']

data

In [None]:
def get_summary(transcript):
    """Summarize ``transcript`` with the notebook-level ``summarizer`` pipeline.

    The literal task prefix "summarize: " is prepended to the transcript
    before it is handed to the pipeline; whatever the pipeline returns is
    passed back unchanged.
    """
    prompt = "summarize: " + transcript
    return summarizer(prompt)
# Summarize the transcript text loaded from test1.json.
get_summary(transcript)