<a href="https://colab.research.google.com/github/anupkashyap/podcast-summarizer/blob/main/Podcast_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Podcast summarizer using the PEGASUS abstractive summarization model**

Dependencies
1. PyTorch
2. Hugging Face Transformers
3. SentencePiece
4. Pydub
5. SpeechRecognition

In [None]:
!pip install torch
!pip install sentencePiece
!pip install SpeechRecognition
!pip install transformers
!pip install pydub

Import packages

In [None]:
import speech_recognition as sr
import os
import math
from pydub import AudioSegment
from pydub.silence import split_on_silence
from transformers import  PegasusForConditionalGeneration, PegasusTokenizer


In [None]:
# Configuration: values a reader may want to tune before running the notebook.
test_file = "test.mp3"  # audio file handed to the transcription step
outputLines = 20        # number of text chunks the transcript is split into
debug = True            # when true, helper functions print progress details

In [None]:
# Load models
# The tokenizer and the model are loaded from one shared checkpoint name so
# that the two can never be pointed at different checkpoints by accident.
PEGASUS_CHECKPOINT = "google/pegasus-xsum"

tokenizer = PegasusTokenizer.from_pretrained(PEGASUS_CHECKPOINT)
model = PegasusForConditionalGeneration.from_pretrained(PEGASUS_CHECKPOINT)

Helpers

In [None]:
def process_audio_chunk(chunk, index=0):
  """Export one audio chunk to a WAV file and load it for speech recognition.

  The chunk is padded with a short silence on both sides, written to
  ``chunk<index>.wav`` in the current working directory and read back through
  ``speech_recognition``.

  Args:
    chunk: Audio segment to process (it is concatenated with pydub silence
      segments and exported, so it is expected to be a pydub ``AudioSegment``).
    index: Number used in the scratch file name. Defaults to 0, so callers
      that do not pass an index reuse (overwrite) ``chunk0.wav``.

  Returns:
    The value returned by ``Recognizer.listen`` for the exported file.
  """
  padding = AudioSegment.silent(duration=10)
  audio_chunk = padding + chunk + padding
  # The file name is built from the explicit ``index`` argument instead of a
  # loop counter that only exists in the caller's local scope.
  chunk_file = "chunk{0}.wav".format(index)
  if debug:
    print("Creating {0}".format(chunk_file))
  audio_chunk.export(chunk_file, bitrate='192k', format="wav")
  recognizer = sr.Recognizer()
  with sr.AudioFile(chunk_file) as source:
    # NOTE(review): the calibration call reads from the start of the source;
    # with only a very short leading silence it may consume speech. Confirm
    # whether calibration is wanted for file input.
    recognizer.adjust_for_ambient_noise(source)
    return recognizer.listen(source)

In [None]:
def read_audio_file(file_name):
  """Load an audio file, choosing the decoder from the file extension.

  The extension check is case-insensitive, so ``talk.MP3`` and ``talk.wav``
  are both accepted.

  Args:
    file_name: Path of the audio file; must end in ``.mp3`` or ``.wav``.

  Returns:
    The object produced by ``AudioSegment.from_mp3`` / ``from_wav``, or
    ``None`` (after printing a message) when the extension is not supported.
  """
  lowered_name = file_name.lower()
  if lowered_name.endswith(".mp3"):
    # Extract from mp3
    return AudioSegment.from_mp3(file_name)
  if lowered_name.endswith(".wav"):
    # Extract from wav
    return AudioSegment.from_wav(file_name)
  print("Invalid Input format")
  return None

In [None]:
def preprocess_text(text_data):
  """Split the transcript into ``outputLines`` equally sized character chunks.

  Args:
    text_data: List of lines (for example from ``readlines()``); only the
      first line is used. A plain string is used as-is, and an empty list is
      treated as an empty transcript.

  Returns:
    A list of exactly ``outputLines`` strings. Trailing entries are empty
    strings when the text runs out before the last chunk.
  """
  # Read from the argument, not from a notebook-level variable, so the
  # function works no matter what the caller named its data.
  if isinstance(text_data, str):
    text = text_data
  elif text_data:
    text = text_data[0]
  else:
    text = ""
  length = len(text)
  chunk_size = math.ceil(length / outputLines)
  if debug:
    print(length)
    print(chunk_size)
  return [text[i * chunk_size:(i + 1) * chunk_size] for i in range(outputLines)]



Transcribing audio files

In [None]:
def transcribe_audio(file_name):
  """Transcribe an audio file chunk by chunk and save the transcript.

  The audio is split on silence, every chunk is sent to
  ``Recognizer.recognize_google`` and the recognised pieces are joined with
  spaces. Chunk files and the transcript (``outputText.txt``) are written
  inside the ``temp_audio_chunks`` directory.

  Args:
    file_name: Path of the audio file to transcribe.

  Returns:
    The transcript text that was written to ``outputText.txt``.

  Raises:
    ValueError: If ``read_audio_file`` returned no audio for ``file_name``.
  """
  print("Transcribing audio")
  audio = read_audio_file(file_name)
  if audio is None:
    # Fail early with a clear message instead of crashing while splitting.
    raise ValueError("Could not load audio from {0}".format(file_name))
  # Create directory
  try:
    os.mkdir("temp_audio_chunks")
  except FileExistsError:
    if debug:
      print("Directory already exists")
  if debug:
    print(audio)
  # Process audio
  audio_chunks = split_on_silence(audio, min_silence_len=800, silence_thresh=-32)
  if debug:
    print("Number of chunks is " + str(len(audio_chunks)))
  recognizer = sr.Recognizer()
  pieces = []
  original_dir = os.getcwd()
  os.chdir("temp_audio_chunks")
  try:
    for chunk in audio_chunks:
      audio_listened = process_audio_chunk(chunk)
      try:
        pieces.append(recognizer.recognize_google(audio_listened))
      except sr.UnknownValueError:
        if debug:
          print("Audio not clear")
      except sr.RequestError:
        print("Could not connect to API. Check internet connection")
    text = " ".join(pieces)
    # The context manager closes the file, so the transcript is fully on
    # disk before any later cell reads it.
    with open("outputText.txt", "w") as output_file:
      output_file.write(text)
  finally:
    # Always return to the starting directory, even when a chunk fails.
    os.chdir(original_dir)
  return text

Abstractive summarization.

In [None]:
def summarize(textChunks):
  """Summarize every text chunk with the loaded model and join the results.

  Args:
    textChunks: Iterable of strings. Empty or whitespace-only chunks are
      skipped.

  Returns:
    A single string with one summary per non-empty chunk, separated by
    spaces. Each summary ends with sentence punctuation; a "." is added when
    the decoded text has none. Returns "" when there is nothing to summarize.
  """
  summaries = []
  # Iterate over the chunks themselves so the first chunk is included and a
  # short list cannot cause an index error.
  for chunk in textChunks:
    if not chunk.strip():
      continue
    token = tokenizer(chunk, truncation=True, padding="longest", return_tensors="pt")
    generated = model.generate(**token)
    # skip_special_tokens keeps padding / end-of-sequence markers out of the text.
    sentence = tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    if not sentence:
      continue
    if sentence[-1] not in ".!?":
      sentence += "."
    summaries.append(sentence)
  return " ".join(summaries)

In [None]:
# Run the full pipeline: transcribe -> split into chunks -> summarize.
transcribe_audio(test_file)
# transcribe_audio writes the transcript while its working directory is
# temp_audio_chunks, so the file has to be read back from that directory.
transcript_path = os.path.join("temp_audio_chunks", "outputText.txt")
with open(transcript_path, "r") as textFile:
  textData = textFile.readlines()
textChunks = preprocess_text(textData)
summary = summarize(textChunks)
print(summary)