##TASK 1: Transcripting the audio file into text

In [None]:
!python --version
!pip install -q transformers
!pip install -q pydub
!pip install torch

Python 3.10.12
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
import librosa
import torch
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def load_wav2vec_960h_model():
  tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
  return tokenizer, model

def convert_corrected_uppercase(input_text):
  sentences = nltk.sent_tokenize(input_text)
  return (' '.join([st.replace(st[0],st[0].capitalize(),1) for st in sentences]))

In [None]:
def asr_transcript(tokenizer, model, input_file):
  speech, sample_rate = sf.read(input_file)
  #make it 1-D
  if len(speech.shape) > 1:
      speech = speech[:,0] + speech[:,1]
  #Resample to 16khz
  if sample_rate != 16000:
      speech = librosa.resample(speech, sample_rate, 16000)
  #tokenize
  input_val = tokenizer(speech, return_tensors="pt").input_values
  #take logits
  logits = model(input_val).logits
  #take argmax (find most probable word id)
  pred_ids = torch.argmax(logits, dim=-1)
  #get the words from the predicted word ids
  transcription = tokenizer.decode(pred_ids[0])
  #output is all uppercase, make only the first letter in first word capitalized
  transcription = convert_corrected_uppercase(transcription.lower())
  return transcription

In [None]:
wav_input = '/content/drive/MyDrive/Internship_Assignment/sales_call_telephone_marketers.wav'
tokenizer, model = load_wav2vec_960h_model()
text = asr_transcript(tokenizer,model,wav_input)
print(text)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Hello i nancy this is like from eightient incorporation yes how can i help you nancy you have been using our preepa connection for a couple of years now right ye that's right how would you like a postpa connection that allows you to make free unlimited voice calls to three eightent numbers i would love that but what's the catch there's no catch there will be a monthly rental which you will have to pay like any other postpaid connection fantastic sign me up


In [None]:
text

"Hello i nancy this is like from eightient incorporation yes how can i help you nancy you have been using our preepa connection for a couple of years now right ye that's right how would you like a postpa connection that allows you to make free unlimited voice calls to three eightent numbers i would love that but what's the catch there's no catch there will be a monthly rental which you will have to pay like any other postpaid connection fantastic sign me up"

##TASK 2:  Classify intents and recognize entities
##TASK 3:  Convert it to JSON format

In [None]:
!pip install rasa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rasa
  Downloading rasa-3.6.0-py3-none-any.whl (830 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m830.4/830.4 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Collecting PyJWT[crypto]<3.0.0,>=2.0.0 (from rasa)
  Downloading PyJWT-2.7.0-py3-none-any.whl (22 kB)
Collecting SQLAlchemy<1.5.0,>=1.4.0 (from rasa)
  Downloading SQLAlchemy-1.4.48-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
Collecting aio-pika<8.2.4,>=6.7.1 (from rasa)
  Downloading aio_pika-8.2.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiogram<2.26 (from rasa)
  Downloading aiogram-2.25.1-py3-none-any.w

In [None]:
!pip install -U ipython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython
  Downloading ipython-8.14.0-py3-none-any.whl (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.7/798.7 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython)
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 (from ipython)
  Downloading prompt_toolkit-3.0.38-py3-none-any.whl (385 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.8/385.8 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Collecting stack-data (from ipython)
  Downloading stack_data-0.6.2-py3-none-any.whl (24 kB)
Collecting executing>=1.2.0 (from stack-data->ipython)
  Downloading executing-1.2.0-py2.py3-none-any.whl (24 kB)
Collecting asttokens>=2.1.0 (from stack

In [None]:
!rasa init --no-prompt

  Base: DeclarativeMeta = declarative_base()
(0lqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqk(B
(0x(B Rasa Open Source reports anonymous usage telemetry to help improve the product (0x(B
(0x(B for all its users.                                                             (0x(B
(0x(B                                                                                (0x(B
(0x(B If you'd like to opt-out, you can use `rasa telemetry disable`.                (0x(B
(0x(B To learn more, check out https://rasa.com/docs/rasa/telemetry/telemetry.       (0x(B
(0mqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqj(B
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420)

In [4]:
!rasa data convert nlu --data data/nlu.yml --out data/nlu.json -f json

  Base: DeclarativeMeta = declarative_base()
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementin

In [5]:
!rasa train nlu

  Base: DeclarativeMeta = declarative_base()
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementin

In [6]:
!rasa shell nlu

  Base: DeclarativeMeta = declarative_base()
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementin

##TASK 4: Summarize the call

In [None]:
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement datsets (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for datsets[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [None]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")

In [None]:
tokens

{'input_ids': tensor([[ 8087,   532,  3178, 21461,   136,   117,   172,   135,  1965, 23040,
         25129,  2816,   199,   137,   532,   225,   119,  3178, 21461,   119,
           133,   174,   303,   150,  1133, 55694,  1654,   118,   114,   932,
           113,   231,   239,   268, 13983,   120,   131,   116,   268,   199,
           192,   119,   172,   114,   450,  6035,  1654,   120,   871,   119,
           112,   193,   294,  6063,  1773,  2101,   112,   339,  1965,  8057,
          1586,   532,   192,   298,   120,   155,   180,   131,   116,   109,
          2602,   186,   131,   116,   220,  2602,   186,   138,   129,   114,
          2357,  2441,   162,   119,   138,   133,   112,   626,   172,   189,
           176,   450, 16097,  1654,  1949,  1243,   213,   164,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1,

In [None]:
summary = model.generate(**tokens)



In [None]:
summary[0]

tensor([    0,   463,   117,   114,   450,  6035,  1654,   111,   180,   117,
          109,  2602,   186,   117,   220,  2602,   186,   138,   129,   114,
         2357,  2441,   162,   119,   138,   133,   112,   626,   172,   189,
          176,   450, 16097,  1654,  1949,  1243,   213,   164,     1])

In [None]:
tokenizer.decode(summary[0])

'<pad>What is a postpa connection and what is the catch there is no catch there will be a monthly rental which you will have to pay like any other postpaid connection fantastic sign me up</s>'