In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

import os
import random
import warnings
from sklearn.model_selection import train_test_split

import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings(action='ignore') 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
CFG = {
    'SR':16000
}
MODEL_NAME = "openai/whisper-large-v2"

In [3]:
df = pd.read_csv('./speech_data/train.csv')
df.path = df.path.str.replace('./','./speech_data/')

In [4]:
def speech_file_to_array_fn(df):
    feature = []
    for path in tqdm(df['path']):
        speech_array, _ = librosa.load(path, sr=CFG['SR'])
        feature.append(speech_array)
    return feature

In [5]:
df_x = speech_file_to_array_fn(df)


  0%|          | 0/5001 [00:00<?, ?it/s]

In [6]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model.config.forced_decoder_ids = None

In [7]:
input_features = processor(df_x[1],sampling_rate=CFG['SR'],return_tensors="pt").input_features
input_features = input_features.to(device)
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [8]:
model = model.to(device)
transcript_list = []
for i in tqdm(df_x):
    input_features = processor(i,sampling_rate=CFG['SR'],return_tensors="pt").input_features
    input_features = input_features.to(device)
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcript_list.append(transcription[0])

  0%|          | 0/5001 [00:00<?, ?it/s]

In [9]:
df['transcript'] = transcript_list

In [10]:
df

Unnamed: 0,id,path,label,transcript
0,TRAIN_0000,./speech_data/train/TRAIN_0000.wav,1,It's 11 o'clock.
1,TRAIN_0001,./speech_data/train/TRAIN_0001.wav,2,The surface is slick.
2,TRAIN_0002,./speech_data/train/TRAIN_0002.wav,4,We'll stop in a couple of minutes.
3,TRAIN_0003,./speech_data/train/TRAIN_0003.wav,5,Maybe tomorrow it will be cold.
4,TRAIN_0004,./speech_data/train/TRAIN_0004.wav,4,Don't forget a jacket.
...,...,...,...,...
4996,TRAIN_4996,./speech_data/train/TRAIN_4996.wav,5,Maybe tomorrow it will be cold.
4997,TRAIN_4997,./speech_data/train/TRAIN_4997.wav,0,It's 11 o'clock!
4998,TRAIN_4998,./speech_data/train/TRAIN_4998.wav,1,I wonder what this is about.
4999,TRAIN_4999,./speech_data/train/TRAIN_4999.wav,1,That is exactly what happened.


In [11]:
df.to_csv('./train_with_transcript.csv', encoding = 'utf-8',index = False)