In [2]:
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

2.3.1
2.3.1
cuda


In [3]:
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

print("Sample Rate:", bundle.sample_rate)

print("Labels:", bundle.get_labels())

Sample Rate: 16000
Labels: ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')


In [4]:
class GreedyCTCDecoder(torch.nn.Module):
    def __init__(self, labels, blank=0):
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string
        Args:
          emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.

        Returns:
          str: The resulting transcript
        """
        indices = torch.argmax(emission, dim=-1)  # [num_seq,]
        indices = torch.unique_consecutive(indices, dim=-1)
        indices = [i for i in indices if i != self.blank]
        return "".join([self.labels[i] for i in indices])

In [5]:
model = bundle.get_model().to(device)

print(model.__class__)

<class 'torchaudio.models.wav2vec2.model.Wav2Vec2Model'>


In [6]:
waveform, sample_rate = torchaudio.load("E:/project_summer/welcome_modified.wav")
waveform = waveform.to(device)

if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

In [8]:
with torch.inference_mode():
    emission, _ = model(waveform)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [9]:
decoder = GreedyCTCDecoder(labels=bundle.get_labels())
transcript = decoder(emission[0])

In [10]:
transcript

'THE|FOLLOWING|IS|AN|INTERVIEW|WITH|THE|PRES'

In [1]:
from pydub import AudioSegment
import os

# Load the audio file
audio = AudioSegment.from_file("E:\\chd\\putin_interview.wav")

# Define the number of chunks
num_chunks = 3024
chunk_length = len(audio) // num_chunks

# Create a directory to save chunks if it doesn't exist
output_dir = "E:\\chd\\audio_chunks"
os.makedirs(output_dir, exist_ok=True)

# Split the audio into chunks
audio_chunks = [audio[i*chunk_length:(i+1)*chunk_length] for i in range(num_chunks)]

# Save each chunk to the output directory with its index as the filename
for i, chunk in enumerate(audio_chunks):
    chunk.export(os.path.join(output_dir, f"chunk_{i}.wav"), format="wav")




In [6]:
import os

def list_files(directory):
    # Get the list of all files and directories in the specified directory
    files = os.listdir(directory)
    
    # Filter out only the files from the list
    files = [file for file in files if os.path.isfile(os.path.join(directory, file))]
    
    return files

# Example usage:
directory_path = 'E:/project_summer/data/audio_chunks'  # Replace with your directory path
files_list = list_files(directory_path)

print("Files in directory:")
for file_name in files_list:
    print(file_name)


Files in directory:
chunk_0.wav
chunk_1.wav
chunk_10.wav
chunk_100.wav
chunk_1000.wav
chunk_1001.wav
chunk_1002.wav
chunk_1003.wav
chunk_1004.wav
chunk_1005.wav
chunk_1006.wav
chunk_1007.wav
chunk_1008.wav
chunk_1009.wav
chunk_101.wav
chunk_1010.wav
chunk_1011.wav
chunk_1012.wav
chunk_1013.wav
chunk_1014.wav
chunk_1015.wav
chunk_1016.wav
chunk_1017.wav
chunk_1018.wav
chunk_1019.wav
chunk_102.wav
chunk_1020.wav
chunk_1021.wav
chunk_1022.wav
chunk_1023.wav
chunk_1024.wav
chunk_1025.wav
chunk_1026.wav
chunk_1027.wav
chunk_1028.wav
chunk_1029.wav
chunk_103.wav
chunk_1030.wav
chunk_1031.wav
chunk_1032.wav
chunk_1033.wav
chunk_1034.wav
chunk_1035.wav
chunk_1036.wav
chunk_1037.wav
chunk_1038.wav
chunk_1039.wav
chunk_104.wav
chunk_1040.wav
chunk_1041.wav
chunk_1042.wav
chunk_1043.wav
chunk_1044.wav
chunk_1045.wav
chunk_1046.wav
chunk_1047.wav
chunk_1048.wav
chunk_1049.wav
chunk_105.wav
chunk_1050.wav
chunk_1051.wav
chunk_1052.wav
chunk_1053.wav
chunk_1054.wav
chunk_1055.wav
chunk_1056.wav
chun

In [7]:
import re

# List of filenames
filenames = files_list

# Define a function to extract the index from the filename
def extract_index(filename):
    # Use regular expression to find the index
    match = re.search(r'\d+', filename)
    if match:
        return int(match.group())
    else:
        return float('inf')  # return a very large number if no index found

# Sort filenames based on the extracted index
sorted_filenames = sorted(filenames, key=extract_index)

# Print the sorted filenames
# for filename in sorted_filenames:
#     print(filename)
sorted_filenames 

['chunk_0.wav',
 'chunk_1.wav',
 'chunk_2.wav',
 'chunk_3.wav',
 'chunk_4.wav',
 'chunk_5.wav',
 'chunk_6.wav',
 'chunk_7.wav',
 'chunk_8.wav',
 'chunk_9.wav',
 'chunk_10.wav',
 'chunk_11.wav',
 'chunk_12.wav',
 'chunk_13.wav',
 'chunk_14.wav',
 'chunk_15.wav',
 'chunk_16.wav',
 'chunk_17.wav',
 'chunk_18.wav',
 'chunk_19.wav',
 'chunk_20.wav',
 'chunk_21.wav',
 'chunk_22.wav',
 'chunk_23.wav',
 'chunk_24.wav',
 'chunk_25.wav',
 'chunk_26.wav',
 'chunk_27.wav',
 'chunk_28.wav',
 'chunk_29.wav',
 'chunk_30.wav',
 'chunk_31.wav',
 'chunk_32.wav',
 'chunk_33.wav',
 'chunk_34.wav',
 'chunk_35.wav',
 'chunk_36.wav',
 'chunk_37.wav',
 'chunk_38.wav',
 'chunk_39.wav',
 'chunk_40.wav',
 'chunk_41.wav',
 'chunk_42.wav',
 'chunk_43.wav',
 'chunk_44.wav',
 'chunk_45.wav',
 'chunk_46.wav',
 'chunk_47.wav',
 'chunk_48.wav',
 'chunk_49.wav',
 'chunk_50.wav',
 'chunk_51.wav',
 'chunk_52.wav',
 'chunk_53.wav',
 'chunk_54.wav',
 'chunk_55.wav',
 'chunk_56.wav',
 'chunk_57.wav',
 'chunk_58.wav',
 'chunk

In [10]:
files_list=sorted_filenames

In [11]:
files_list

['chunk_0.wav',
 'chunk_1.wav',
 'chunk_2.wav',
 'chunk_3.wav',
 'chunk_4.wav',
 'chunk_5.wav',
 'chunk_6.wav',
 'chunk_7.wav',
 'chunk_8.wav',
 'chunk_9.wav',
 'chunk_10.wav',
 'chunk_11.wav',
 'chunk_12.wav',
 'chunk_13.wav',
 'chunk_14.wav',
 'chunk_15.wav',
 'chunk_16.wav',
 'chunk_17.wav',
 'chunk_18.wav',
 'chunk_19.wav',
 'chunk_20.wav',
 'chunk_21.wav',
 'chunk_22.wav',
 'chunk_23.wav',
 'chunk_24.wav',
 'chunk_25.wav',
 'chunk_26.wav',
 'chunk_27.wav',
 'chunk_28.wav',
 'chunk_29.wav',
 'chunk_30.wav',
 'chunk_31.wav',
 'chunk_32.wav',
 'chunk_33.wav',
 'chunk_34.wav',
 'chunk_35.wav',
 'chunk_36.wav',
 'chunk_37.wav',
 'chunk_38.wav',
 'chunk_39.wav',
 'chunk_40.wav',
 'chunk_41.wav',
 'chunk_42.wav',
 'chunk_43.wav',
 'chunk_44.wav',
 'chunk_45.wav',
 'chunk_46.wav',
 'chunk_47.wav',
 'chunk_48.wav',
 'chunk_49.wav',
 'chunk_50.wav',
 'chunk_51.wav',
 'chunk_52.wav',
 'chunk_53.wav',
 'chunk_54.wav',
 'chunk_55.wav',
 'chunk_56.wav',
 'chunk_57.wav',
 'chunk_58.wav',
 'chunk

In [12]:
texts=[]
for file in files_list:
    waveform, sample_rate = torchaudio.load(f"E:/project_summer/data/audio_chunks/{file}")
    waveform = waveform.to(device)

    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
    with torch.inference_mode():
        emission, _ = model(waveform)
    decoder = GreedyCTCDecoder(labels=bundle.get_labels())
    transcript = decoder(emission[0])
    texts.append(transcript)

In [13]:
texts

['THE|FOLLOWING|IS|AN|INTERVIEW|WITH|THE|PRES',
 'AN|OF|RUSSHOF|LADIMIR|PUTEN|',
 'SHOT|FEBRUARY|SIXTH|TWENTY|TWENTY|FOU',
 'RD|ABOUT|SEVEN|P|M|IN|THE|BUILDING|BEHIND|US|WHICH|IS|',
 'ORSE|THE|KREMLET|THE|INTERVIEW|AS|YOU',
 'SEE|IF|YOU|WATCH|IT|AS|PRIMARILY|ABOUT|TH|E|WAR|',
 'IN|PROGRESS|THE|WAREN|YOU|CRANE|HOW|IT|STARTED|',
 "WHAT'S|HAPPENING|AND|MOST|PRESSI",
 'LY|HOW|IT|MIGHT|AUNT|ONE|NOTE|BEFORE|YO',
 'O|WATCH|AT|THE|BEGINNING|THE|INTERVIEW|WE|ASKED|THEM|',
 'OST|OBVIOUS|QUESTION|WHICH|IS|WHY|DID|YOU|DO|THIS|',
 'DID|YOU|FEEL|A|THREAT|AND|THE|EMMINENT|PHYSICAL|THREAT|',
 "AND|THAT'S|YOUR|JUSTIFICATION|AND|THE|ANSWER|WE|",
 'OT|SHOCKED|US|PUDEN',
 '|WENT|ON|FOR|A|VERY|LONG|TIME|PROBABLY|H',
 'ALF|AN|HOUR|ABOUT|THE|HISTORY|RUSSIA|GOING',
 '|BACK|TO|THE|EIGHTH|CENTURY|AND|',
 'HONESTLY|WE|THOUGHT|THIS|WAS|A|FI',
 'BUSTERING|TECHNIQUE|AND|FOUNDIT|ANNOING|INTERRUPTED|HIM',
 'EVERAL|TIMES|AND|HE|RESPONDED|HE|WAS|ANNOYED|',
 'BY|THE|INTERRUPTION|BUT|WE|C',
 "|INCLUDED|IN|THE|END|FOR|WHA

In [6]:
file_path = r'E:\project_summer\texts.txt'

# Initialize an empty list to store lines
texts = []

# Open the file in read mode
with open(file_path, 'r') as file:
    # Read all lines into the list
    lines = file.readlines()

# Optionally, you can strip newline characters from each line
texts = [line.strip() for line in lines]

# Now, 'lines' contains the contents of the file as a list of strings
print(texts)

['THE|FOLLOWING|IS|AN|INTERVIEW|WITH|THE|PRES', 'AN|OF|RUSSHOF|LADIMIR|PUTEN|', 'OST|OBVIOUS|QUESTION|WHICH|IS|WHY|DID|YOU|DO|THIS|', 'ADOPTED|TH|ORTHODOXY|A|E', 'IS|IS|A|LARGE|NUMBER|OF|PEOPLE|YBEBI', 'ITE|TE|LITAR|AND|IT|WAS|VERY|DIFFICULT|TO|SW', 'AY|THIS|ELECTORATE|WHICH|HAD|THE|POSITIVE|', '|SHOW|TOWARDS|RUSSHASIT|QUITE|TIPISHOL|FIX|', 'URIYANIKOVITCH|CAME|TO|POWER|AND|HA', 'I|IN|RESTON||BEIN||FIRST|TIME|HE|WON|OFF|TE|PR', 'AS|THEN|KUCHMA|THEY|ORGANIZED|THE|THIRD|RAU', 'ONDE|WHICH|IS|NOT|PROVIDED|FOR|IN|THE|CONS', 'TITUTION|OF|YOUR|CRAING|MUST|BE|ETTO|THIS|IS|', 'COURDETAR|TE|JUST|IMAGINE|SOM', 'STER|IN|CHRISTIANITY|CENTRLIZON|ERECIS|', "O|ONE|IN|THE|UNITED|STATES||WOULDN'T|LIKE|THE|OU", 'COME|YES|IN|TWO|THOUSAND|FOURTEEN|', 'AND|YET|BEFORE|THAT|', 'NO|THIS|WAS|BEFORE|THAT|AFTER|', 'EZDEN|KUCHMA|VICTORIAN|OVICWON|THE|L', 'ECTIONS|HOWEVER|HIS|OPPONENTS|', 'ID|NOT|RECOGNIZE|THAT|VICTORYOR|DE|U|S|', 'UPPORTED|THE|OPPOSITION|AND|THE|THIRD|ROUND|', 'SCATTU|RAP|AND|USUALLY|AMRISTES|', '

In [14]:
# file_path = r"E:\project_summer\texts.txt"

# Open the file in write mode ('w')
# with open(file_path+"2", 'w') as file:
#     # Write each string from the list to the file, one per line
#     for string in texts:
#         file.write(string + "\n")
    
# print(f"Strings have been saved to {file_path}")

Strings have been saved to E:\project_summer\texts.txt


In [15]:
def replace_pipes_with_spaces(text):
    return text.replace('|', ' ')

# Example usage:
text = "THE|FOLLOWING|IS|AN|INTERVIEW|WITH|THE|PRES"
result = replace_pipes_with_spaces(text)
print(result)


THE FOLLOWING IS AN INTERVIEW WITH THE PRES


In [16]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the dataset
# data = pd.read_csv('sentences.csv')
sentences = [replace_pipes_with_spaces(item) for item in texts]

# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(sentences)

# Save embeddings to a file for further use
np.save('embeddings.npy', embeddings)


  from tqdm.autonotebook import tqdm, trange


In [17]:
sentences

['THE FOLLOWING IS AN INTERVIEW WITH THE PRES',
 'AN OF RUSSHOF LADIMIR PUTEN ',
 'SHOT FEBRUARY SIXTH TWENTY TWENTY FOU',
 'RD ABOUT SEVEN P M IN THE BUILDING BEHIND US WHICH IS ',
 'ORSE THE KREMLET THE INTERVIEW AS YOU',
 'SEE IF YOU WATCH IT AS PRIMARILY ABOUT TH E WAR ',
 'IN PROGRESS THE WAREN YOU CRANE HOW IT STARTED ',
 "WHAT'S HAPPENING AND MOST PRESSI",
 'LY HOW IT MIGHT AUNT ONE NOTE BEFORE YO',
 'O WATCH AT THE BEGINNING THE INTERVIEW WE ASKED THEM ',
 'OST OBVIOUS QUESTION WHICH IS WHY DID YOU DO THIS ',
 'DID YOU FEEL A THREAT AND THE EMMINENT PHYSICAL THREAT ',
 "AND THAT'S YOUR JUSTIFICATION AND THE ANSWER WE ",
 'OT SHOCKED US PUDEN',
 ' WENT ON FOR A VERY LONG TIME PROBABLY H',
 'ALF AN HOUR ABOUT THE HISTORY RUSSIA GOING',
 ' BACK TO THE EIGHTH CENTURY AND ',
 'HONESTLY WE THOUGHT THIS WAS A FI',
 'BUSTERING TECHNIQUE AND FOUNDIT ANNOING INTERRUPTED HIM',
 'EVERAL TIMES AND HE RESPONDED HE WAS ANNOYED ',
 'BY THE INTERRUPTION BUT WE C',
 " INCLUDED IN THE END FOR WHA

In [9]:
embeddings[0]

array([-7.42278472e-02,  1.38715133e-01,  4.33864035e-02,  9.53941327e-03,
        1.03020826e-02,  7.35115856e-02,  2.54281890e-02,  2.17058342e-02,
       -1.88467428e-02,  4.06900235e-02, -2.77303420e-02,  2.90727168e-02,
       -5.50810732e-02, -7.44152740e-02, -7.59306774e-02, -9.97447036e-03,
        9.77737978e-02, -9.48362723e-02, -1.54412659e-02,  2.59865467e-02,
       -4.32665199e-02, -2.42114887e-02,  2.19918098e-02,  4.87505384e-02,
        3.16648036e-02,  1.94319384e-03,  4.35861119e-04,  6.68247230e-03,
        6.22695945e-02,  4.58670743e-02,  3.43116955e-03, -3.42065170e-02,
        1.10349707e-01,  5.85297085e-02, -3.94177996e-02,  6.77843466e-02,
        9.52500924e-02, -1.19562466e-02,  6.31892532e-02,  1.84966642e-02,
        1.05395890e-03, -9.57159325e-02, -1.38647705e-02,  4.19445075e-02,
       -4.45200801e-02,  1.45932031e-03,  2.60545164e-02,  3.50528583e-02,
       -5.21483757e-02, -1.12036169e-02, -5.83827682e-02, -5.54148145e-02,
        3.33681516e-02, -

In [24]:
search_query ="presedent"

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming `model.encode([search_query])` gives you `query_embedding` and `embeddings` is your list of embeddings
query_embedding = model.encode([search_query])

# Compute cosine similarity between query_embedding and embeddings
similarities = cosine_similarity(query_embedding, embeddings)

# Get indices of top 10 most similar sentences (original indices in `sentences` list)
top_10_indices = similarities.argsort()[0][-10:][::-1]

# Retrieve the top 10 sentences and their original indices
top_10_sentences = [(sentences[i], i) for i in top_10_indices]

# Now `top_10_sentences` is a list of tuples where each tuple contains (sentence, original_index)
for sentence, original_index in top_10_sentences:
    print(f"Original Index: {original_index}, Sentence: {sentence}")

Original Index: 2010, Sentence: OR DOES IT NOT MATTER WHO THE PRESIDERT IS 
Original Index: 1558, Sentence: THE FORMER PRESIDENT CONDELISE SE 
Original Index: 1505, Sentence: MOREOVER THE PRESIDENT YOU RE
Original Index: 434, Sentence:  FOR THE FIRST TWENTY TWO YEARS AS PRESIDENT 
Original Index: 2345, Sentence: HIMSELF PRESIDENT THEN HE IS RECOGNIS
Original Index: 0, Sentence: THE FOLLOWING IS AN INTERVIEW WITH THE PRES
Original Index: 819, Sentence: ALSO THE PRESIDENT OF THE UNITED STATES 
Original Index: 1223, Sentence: LL OTHER OFFICIALS AND THEN PRESIDENT HIMS
Original Index: 1422, Sentence: AGINED COCTHE PRESIDENT OF YOU CRAN HIM
Original Index: 332, Sentence: CAME PRESIDENT TWENTY FOUR YEARS A


In [26]:
def convert_seconds(seconds):
    hours = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    
    return hours, minutes, seconds
se,ori=top_10_sentences[0]
# Example usage:
total_seconds = ori*2.53  # Replace with any number of seconds you want to convert
hours, minutes, seconds = convert_seconds(total_seconds)

print(f"{total_seconds} seconds is equal to {hours} hours, {minutes} minutes, and {seconds} seconds.")


5085.299999999999 seconds is equal to 1.0 hours, 24.0 minutes, and 45.29999999999927 seconds.


In [27]:
import IPython

In [28]:
IPython.display.Audio(f"E:/project_summer/data/audio_chunks/chunk_{ori}.wav")

In [20]:
import wave

def get_audio_length(file_path):
    with wave.open(file_path, 'rb') as audio_file:
        # Get the total number of frames
        total_frames = audio_file.getnframes()
        # Get the frame rate (frames per second)
        frame_rate = audio_file.getframerate()
        # Calculate the duration of the audio file
        duration = total_frames / float(frame_rate)
        return duration

# Example usage:
audio_file_path = r'E:\project_summer\data\audio_chunks\chunk_0.wav'
length_in_seconds = get_audio_length(audio_file_path)
print(f"Length of audio file '{audio_file_path}' is {length_in_seconds:.2f} seconds")


Length of audio file 'E:\project_summer\data\audio_chunks\chunk_0.wav' is 2.53 seconds
