In [7]:
import os


In [10]:
# List audio feature files.
# os.listdir returns entries in arbitrary order, so sort them to make the
# preview below and every later cell that iterates this list reproducible.
audio_files_lst = sorted(os.listdir("/home/alexandertchk/VSCode/multimodal/how2-dataset/audio_300/concat"))
audio_files_lst[:2]

['-cmG4MzqyjE.npy', '0EZaw5V-LfU.npy']

In [11]:
# List video (action) feature files.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# preview below reproducible between runs.
video_files_lst = sorted(os.listdir("/home/alexandertchk/VSCode/multimodal/how2-dataset/video_action_features"))
video_files_lst[:2]

['0-4j8fghNOE.npy', '00DKFksyVnQ.npy']

In [12]:
# Check whether the audio and video folders contain the same file names.
# Compare as sets: the directory listings carry no meaningful order, so a
# list comparison could report False for identical folder contents.
set(audio_files_lst) == set(video_files_lst)

False

In [14]:
import os

# The source and destination paths
source_folder_path = "/home/alexandertchk/VSCode/multimodal/how2-dataset/en_sum/text"
destination_folder_path = "/home/alexandertchk/VSCode/multimodal/dataset"

# Ensure the destination folder exists
os.makedirs(destination_folder_path, exist_ok=True)

# Your list of audio file names, converted to video IDs
video_ids = [file.split('.')[0] for file in audio_files_lst]
wanted_ids = set(video_ids)  # O(1) membership test while scanning the text files

# Sub folders inside source_folder_path
sub_folders = ['sum_cv', 'sum_devtest', 'sum_train']

# Merge the description and transcript files of all sub folders, keeping only
# the lines whose first field is one of the wanted video IDs.
for text_file_name in ('desc.tok.txt', 'tran.tok.txt'):
    # Read every source file exactly once and bucket the matching lines by ID,
    # instead of re-reading all files for every single video ID.
    lines_by_id = {}
    for sub_folder in sub_folders:
        source_file_path = os.path.join(source_folder_path, sub_folder, text_file_name)
        with open(source_file_path, 'r') as text_in:
            for line in text_in:
                fields = line.split(maxsplit=1)
                # Exact match on the first field: a prefix test (startswith)
                # would also accept IDs that merely begin with a wanted ID.
                if not fields or fields[0] not in wanted_ids:
                    continue
                if not line.endswith('\n'):
                    # Last line of a file without trailing newline: terminate
                    # it so it is not glued to the next record on output.
                    line += '\n'
                lines_by_id.setdefault(fields[0], []).append(line)

    # Write in video_ids order (then sub folder order, then file order), which
    # is the same ordering for both output files so they stay line-aligned.
    with open(os.path.join(destination_folder_path, text_file_name), 'w') as text_out:
        for video_id in video_ids:
            text_out.writelines(lines_by_id.get(video_id, []))


In [16]:
import os

def check_and_delete_non_corresponding_files(audio_path, txt_path):
    """Keep only the videos present in the audio folder AND in both text files.

    WARNING: this is destructive. It rewrites ``tran.tok.txt`` and
    ``desc.tok.txt`` in place and permanently deletes ``.npy`` files from
    ``audio_path``.

    Args:
        audio_path: Folder containing ``<video_id>.npy`` files.
        txt_path: Folder containing ``tran.tok.txt`` and ``desc.tok.txt``;
            the first whitespace-separated field of each line is the video ID.

    Returns:
        The list of ``.npy`` file names (not full paths) that were kept, in
        directory-listing order.
    """
    def _line_id(line):
        # First whitespace-separated field; '' for a blank line. Splitting on
        # any whitespace also strips the newline from an ID-only line.
        fields = line.split(maxsplit=1)
        return fields[0] if fields else ''

    audio_files = [f for f in os.listdir(audio_path) if f.endswith('.npy')]
    tran_tok_file = os.path.join(txt_path, 'tran.tok.txt')
    desc_tok_file = os.path.join(txt_path, 'desc.tok.txt')

    # Read tran.tok.txt and desc.tok.txt files into lists
    with open(tran_tok_file, 'r') as tran_file:
        tran_lines = tran_file.readlines()

    with open(desc_tok_file, 'r') as desc_file:
        desc_lines = desc_file.readlines()

    # A video is kept only if it has an npy file, a transcript line and a
    # description line; checking the transcript alone would leave entries
    # without a description behind.
    npy_video_ids = {os.path.splitext(f)[0] for f in audio_files}
    tran_video_ids = {_line_id(line) for line in tran_lines}
    desc_video_ids = {_line_id(line) for line in desc_lines}
    common_video_ids = npy_video_ids & tran_video_ids & desc_video_ids

    # Filter lines that have corresponding npy files
    tran_lines_filtered = [line for line in tran_lines if _line_id(line) in common_video_ids]
    desc_lines_filtered = [line for line in desc_lines if _line_id(line) in common_video_ids]

    # Write the filtered lines back to tran.tok.txt and desc.tok.txt
    with open(tran_tok_file, 'w') as tran_file:
        tran_file.writelines(tran_lines_filtered)

    with open(desc_tok_file, 'w') as desc_file:
        desc_file.writelines(desc_lines_filtered)

    # Partition the npy files: keep those with a common ID, delete the rest
    npy_files_filtered = []
    for npy_file in audio_files:
        if os.path.splitext(npy_file)[0] in common_video_ids:
            npy_files_filtered.append(npy_file)
        else:
            os.remove(os.path.join(audio_path, npy_file))

    return npy_files_filtered

# Example usage: prune the audio features and the merged text files so that
# they cover the same videos, then report how many feature files remain.
audio_path = '/home/alexandertchk/VSCode/multimodal/how2-dataset/audio_300/concat'
txt_path = '/home/alexandertchk/VSCode/multimodal/dataset/'

filtered_npy_files = check_and_delete_non_corresponding_files(
    audio_path=audio_path,
    txt_path=txt_path,
)
print(len(filtered_npy_files))

1318


### Split the text data from dataset folder

In [17]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

def split_data(dataset_path, train_percent, cv_percent, test_percent):
    """Split desc.tok.txt / tran.tok.txt into train, cv and devtest subsets.

    The two files are split together by line position, so line i of
    ``desc.tok.txt`` and line i of ``tran.tok.txt`` always land in the same
    subset. The subsets are written to ``sum_train_300``, ``sum_cv_300`` and
    ``sum_devtest_300`` below ``dataset_path``. The split is deterministic
    (``random_state=42``).

    Args:
        dataset_path: Folder containing ``desc.tok.txt`` and ``tran.tok.txt``.
        train_percent: Fraction (0-1) of the lines used for training.
        cv_percent: Fraction (0-1) of the lines used for cross-validation.
        test_percent: Fraction (0-1) of the lines used for testing.

    Raises:
        AssertionError: If the three fractions do not add up to 1.0.
        ValueError: If the two files are not paired line by line on video ID.
    """
    import math  # local import: only needed for the tolerance check below

    # The fractions must add up to 1.0. Compare with a tolerance: an exact
    # float comparison rejects valid inputs whose sum is off by rounding error.
    assert math.isclose(train_percent + cv_percent + test_percent, 1.0), "Provided percentages don't add up to 1.0"

    # Define the file paths
    desc_file_path = os.path.join(dataset_path, 'desc.tok.txt')
    tran_file_path = os.path.join(dataset_path, 'tran.tok.txt')

    # Load the files
    with open(desc_file_path, 'r') as file:
        desc_lines = file.readlines()

    with open(tran_file_path, 'r') as file:
        tran_lines = file.readlines()

    # The split pairs the files by position, so a mismatch here would silently
    # put a video's description and transcript into different subsets.
    desc_ids = [line.split(maxsplit=1)[:1] for line in desc_lines]
    tran_ids = [line.split(maxsplit=1)[:1] for line in tran_lines]
    if desc_ids != tran_ids:
        raise ValueError("desc.tok.txt and tran.tok.txt are not aligned line by line on video id")

    # Split the data into train, cv and test: first carve off the training
    # part, then divide the remainder between cv and test.
    desc_train, desc_temp, tran_train, tran_temp = train_test_split(desc_lines, tran_lines, test_size=1-train_percent, random_state=42)
    test_ratio = test_percent / (cv_percent + test_percent)
    desc_cv, desc_test, tran_cv, tran_test = train_test_split(desc_temp, tran_temp, test_size=test_ratio, random_state=42)

    # Create the destination folders and files
    subsets = ['sum_train_300', 'sum_cv_300', 'sum_devtest_300']
    data = [(desc_train, tran_train), (desc_cv, tran_cv), (desc_test, tran_test)]
    for subset, (desc_data, tran_data) in zip(subsets, data):
        subset_folder = os.path.join(dataset_path, subset)
        os.makedirs(subset_folder, exist_ok=True)
        with open(os.path.join(subset_folder, 'desc.tok.txt'), 'w') as file:
            file.writelines(desc_data)
        with open(os.path.join(subset_folder, 'tran.tok.txt'), 'w') as file:
            file.writelines(tran_data)

# Run the 70/15/15 train/cv/test split on the merged dataset folder
dataset_path = "/home/alexandertchk/VSCode/multimodal/dataset"
split_data(
    dataset_path,
    train_percent=0.70,
    cv_percent=0.15,
    test_percent=0.15,
)

In [22]:
import os
# Write the video ids of each split to an id.txt file next to its text files
folders = ['sum_train_300', 'sum_devtest_300', 'sum_cv_300']
base_path = '/home/alexandertchk/VSCode/multimodal/dataset'

for folder in folders:
    input_file = os.path.join(base_path, folder, 'desc.tok.txt')
    output_file = os.path.join(base_path, folder, 'id.txt')

    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        for line in f_in:
            # The video id is the first whitespace-separated field. Indexing
            # the split result (instead of unpacking it into two names) keeps
            # id-only lines from raising ValueError; blank lines are skipped.
            fields = line.split(maxsplit=1)
            if not fields:
                continue
            f_out.write(fields[0] + '\n')

# Copy audio and video files from the audio concat and video action feature folders
import os
import shutil

# Source folders holding the per-video feature files
source_action_folder = "/home/alexandertchk/VSCode/multimodal/how2-dataset/video_action_features"
source_audio_folder = "/home/alexandertchk/VSCode/multimodal/how2-dataset/audio_300/concat"

# Destination folders, keyed first by split name and then by modality
dest_folders = dict(
    train=dict(
        action="/home/alexandertchk/VSCode/multimodal/dataset/actions_train_300",
        audio="/home/alexandertchk/VSCode/multimodal/dataset/audio_train_300",
    ),
    cv=dict(
        action="/home/alexandertchk/VSCode/multimodal/dataset/actions_cv_300",
        audio="/home/alexandertchk/VSCode/multimodal/dataset/audio_cv_300",
    ),
    devtest=dict(
        action="/home/alexandertchk/VSCode/multimodal/dataset/actions_devtest_300",
        audio="/home/alexandertchk/VSCode/multimodal/dataset/audio_devtest_300",
    ),
)

# Read video_ids from txt files
def read_video_ids(file_path):
    """Return the video ids listed one per line in ``file_path``.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that a trailing newline or an empty line never produces an
    empty id.
    """
    with open(file_path) as f:
        stripped_lines = (line.strip() for line in f)
        return [video_id for video_id in stripped_lines if video_id]

# Function to copy files from source to destination
def copy_files(video_ids, source_folder, dest_folder):
    """Copy ``<video_id>.npy`` from ``source_folder`` into ``dest_folder``.

    Only ``source_folder`` is read. Looking in both the action and the audio
    folder regardless of this argument would copy two files with the same
    name into one destination, the second overwriting the first.

    Args:
        video_ids: Iterable of video ids (file names without ``.npy``).
        source_folder: Folder to copy the feature files from.
        dest_folder: Folder to copy into; created if it does not exist.

    Ids without a matching file in ``source_folder`` are skipped silently.
    """
    os.makedirs(dest_folder, exist_ok=True)
    for video_id in video_ids:
        feature_file = os.path.join(source_folder, f"{video_id}.npy")
        if os.path.exists(feature_file):
            shutil.copy(feature_file, dest_folder)

# Read the video ids of each split and copy the corresponding feature files
for split, dest_paths in dest_folders.items():
    # The id lists are written to <dataset>/sum_<split>_300/id.txt by the
    # id-extraction step of this notebook; no step creates a
    # <dataset>/<split>_id.txt file, so read from the per-split folder.
    # (The variable name is kept from the original; it holds the id file of
    # whichever split is being processed, not only the training one.)
    train_file_path = f"/home/alexandertchk/VSCode/multimodal/dataset/sum_{split}_300/id.txt"
    video_ids = read_video_ids(train_file_path)
    copy_files(video_ids, source_action_folder, dest_paths["action"])
    copy_files(video_ids, source_audio_folder, dest_paths["audio"])


In [21]:
import os

def create_txt_files_audio_video(root_folder, directories=('actions_cv_300',)):
    """Write a ``<directory>.txt`` index of the .npy files in each directory.

    For every name in ``directories`` the file
    ``<root_folder>/<directory>/<directory>.txt`` is (re)written with one
    line per ``.npy`` file of that directory, holding the file's path
    (``root_folder`` joined with the directory and file name). Paths are
    written in sorted file-name order so the output is reproducible.

    Args:
        root_folder: Folder that contains the feature directories.
        directories: Names of the directories to index. Defaults to the
            single directory this function originally handled.
    """
    for directory in directories:
        directory_path = os.path.join(root_folder, directory)
        # Get the list of .npy files in the directory (the index file itself
        # ends in .txt and is therefore never listed)
        npy_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.npy'))

        txt_file_path = os.path.join(directory_path, directory + '.txt')
        with open(txt_file_path, 'w') as txt_file:
            for npy_file in npy_files:
                # Write the actual path value, not the variable's name
                txt_file.write(os.path.join(directory_path, npy_file) + '\n')

# Build the .npy index file(s) below the dataset root
root_folder = "/home/alexandertchk/VSCode/multimodal/dataset"
create_txt_files_audio_video(root_folder=root_folder)


['sum_train_300', 'sum_devtest_300', 'sum_cv_300', 'vocab']


In [20]:
import nltk
from collections import Counter

# If you have not already downloaded the NLTK tokenization package, uncomment the following line to do so:
#nltk.download('punkt')

def generate_vocab_file(input_file_path, output_file_path):
    """Count the tokens of a text file and write them as ``token count`` lines.

    Each input line is split at its first space: the part before it is the
    video id and is not counted; the remainder is tokenized with
    ``nltk.word_tokenize``. Lines that contain no text after the id (id-only
    or blank lines) are skipped instead of raising ``IndexError``.

    Args:
        input_file_path: Text file with one ``<video_id> <text>`` record per line.
        output_file_path: File to (over)write with one ``<token> <count>``
            line per distinct token, in first-seen order.
    """
    token_counts = Counter()

    # Stream the file line by line; the whole file never has to be in memory
    with open(input_file_path, 'r') as file:
        for line in file:
            # The first token of each line is the video id, which we don't
            # want to include in the vocabulary
            fields = line.split(' ', 1)
            if len(fields) < 2:
                continue
            token_counts.update(nltk.word_tokenize(fields[1]))

    # Write the tokens and their counts to the output file
    with open(output_file_path, 'w') as file:
        for token, count in token_counts.items():
            file.write(f'{token} {count}\n')

# Build the transcript vocabulary from the original how2 training split.
# Both paths were missing a separator ('how2-dataseten_sum',
# 'how2-datasetdataset'); they now follow the folder layout used by the
# other cells of this notebook.
input_file_path = '/home/alexandertchk/VSCode/multimodal/how2-dataset/en_sum/text/sum_train/tran.tok.txt'
output_file_path = '/home/alexandertchk/VSCode/multimodal/dataset/tran.tok.vocab.txt'
generate_vocab_file(input_file_path, output_file_path)


In [26]:
import os
import glob

# Define the list of folders
folders = [
    'audio_devtest_300',
    'actions_train_300',
    'audio_train_300',
    'actions_devtest_300',
    'audio_cv_300',
    'actions_cv_300'
]

# Define the base directory path
base_directory = '/home/alexandertchk/VSCode/multimodal/dataset'

# Loop through each folder
for folder_name in folders:
    # Create the full path to the current folder
    folder_path = os.path.join(base_directory, folder_name)

    # Create the text file name by appending '.txt' to the folder name
    txt_file_name = folder_name + '.txt'

    # List the files of the current folder. The list file lives in the same
    # folder, so leave it out: otherwise every re-run would list the file
    # inside itself. Sort because glob returns paths in arbitrary order.
    files_list = sorted(
        path
        for path in glob.glob(os.path.join(folder_path, '*'))
        if os.path.basename(path) != txt_file_name
    )

    # Create and write the file list to the text file
    with open(os.path.join(folder_path, txt_file_name), 'w') as txt_file:
        txt_file.write('\n'.join(files_list))

    print(f"Created '{txt_file_name}' in folder '{folder_name}'")

Created 'audio_devtest_300.txt' in folder 'audio_devtest_300'
Created 'actions_train_300.txt' in folder 'actions_train_300'
Created 'audio_train_300.txt' in folder 'audio_train_300'
Created 'actions_devtest_300.txt' in folder 'actions_devtest_300'
Created 'audio_cv_300.txt' in folder 'audio_cv_300'
Created 'actions_cv_300.txt' in folder 'actions_cv_300'


In [3]:
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/alexandertchk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
import nltk
import json
from collections import Counter

# If you have not already downloaded the NLTK tokenization package, uncomment the following line to do so:
#nltk.download('punkt')

def generate_vocab_file(input_file_path, output_file_path):
    """Count the tokens of a text file and write the counts as a JSON object.

    Each input line is split at its first space: the part before it is the
    video id and is not counted; the remainder is tokenized with
    ``nltk.word_tokenize``. Lines that contain no text after the id (id-only
    or blank lines) are skipped instead of raising ``IndexError``.

    NOTE: an earlier cell of this notebook defines a function with the same
    name that writes plain ``token count`` lines; running this cell replaces
    it with this JSON-writing version.

    Args:
        input_file_path: Text file with one ``<video_id> <text>`` record per line.
        output_file_path: File to (over)write with a JSON object mapping each
            token to its count.
    """
    token_counts = Counter()

    # Stream the file line by line; the whole file never has to be in memory
    with open(input_file_path, 'r') as file:
        for line in file:
            # The first token of each line is the video id, which we don't
            # want to include in the vocabulary
            fields = line.split(' ', 1)
            if len(fields) < 2:
                continue
            token_counts.update(nltk.word_tokenize(fields[1]))

    # Write the tokens and their counts to the output file in JSON format
    with open(output_file_path, 'w') as file:
        json.dump(dict(token_counts), file)

# Build the description vocabulary (JSON) from the original how2 training split
input_file_path = '/home/alexandertchk/VSCode/multimodal/how2-dataset/en_sum/text/sum_train/desc.tok.txt'
output_file_path = '/home/alexandertchk/VSCode/multimodal/dataset/desc.tok.vocab.json'
generate_vocab_file(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)