In [2]:
import os
import glob
import numpy as np
import pandas as pd
import multiprocessing
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
os.chdir("/Users/huongn/Desktop/open_whisper")

In [3]:
metadata_files = glob.glob("data/metadata/*.parquet")
metadata_files

['data/metadata/captions-0001.parquet',
 'data/metadata/captions-0008.parquet',
 'data/metadata/captions-0009.parquet',
 'data/metadata/captions-0010.parquet',
 'data/metadata/captions-0000.parquet',
 'data/metadata/captions-0002.parquet',
 'data/metadata/captions-0003.parquet',
 'data/metadata/captions-0006.parquet',
 'data/metadata/captions-0007.parquet',
 'data/metadata/captions-0005.parquet',
 'data/metadata/captions-0004.parquet']

## UDFs

In [4]:
def standardize_dialects(s):
    words = s.split(",")
    transformed_words = [word.split("-")[0] if "-" in word else word for word in words]
    return ",".join(transformed_words)

In [5]:
def standardize_dialects_2(s):
    words = s.split(",")
    transformed_words = [word.split("-")[0] if "-" in word else word for word in words]
    return transformed_words

In [6]:
def get_percent_empty(df):
    return df.apply(lambda col: (col == "").mean() * 100)

In [7]:
def clean_df(df):
    df = df.copy()
    # fill in missing values with empty string
    df["automatic_caption_orig_language"] = df[
        "automatic_caption_orig_language"
    ].fillna("")
    df["language"] = df["language"].fillna("")

    # maybe don't need code above because this takes care of en case as well
    df["manual_caption_languages"] = df["manual_caption_languages"].apply(
        standardize_dialects
    )
    df["automatic_caption_orig_language"] = df["automatic_caption_orig_language"].apply(
        standardize_dialects
    )
    df["language"] = df["language"].apply(standardize_dialects)

    # check if all "en" types have been changed to strictly "en"
    if "-" in df["manual_caption_languages"].unique():
        print("dialects still exists in manual_caption_languages")
    else:
        print(
            "all dialects have been changed to strictly family language in manual_caption_languages"
        )

    if "-" in df["automatic_caption_orig_language"].unique():
        print("dialects still exists in automatic_caption_orig_language")
    else:
        print(
            "all dialects have been changed to strictly family language in automatic_caption_orig_language"
        )

    if "-" in df["language"].unique():
        print("dialects still exists in language")
    else:
        print("all dialects have been changed to strictly family language in language")

    return df

In [8]:
def check_in_col(row, ref_col, tgt_col):
    words = [word.strip() for word in row[tgt_col].split(",")]
    # Check if the second column's value is in the list of words
    return row[ref_col] in words

In [9]:
def get_duration_ml_t(df):
    condition = (
        df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    )
    not_strictly_en_df = df[~condition]
    condition_4 = not_strictly_en_df["manual_caption_languages"].str.contains("en")
    no_en_df = not_strictly_en_df[~condition_4]
    condition_5 = (
        no_en_df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: len(lst) == 1)
    )
    one_man_lang_df = no_en_df[condition_5]
    condition_6 = (
        (
            one_man_lang_df["manual_caption_languages"]
            == one_man_lang_df["automatic_caption_orig_language"]
        )
        | (one_man_lang_df["automatic_caption_orig_language"] == "")
        | (one_man_lang_df["automatic_caption_orig_language"] == "en")
    )
    condition_7 = (
        no_en_df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: len(lst) > 1)
    )
    many_man_lang_df = no_en_df[condition_7]
    condition_8 = many_man_lang_df.apply(
        check_in_col,
        axis=1,
        args=("automatic_caption_orig_language", "manual_caption_languages"),
    )

    temp_2 = many_man_lang_df[condition_8]
    # getting data complement of condition 8
    temp_3 = many_man_lang_df[~condition_8]
    # getting data where automatic_caption_orig_language == ""
    condition_9 = temp_3["automatic_caption_orig_language"] == ""
    temp_4 = temp_3[condition_9][
        temp_3[condition_9].apply(
            check_in_col, axis=1, args=("language", "manual_caption_languages")
        )
    ]
    condition_10 = temp_3["automatic_caption_orig_language"] != ""
    temp_5 = temp_3[condition_10][
        temp_3[condition_10].apply(
            check_in_col, axis=1, args=("language", "manual_caption_languages")
        )
    ]
    condition_11 = (
        condition_4
        & (not_strictly_en_df["automatic_caption_orig_language"] != "en")
        & (not_strictly_en_df["automatic_caption_orig_language"] != "")
    )
    condition_12 = not_strictly_en_df[condition_11].apply(
        check_in_col,
        axis=1,
        args=("automatic_caption_orig_language", "manual_caption_languages"),
    )
    temp_6 = not_strictly_en_df[condition_11][condition_12]
    condition_13 = (
        (not_strictly_en_df["manual_caption_languages"].str.contains("en"))
        & ("" == not_strictly_en_df["automatic_caption_orig_language"])
        & (not_strictly_en_df["language"] != "en")
    )
    condition_14 = (
        (not_strictly_en_df["manual_caption_languages"].str.contains("en"))
        & (not_strictly_en_df["automatic_caption_orig_language"] == "en")
        & (not_strictly_en_df["language"] != "en")
        & (not_strictly_en_df["language"] != "")
    )
    condition_15 = (
        (not_strictly_en_df["manual_caption_languages"].str.contains("en"))
        & ("" == not_strictly_en_df["automatic_caption_orig_language"])
        & (not_strictly_en_df["language"] != "en")
        & (not_strictly_en_df["language"] != "")
    )
    condition_16 = not_strictly_en_df[condition_15].apply(
        check_in_col,
        axis=1,
        args=("language", "manual_caption_languages"),
    )
    temp_7 = not_strictly_en_df[condition_15][condition_16]
    condition_17 = not_strictly_en_df[condition_14].apply(
        check_in_col,
        axis=1,
        args=("language", "manual_caption_languages"),
    )
    temp_8 = not_strictly_en_df[condition_14][condition_17]

    ml_dur = (
        one_man_lang_df[condition_6]["duration"].sum() / (60 * 60)
        + temp_2["duration"].sum() / (60 * 60)
        + temp_4["duration"].sum() / (60 * 60)
        + temp_5["duration"].sum() / (60 * 60)
        + temp_6["duration"].sum() / (60 * 60)
        + temp_7["duration"].sum() / (60 * 60)
        + temp_8["duration"].sum() / (60 * 60)
    )
    t_dur = (
        (not_strictly_en_df[condition_11]["duration"].sum() / (60 * 60))
        + (not_strictly_en_df[condition_13]["duration"].sum() / (60 * 60))
        + (not_strictly_en_df[condition_14]["duration"].sum() / (60 * 60))
    )

    return ml_dur, t_dur

In [10]:
def get_duration_en_only(df):
    # getting data that's strictly en in manual_caption_languages
    condition = (
        df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    )
    manual_en_df = df[condition]

    temp = df[~condition]

    # getting data where en in manual_caption_languages (not exactly "en") (temp) and en == automatic_caption_orig_language
    condition_2 = (
        (temp["manual_caption_languages"].str.contains("en"))
        & (temp["automatic_caption_orig_language"] == "en")
        & ((temp["language"] == "en") | (temp["language"] == ""))
    )

    # getting data where en in manual_caption_languages (not exactly "en") (temp) and "" == automatic_caption_orig_language but en == language
    condition_3 = (
        (temp["manual_caption_languages"].str.contains("en"))
        & ("" == temp["automatic_caption_orig_language"])
        & (temp["language"] == "en")
    )

    # returning total duration
    return (
        (manual_en_df["duration"].sum() / (60 * 60))
        + (temp[condition_2]["duration"].sum() / (60 * 60))
        + (temp[condition_3]["duration"].sum() / (60 * 60))
    )

In [11]:
def view_lang_cols(df, num_rows=5):
    return df[["id", "title", "manual_caption_languages", "automatic_caption_orig_language", "language"]].head(num_rows)

In [12]:
def view_lang_cols_2(df, num_rows=5):
    return df[["id", "title", "manual_caption_languages", "automatic_caption_orig_language", "language", "man_cap_langs_2", "filtered_langs"]].head(num_rows)

In [13]:
def view_lang_cols_4(df, num_rows=5):
    return df[["id", "title", "manual_caption_languages", "automatic_caption_orig_language", "language", "man_cap_langs_2", "filtered_langs", "download_lang"]].head(num_rows)

In [14]:
def get_num_rows(df):
    return df.shape[0]

## Getting Video IDs

### Loading in data from parquet files

In [15]:
df_list = []
for path in metadata_files:
    df = pd.read_parquet(path)
    df_list.append(df)
main_df = pd.concat(df_list)
main_df.reset_index(drop=True, inplace=True)

In [17]:
len(main_df)

60102381

In [18]:
main_df_dedup = main_df.drop_duplicates(subset=["id"], keep="first")
len(main_df_dedup)

60098151

In [14]:
main_df.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license'],
      dtype='object')

In [17]:
# downloaded already
with open("logs/data/download/440K/440K_download_ids.txt", "r") as f:
    id_downloaded = [line.strip().split('\t')[0] for line in f]
id_downloaded

['05xOF5ubabQ',
 'GjTPw_aQmgg',
 'T9_3s_yybq8',
 'nKZSV-h0SGA',
 '3993tVeKd-w',
 'CI6V07nPBzk',
 'colODoThH3Q',
 'w9B7VcQDkaU',
 'wEozeRtGOss',
 '9-c1cjfcMWA',
 'slKm_e5BlM8',
 'DlShMKZnO3M',
 'Trwb7V7DUpw',
 'klwH8qyun8E',
 'z3F53ixYOhg',
 'SpU-wjIq-ks',
 '2LMwCAOc3Z8',
 'SAuyecc22os',
 'XtKeJ6KGabo',
 'v82FvnlLrJI',
 'BjrJJHsRGzM',
 '4T7CX48QfNU',
 'Ta77u4_QFE0',
 'sJTxmp8Sgxo',
 '3Y0IOS9MjrQ',
 'YOJ5tb7djtE',
 'rHJl01EJLxI',
 '9QreGvzk0EE',
 'uipO3h824ls',
 'A_62KQA43N8',
 'wlQyW9653_Y',
 '8kFbaTuCx7o',
 '_86o9kvgGZ4',
 'wXamXakPR7g',
 'RC75WARX6VQ',
 'D81VRIRbZN8',
 'K7itE9u35YA',
 '2INSSypNeSI',
 'IZ7KXOG-pa4',
 'VnUz01iG340',
 'qbdksuS94w4',
 '_H8e8A_Z6a0',
 'xjeluIUSk3c',
 '8xlS5loCooI',
 'Tu3C6P3hZC8',
 'Y475Yx-aLDo',
 'OvrulSLqjGg',
 '0w8VX8fe1uw',
 'Rluzhtv24Cg',
 'cKKq3HFW5zg',
 '183f15W-fy4',
 'bNNC9VLUi4A',
 'slZktP6GB70',
 'f94-KGDotIs',
 'hOgmWpTFkEk',
 '4x6cR9IpPLw',
 'Cu_fzUDAXPk',
 'At-90RplBsw',
 'UPZoZyGZI8M',
 '9Lhgx3ZqPTU',
 'jpNSYkltC0w',
 '3tXG-TloXfY',
 'BFk-si

In [18]:
len(id_downloaded)

2448500

In [20]:
# downloaded already
with open("logs/data/download/440K/440K_download_ids.txt", "r") as f:
    duration = sum([int(line.strip().split('\t')[-1]) for line in f]) / (60 * 60)
duration

438202.3872222222

### Duplicates in df from joining parquet files

In [63]:
len(main_df)

60102381

In [58]:
len(set(main_df["id"]))

60098151

In [60]:
len(main_df["id"]) - len(set(main_df["id"]))

4230

#### Confirmation that there are duplicates when joining all parquet files

In [57]:
from collections import defaultdict
d = defaultdict(int)
for id in main_df["id"].tolist():
    d[id] += 1

In [59]:
for id, count in d.items():
    if count > 1:
        print(id)

fyvEDS-PQO8
QIySuiQ3_Yw
-t-J098gF10
QAgrHKTqkIE
5F5oJyZv0Dc
LLIwgjSQnj0
X3-I_5gdZ00
2We5HvMPLFY
EcqahSwiJRo
xqS5zApPnKw
V7gZj9C9OUk
liPcdDFouP0
lqNAbx1nXkc
xFAckAQoIFs
OdJyNoOeYHo
XdC3qMWmap8
xUC88-IuEkM
kNgP0TF-ENQ
Rz8zo1iPr0k
Gm-zwbxzzOQ
LK4r2ePVb1U
wseZE-FzyQw
WG7XUDDBnpM
L6Hi4HSVBDQ
Woa4MU04aVk
slUYJZ1Fm3E
_MjflJq6f7w
I2ADZMmP0Uc
CpxeV9XMvdg
gfv9EjSMQ60
FKvAfEpijPE
ScRAOOK-Vlc
ppAk683jX_0
BnQ_T-rjeJA
EaYAwNY5OHw
8mGu6xP74Hk
dtrfTUw6vKc
XfdnPE7bQ_A
bzWAKI7lFp4
D3Fn3914hOI
s6vH0tYrh9o
L7joInxvxkw
YKXT1oMhCWk
ML_d9o0eQ7g
6lhiTVDJzio
tSuKLGf1jXE
e_MzXz3D8qQ
X80Epyyy450
r3jSMTmdhP8
ew0w8-StOb4
nkesANaBEBc
sD4tmb_VYUU
XKIC5ZAcumE
C42IcopodNE
yHwFqC8k4bE
KH8HYrX-JvY
GvIyZ1kVoTw
3-4RV5d9w4U
7Nt_W_TM1yw
YELcmcSY1eA
SlNh4aRMRT8
_0j9JoWm9Uo
877UTIzw_Oo
I51gsm0FHTg
mdXmMblgj9Y
ojnM5FfSHNY
gmpdNWFCLwk
LIk59pRMU58
F_LESLVzU_M
MlZvjMePLaM
cVnZuq1XBa0
O9npRJ6OVvM
PCyiFADtKHY
xLdrfWIs84Y
KlkqaZuqkoo
5wy9JRzJ7aM
ohc5qPk3y6w
wE6-OYTH7ks
C5SaOTC0tIM
Qd1HedDfZRU
ucg0U4qZ1UU
XKhHI3wMcdc
LRjt2rDtQok
t7li

In [61]:
main_df[main_df["id"] == "fyvEDS-PQO8"]

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
16753897,fyvEDS-PQO8,404 (FILE NOT FOUND),368,19.0,People & Blogs,1.0,2.0,Nubz,UCVlYKhbTiO8TT6fRD-aY6vw,53.0,...,125.77,125.77,vp09.00.50.08,48000.0,2.0,,en-GB,en,,
18029445,fyvEDS-PQO8,404 (FILE NOT FOUND),368,19.0,People & Blogs,1.0,2.0,Nubz,UCVlYKhbTiO8TT6fRD-aY6vw,53.0,...,125.77,125.77,vp09.00.50.08,48000.0,2.0,,en-GB,en,,


### Getting information on remaining to download (after getting 440K hours/2.4M audio-transcript pairs)

In [21]:
unique_count_main_df = len(set(main_df["id"]))
unique_count_downloaded = len(set(id_downloaded))
print(unique_count_main_df, unique_count_downloaded)
print(unique_count_main_df - unique_count_downloaded)

60098151 2448500
57649651


In [14]:
main_df_dedup = main_df.drop_duplicates(subset=["id"], keep="first")
len(main_df_dedup)

60098151

In [23]:
all_ids = set(main_df_dedup['id'])
downloaded_ids = set(id_downloaded)

not_downloaded_ids = all_ids - downloaded_ids

not_downloaded_df = main_df_dedup[main_df_dedup['id'].isin(not_downloaded_ids)]

In [24]:
not_downloaded_df.reset_index(drop=True, inplace=True)

In [25]:
not_downloaded_df.head()

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
0,LwJv3vSd8no,Free Art Wednesday!,372,378.0,People & Blogs,175.0,70.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,120.177,120.177,vp9,48000.0,2.0,,en,en,,
1,XXjYyBCNJEY,FREE Art Wednesday,293,194.0,People & Blogs,69.0,48.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,129.7,129.7,vp9,48000.0,2.0,,en,en,,
2,OREjUtUT2Sc,New Intro | I keep losing them,11,39.0,People & Blogs,3.0,9.0,Sade Royalty,UCE_3_c7b769PCiS1iqc1APw,363.0,...,158.206,158.206,vp9,48000.0,2.0,,en,en,,
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),20,29.0,People & Blogs,,5.0,Overk,UCOxiCFiOYLUgPCtdcDZiK5A,12.0,...,129.293,129.293,vp9,48000.0,2.0,,sv,sv,,
4,2XitiLKk6rM,Анонимки в тик ток,102,578.0,People & Blogs,3.0,17.0,Overk,UCOxiCFiOYLUgPCtdcDZiK5A,12.0,...,121.21,121.21,vp09.00.21.08.01.05.01.06.00,48000.0,2.0,,sv,ru,,


In [26]:
not_downloaded_df.head()
print(len(not_downloaded_df))

57649651


In [27]:
not_downloaded_df.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license'],
      dtype='object')

#### English

In [28]:
clean_not_downloaded_df = clean_df(not_downloaded_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


In [29]:
clean_not_downloaded_df.head()

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
0,LwJv3vSd8no,Free Art Wednesday!,372,378.0,People & Blogs,175.0,70.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,120.177,120.177,vp9,48000.0,2.0,,en,en,,
1,XXjYyBCNJEY,FREE Art Wednesday,293,194.0,People & Blogs,69.0,48.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,129.7,129.7,vp9,48000.0,2.0,,en,en,,
2,OREjUtUT2Sc,New Intro | I keep losing them,11,39.0,People & Blogs,3.0,9.0,Sade Royalty,UCE_3_c7b769PCiS1iqc1APw,363.0,...,158.206,158.206,vp9,48000.0,2.0,,en,en,,
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),20,29.0,People & Blogs,,5.0,Overk,UCOxiCFiOYLUgPCtdcDZiK5A,12.0,...,129.293,129.293,vp9,48000.0,2.0,,sv,sv,,
4,2XitiLKk6rM,Анонимки в тик ток,102,578.0,People & Blogs,3.0,17.0,Overk,UCOxiCFiOYLUgPCtdcDZiK5A,12.0,...,121.21,121.21,vp09.00.21.08.01.05.01.06.00,48000.0,2.0,,sv,ru,,


In [30]:
view_lang_cols(clean_not_downloaded_df)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,en,en,
2,OREjUtUT2Sc,New Intro | I keep losing them,en,en,
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),sv,sv,
4,2XitiLKk6rM,Анонимки в тик ток,sv,ru,ru


In [31]:
condition = (
        clean_not_downloaded_df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    )
only_en_1 = clean_not_downloaded_df[condition]

In [32]:
only_en_1.shape[0]

25507801

In [33]:
not_only_en_1_id = set(clean_not_downloaded_df["id"]) - set(only_en_1["id"])
len(not_only_en_1_id)

32141850

In [34]:
not_only_en_1 = clean_not_downloaded_df[clean_not_downloaded_df["id"].isin(not_only_en_1_id)]

In [35]:
view_lang_cols(not_only_en_1)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),sv,sv,
4,2XitiLKk6rM,Анонимки в тик ток,sv,ru,ru
5,IPxXLQpOJRU,Marisa Marisa,it,it,
6,ug5TkquQEHo,Marisa marisa,it,it,
7,Ar-XwEnnFPY,Formation des Classe D | Serveur UPRP,fr,fr,fr


In [36]:
get_num_rows(not_only_en_1)

32141850

In [88]:
view_lang_cols(not_only_en_1[not_only_en_1["language"] == "en"], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
75,qVlXu1zanWE,【游玩克拉普】 平常小方块怎么测试声音的,zh,en,en
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
181,AxmibPoLlvo,[𝐏𝐋𝐀𝐘𝐋𝐈𝐒𝐓] 일렁이는 여름 햇살,ko,en,en
280,wboahvmJHyU,Resident Evil Revelations 2 - Trailer Oficial ...,pt,en,en
281,p_GO0YcfN4M,Resident Evil Revelations 2 - Trailer Oficial ...,pt,en,en


In [90]:
get_num_rows(not_only_en_1[not_only_en_1["language"] == "en"])

4070589

In [92]:
view_lang_cols(not_only_en_1[not_only_en_1["manual_caption_languages"].str.contains("en")], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
11,hWQZq0wHIwQ,rimne - First Color / MV,"en,ja",unknown,
19,4w-OpSyV5dw,Sangat Sedap | Semua Pasti Menyukainya!,"en,live_chat,ms",unknown,
21,JiHuFJ9RoHM,DEI UMA PASSADA NO CANADÁ ANTES DE IR PARA O B...,"en,pt",pt,pt
22,YhPHqdGcCQ4,"MINHA COLEÇÃO INTEIRA DA SUPERESTRELA DA NBA, ...","en,pt",pt,pt
23,zwPpeXqHsZs,JOGUEI O NBA 2K23 (PS4) CURRENT GEN PELA PRIME...,"en,pt",unknown,
24,VdlUZgEL1Qs,JOGUEI BASQUETE COM OS GRINGOS EM UMA QUADRA A...,"en,pt",pt,pt
56,9wY84F2-aUQ,DOĞADA HAYAT 44.BÖLÜM | LİFE İN NATURE EPİSOD...,"en,ko,ur",tr,tr
90,so8TfVlMuVs,"COMING SEPT 2021 - NEW MUSIC FROM ""CUSTOM""! - ...","en,live_chat",en,
92,OFZyHn-GP48,Control Z - bad guy,"en,es,pt",unknown,
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en


In [37]:
only_en_2 = not_only_en_1[not_only_en_1["manual_caption_languages"].str.contains("en")]

In [38]:
only_en_2.shape[0]

9991767

In [95]:
view_lang_cols(only_en_2[only_en_2["language"] == "en"], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
547,4khB0ZyEhVM,DFAT-CSIRO partnership,"en,vi",en,en
665,stIDKDUA-Xs,Panty Bleaching Gone Viral,"en,live_chat",en,en
696,1G1Fv0oGcpM,Adding subtitles using Chromebook only,"de,en",en,en
750,LeXguFYox0o,"ENTREVISTA | Neil Newbon, dublador de Nicholai...","en,pt",en,en


In [96]:
get_num_rows(only_en_2[only_en_2["language"] == "en"])

1933919

In [98]:
(only_en_2[only_en_2["language"] == "en"]["automatic_caption_orig_language"] == "").sum() / len(only_en_2[only_en_2["language"] == "en"])

0.0011768848643609168

In [97]:
view_lang_cols(only_en_2[only_en_2["automatic_caption_orig_language"] == "en"], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
90,so8TfVlMuVs,"COMING SEPT 2021 - NEW MUSIC FROM ""CUSTOM""! - ...","en,live_chat",en,
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
433,tFtImC8E0wY,My Heart And Seoul 😍 | 15 Days Around Seoul - ...,"en,live_chat",en,
547,4khB0ZyEhVM,DFAT-CSIRO partnership,"en,vi",en,en
665,stIDKDUA-Xs,Panty Bleaching Gone Viral,"en,live_chat",en,en


In [100]:
get_num_rows(only_en_2[only_en_2["automatic_caption_orig_language"] == "en"])

1980299

In [102]:
view_lang_cols(only_en_2[(only_en_2["automatic_caption_orig_language"] == "en") & (only_en_2["language"] == "en")], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
547,4khB0ZyEhVM,DFAT-CSIRO partnership,"en,vi",en,en
665,stIDKDUA-Xs,Panty Bleaching Gone Viral,"en,live_chat",en,en
696,1G1Fv0oGcpM,Adding subtitles using Chromebook only,"de,en",en,en
750,LeXguFYox0o,"ENTREVISTA | Neil Newbon, dublador de Nicholai...","en,pt",en,en


In [103]:
get_num_rows(only_en_2[(only_en_2["automatic_caption_orig_language"] == "en") & (only_en_2["language"] == "en")])

1930688

In [39]:
lang_en = only_en_2[only_en_2["language"] == "en"]
auto_lang_en = only_en_2[only_en_2["automatic_caption_orig_language"] == "en"]
lang_en_auto_lang_en = only_en_2[(only_en_2["automatic_caption_orig_language"] == "en") & (only_en_2["language"] == "en")]

In [40]:
lang_en.shape[0], auto_lang_en.shape[0], lang_en_auto_lang_en.shape[0]

(1933919, 1980299, 1930688)

In [107]:
temp = set(lang_en["id"]) - set(lang_en_auto_lang_en["id"])
temp_df_2 = lang_en[lang_en["id"].isin(temp)]
view_lang_cols(temp_df_2, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
30378,VHOvLOPVxX8,noc18-me62-Lec 02B-Instrument -II,"bn,en,en,gu,hi,kn,ml,mr,ta,te",,en
38783,M56GMKXRE7c,🥥❤️ Sadece 3 Malzemeyle Yapabileceğiniz En Güz...,"ar,az,bg,bn,bs,cs,da,de,el,en,es,fa,fi,fil,fr,...",,en
59528,34iDTeCNTz4,Simulating color vision deficiencies in the Bl...,"en,nl",nl,en
123890,Wn_1Egqrq9o,Crazy Speed Eating Challenge w/ Rosanna | Copy...,"en,es",,en
123891,RZsb2O2ndCk,Transforming 3 YouTubers into Contortionists |...,"en,es",,en
219529,Np8GMBsUiv8,L'Italien | Policier | Film complet en français,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",fr,en
219532,J-t9WP_Gl_I,"N'embrasse pas la mariée | Comédie, Action | F...","ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",,en
219533,CWKZetMHzO8,Ultime Combat (Action) Film complet en français,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",,en
220580,V9JSRHVuLBE,Petite Princesse | Classique | Film complet en...,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",unknown,en
220584,dZPEVh0PWAU,Last Apocalypse | Action | Film complet en fra...,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",,en


In [109]:
temp_1 = set(auto_lang_en["id"]) - set(lang_en_auto_lang_en["id"])
temp_df_3 = auto_lang_en[auto_lang_en["id"].isin(temp_1)]
view_lang_cols(temp_df_3, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
90,so8TfVlMuVs,"COMING SEPT 2021 - NEW MUSIC FROM ""CUSTOM""! - ...","en,live_chat",en,
433,tFtImC8E0wY,My Heart And Seoul 😍 | 15 Days Around Seoul - ...,"en,live_chat",en,
865,-2auYFne_l0,Adventures of Zalke West : Tenerife Part II - ...,"en,live_chat",en,
866,dM2FwfPBC2A,Adventures of Zalke West: Tenerife Part I - Teide,"en,live_chat",en,
1511,4cz74eJittA,20 Days in CA Exam | Must Cover these things! ...,"en,live_chat",en,
8014,C3oJo-J3lTo,Setting Up Our Christmas Tree 2021 ft. Rayan &...,"en,live_chat",en,
8015,cMOHF0BWKTw,400 to 1000 Rs Kurti Shopping | Meesho Haul | ...,"en,live_chat",en,
8016,VDWdvB8JYVs,Fun Baking Cookies with Rayan 👩‍🍳😎🧒 | Cooking ...,"en,live_chat",en,
8017,tNpSQure4iA,5000 Rupees For A Lunch 😵 | Most Expensive Res...,"en,live_chat",en,
8018,HPY1tat23pI,Masala Chowk Street Food Review 😋♥️ | Jaipur S...,"en,live_chat",en,


In [111]:
any_lang_en_id = set(lang_en["id"].tolist() + auto_lang_en["id"].tolist() + lang_en_auto_lang_en["id"].tolist())
temp_2 = set(only_en_2["id"]) - any_lang_en_id
temp_df_4 = only_en_2[only_en_2["id"].isin(temp_2)]
view_lang_cols(temp_df_4, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
11,hWQZq0wHIwQ,rimne - First Color / MV,"en,ja",unknown,
19,4w-OpSyV5dw,Sangat Sedap | Semua Pasti Menyukainya!,"en,live_chat,ms",unknown,
21,JiHuFJ9RoHM,DEI UMA PASSADA NO CANADÁ ANTES DE IR PARA O B...,"en,pt",pt,pt
22,YhPHqdGcCQ4,"MINHA COLEÇÃO INTEIRA DA SUPERESTRELA DA NBA, ...","en,pt",pt,pt
23,zwPpeXqHsZs,JOGUEI O NBA 2K23 (PS4) CURRENT GEN PELA PRIME...,"en,pt",unknown,
24,VdlUZgEL1Qs,JOGUEI BASQUETE COM OS GRINGOS EM UMA QUADRA A...,"en,pt",pt,pt
56,9wY84F2-aUQ,DOĞADA HAYAT 44.BÖLÜM | LİFE İN NATURE EPİSOD...,"en,ko,ur",tr,tr
92,OFZyHn-GP48,Control Z - bad guy,"en,es,pt",unknown,
106,SI7nWeZJsaE,अब रोबोट पुलिस करेगी अपराधियों का एनकाउंटर/Now...,"en,hi",hi,hi
107,-ruaKTKQuvM,Ayam Penyet Cheese & Kambing Bakar Madu - Menu...,"en,ms",unknown,


In [41]:
en_ids = set(only_en_1["id"].tolist() + lang_en["id"].tolist() + lang_en_auto_lang_en["id"].tolist())
len(en_ids)

27441720

In [42]:
approx_en_only = list(en_ids) + id_downloaded
len(approx_en_only)

29890220

In [43]:
with open("data/metadata/approx_en_only.txt", "w") as f:
    for video_id in approx_en_only:
        f.write(video_id + "\n")

#### English (w/ established filters)

In [44]:
en_not_downloaded_df = not_downloaded_df[not_downloaded_df["id"].isin(en_ids)]
view_lang_cols(en_not_downloaded_df, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,en,en,
2,OREjUtUT2Sc,New Intro | I keep losing them,en,en,
8,nmRxk7k3kvU,Meditation Music video-2,en-US,en-US,
14,m18UlcXg3Xg,' YOU LADYBUG SONG' |MIRACULOUS LADYBUG & CAT ...,en-GB,en-GB,
20,y1yb_WnCfUA,Video Editor is. Thastefuel?,en-GB,en-GB,
30,sIKsodQRvlc,MYO Hand Letter Art,en-US,en-US,
31,v8uKybpMJdM,Nature Moment | Common Blue Violet Flower,en,en,en
32,IkiTwxRtQUk,MYO Whipped Coffee,en,en,en
33,VvPE3uB7O1A,Nature Moment | The Holly Plant,en,en,en


In [45]:
en_not_downloaded_df[en_not_downloaded_df["manual_caption_languages"].str.contains("en-")].head(20)

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
8,nmRxk7k3kvU,Meditation Music video-2,1295,44.0,Music,1.0,2.0,Meditation music,UCaHJfbRk8gnOZGrJTm5ZI4Q,3.0,...,124.292,124.292,avc1.640028,48000.0,2.0,,en-US,en-US,,
14,m18UlcXg3Xg,' YOU LADYBUG SONG' |MIRACULOUS LADYBUG & CAT ...,157,2433.0,Music,,57.0,miraculous ladybug season 06,UCBgXfZj6cVvu8cz0AX66HZQ,524.0,...,131.965,131.965,vp09.00.31.08.01.05.01.06.00,48000.0,2.0,,en-GB,en-GB,AUSTRALIA,
20,y1yb_WnCfUA,Video Editor is. Thastefuel?,12,25.0,Music,1.0,1.0,FrostyBallz Jr,UC-uKNbvsFlLqBaVI_ysSvig,3.0,...,130.202,130.202,vp9,44100.0,2.0,,en-GB,en-GB,,
30,sIKsodQRvlc,MYO Hand Letter Art,80,24.0,Howto & Style,,1.0,Roanoke County Public Library,UCRDIW4cJil7ls67SLQs9a4Q,217.0,...,138.62,138.62,vp09.00.50.08,48000.0,2.0,,en-US,en-US,,
52,0Yt4JoCW8oU,G Gundam Screenshop Repaint,1233,27.0,Howto & Style,,1.0,TkayArtz,UCHH2EHqaELDc2Lq9Ct5tn5w,34.0,...,139.683,139.683,avc1.640028,48000.0,2.0,,en-US,en,,
54,orWh_OLaRg4,All Goals | Final Copa America | Argentina (1)...,77,17.0,People & Blogs,,0.0,North Direction,UCDWbla2PIao9c6DnQNENkDQ,,...,122.41,122.41,vp09.00.21.08,48000.0,2.0,,"en-JkeT_87f4cc,en-uYU-mmqFLq8",en,,
93,y3o-baXQT0Q,VLOG #1 - Introduction,123,169.0,People & Blogs,7.0,7.0,Rikzaaa,UCkWADUHEgSPEdLlpdsc6pAA,306.0,...,115.805,115.805,avc1.640028,48000.0,2.0,,en-US,en,,
131,zHGwE0ritgk,Let's Play Minecraft: Greatest freakout ever,122,2620.0,Gaming,3.0,14.0,Riftenthecoolish93,UC_eV8_QSpMKm7ouQrs__qew,776.0,...,156.967,156.967,vp09.00.30.08,48000.0,2.0,,en-18WT74-rBWA,en,,
133,1N2Q2-veW8Y,Changing the Roofing Industry by Building Trus...,128,1.0,Howto & Style,,0.0,Schneider Roofing & Remodeling,UCA0DTNFRM-GrtGH_DpH9vYA,4.0,...,115.743,115.743,avc1.640028,48000.0,2.0,,en-US,en,,
134,7CRyjEyWjP4,Schneider Roofing & Remodeling - Show Me St. L...,164,13.0,Howto & Style,,0.0,Schneider Roofing & Remodeling,UCA0DTNFRM-GrtGH_DpH9vYA,4.0,...,103.942,103.942,avc1.640028,48000.0,2.0,,"en-US,en-uYU-mmqFLq8",en,,


In [46]:
en_not_downloaded_df["man_cap_langs_2"] = en_not_downloaded_df["manual_caption_languages"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_not_downloaded_df["man_cap_langs_2"] = en_not_downloaded_df["manual_caption_languages"]


In [47]:
en_not_downloaded_df.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license', 'man_cap_langs_2'],
      dtype='object')

In [48]:
view_lang_cols(en_not_downloaded_df)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,en,en,
2,OREjUtUT2Sc,New Intro | I keep losing them,en,en,
8,nmRxk7k3kvU,Meditation Music video-2,en-US,en-US,
14,m18UlcXg3Xg,' YOU LADYBUG SONG' |MIRACULOUS LADYBUG & CAT ...,en-GB,en-GB,


In [49]:
condition = (
        en_not_downloaded_df["man_cap_langs_2"]
        .apply(standardize_dialects_2)
        .apply(lambda lst: set(lst) == {"en"})
)
temp_df_5 = en_not_downloaded_df[condition]

In [50]:
temp_df_5["manual_caption_languages"].unique()

array(['en', 'en-US', 'en-GB', ..., 'en-CA-UFjxr4OIYz8', 'en-FpQ6dhQAASY',
       'en-LQRErUK_jpw'], dtype=object)

In [51]:
list(temp_df_5["manual_caption_languages"].unique())

['en',
 'en-US',
 'en-GB',
 'en-JkeT_87f4cc,en-uYU-mmqFLq8',
 'en-18WT74-rBWA',
 'en-US,en-uYU-mmqFLq8',
 'en-uYU-mmqFLq8',
 'en-eEY6OEpapPo',
 'en-IN',
 'en-CA',
 'en,en-GB',
 'en-cvfXDfbeED0',
 'en-US-anMIXjicSL4',
 'en-s8TMdDjdYe8',
 'en-IE',
 'en,en-US',
 'en-rX01gsAdUfU',
 'en-nP7-2PuUl7o',
 'en-zL7EZQmMa4Q',
 'en,en-IN',
 'en-imjcPpJerPk',
 'en-zL_pKPa_d08',
 'en-cFSbaKj2OKY',
 'en-anMIXjicSL4',
 'en-UMYBvvsXfqo',
 'en-1oAFzPNiuC0',
 'en-ehkg1hFWq8A',
 'en-vleHeMwJqgw',
 'en-0hllRZe4s5s',
 'en-US-eEY6OEpapPo',
 'en-K3JFmAG0FTI',
 'en,en-zL7EZQmMa4Q',
 'en-GB,en-US',
 'en-PQgNkSoyyBk',
 'en-TW7qhz_uLiI',
 'en-Zixs6prhbeE,en-nPk3V-duwf8',
 'en-US-YfeEIUII1AU',
 'en-y01yHln2iAs',
 'en-uYU-mmqFLq8,en-zL_pKPa_d08',
 'en-y7ZDYfb4tI8',
 'en-GusEpHUv8yI',
 'en-1FAeErPlCAc',
 'en,en-CA',
 'en-fGGqjO4aYg8',
 'en-q_HRRye8iTM',
 'en-nPk3V-duwf8',
 'en-A8Ln5B_8GBo',
 'en,en-UDggLCECq8g',
 'en-lqO-PPJy4Bc',
 'en-qoXxhA4oLYo',
 'en-m9hPnTBjEoU',
 'en-wILyS7txfUA',
 'en-IN,en-US',
 'en-6Effb_CP7

### Getting metadata on audio-transcript pairs that have been downloaded (and trained on)

In [24]:
with open("logs/data/download/trainable_ids.txt", "r") as f:
    trainable_ids = [line.strip() for line in f]

In [25]:
trainable_df = main_df_dedup[main_df_dedup['id'].isin(trainable_ids)]

In [26]:
trainable_df.head()

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
1,3TwThCtZGVI,Etsy Commission,313,133.0,People & Blogs,53.0,19.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,128.038,128.038,avc1.640028,48000.0,2.0,"0.0,79.0,Intro\n79.0,109.0,Lesson Learned\n109...",en,en,,
3,fG3DqJ_YqAs,PANGANDARAN SAAT INI | DRONE VIEW | DISAAT MALAM,781,1024.0,People & Blogs,13.0,47.0,CIAMIS 46211,UCCwrY8WB54qY-jI7hXJsh3g,19300.0,...,137.007,137.007,avc1.640028,48000.0,2.0,,en,en,,
33,sndzyEYbmTA,MYO Pointillism Art,112,54.0,Howto & Style,,0.0,Roanoke County Public Library,UCRDIW4cJil7ls67SLQs9a4Q,217.0,...,123.075,123.075,vp09.00.50.08,48000.0,2.0,,en,en,,
56,kWpOGMfmyFI,Write Code Save Lives: Digital Innovation for...,831,2222.0,Science & Technology,,2.0,Fogarty International Center at NIH,UCcSRJ6pWh8fvQVxwJMAEfWg,171.0,...,107.52,107.52,vp09.00.40.08,48000.0,2.0,"0,135.0,<Untitled Chapter 1>\n135.0,419.0,Stan...",en,en,,
57,4H2n-qfwvZ8,SUNY Downstate Medical Center / University at ...,310,725.0,Science & Technology,,2.0,Fogarty International Center at NIH,UCcSRJ6pWh8fvQVxwJMAEfWg,171.0,...,135.472,135.472,vp9,48000.0,2.0,,en,en,,


In [28]:
trainable_df.reset_index(drop=True, inplace=True)

In [30]:
def write_df_to_parquet(df, output_dir, rows_per_file):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    num_rows = df.shape[0]
    num_files = (num_rows + rows_per_file - 1) // rows_per_file  # Calculate number of files
    
    for i in range(num_files):
        start_row = i * rows_per_file
        end_row = min(start_row + rows_per_file, num_rows)
        subset_df = df.iloc[start_row:end_row]
        
        # Write each subset to a separate parquet file
        file_name = os.path.join(output_dir, f"subset_{i}.parquet")
        subset_df.to_parquet(file_name, engine='pyarrow')  # 'pyarrow' is commonly used as the Parquet engine

        print(f"Written rows {start_row} to {end_row} into file: {file_name}")

# Define parameters
output_directory = 'data/440K_metadata'
rows_per_file = 600000  # Change this value based on your file size requirement

# Call the function to write DataFrame into multiple parquet files
write_df_to_parquet(trainable_df, output_directory, rows_per_file)

Written rows 0 to 600000 into file: data/440K_metadata/subset_0.parquet
Written rows 600000 to 1200000 into file: data/440K_metadata/subset_1.parquet
Written rows 1200000 to 1800000 into file: data/440K_metadata/subset_2.parquet
Written rows 1800000 to 2300152 into file: data/440K_metadata/subset_3.parquet


## Generating Statistics

In [10]:
captions_0_df = pd.read_parquet(metadata_files[0])
temp_df = clean_df(captions_0_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(449872.3247222222, 163173.4663888889)

In [11]:
captions_1_df = pd.read_parquet(metadata_files[1])
temp_df = clean_df(captions_1_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(468788.4372222223, 161794.45944444442)

In [12]:
captions_2_df = pd.read_parquet(metadata_files[2])
temp_df = clean_df(captions_2_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(463617.8672222222, 167049.53333333333)

In [13]:
captions_3_df = pd.read_parquet(metadata_files[3])
temp_df = clean_df(captions_3_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(471860.9663888889, 152873.61833333335)

In [14]:
captions_4_df = pd.read_parquet(metadata_files[4])
temp_df = clean_df(captions_4_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(461073.8111111111, 172368.21916666668)

In [15]:
captions_5_df = pd.read_parquet(metadata_files[5])
temp_df = clean_df(captions_5_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(450831.0725, 156993.89027777777)

In [16]:
captions_6_df = pd.read_parquet(metadata_files[6])
temp_df = clean_df(captions_6_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(453780.5527777778, 171731.0841666667)

In [17]:
captions_7_df = pd.read_parquet(metadata_files[7])
temp_df = clean_df(captions_7_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(451183.7258333333, 151748.9525)

In [18]:
captions_8_df = pd.read_parquet(metadata_files[8])
temp_df = clean_df(captions_8_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(445482.2472222222, 183320.50694444444)

In [19]:
captions_9_df = pd.read_parquet(metadata_files[9])
temp_df = clean_df(captions_9_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(448018.40305555554, 174166.63444444444)

In [20]:
captions_10_df = pd.read_parquet(metadata_files[10])
temp_df = clean_df(captions_10_df)
get_duration_ml_t(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


(588.0472222222222, 58.34777777777778)

In [21]:
449872.3247222222 + 468788.4372222223 + 463617.8672222222 + 471860.9663888889 + 461073.8111111111 + 450831.0725 + 453780.5527777778 + 451183.7258333333 + 445482.2472222222 + 448018.40305555554 + 588.0472222222222

4565097.455277778

In [23]:
163173.4663888889 + 161794.45944444442 + 167049.53333333333 + 152873.61833333335 + 172368.21916666668 + 156993.89027777777 + 171731.0841666667 + 151748.9525 + 183320.50694444444 + 174166.63444444444 + 58.34777777777778

1655278.712777778

In [24]:
4565097.455277778 + 1655278.712777778

6220376.168055557

In [26]:
temp_df = clean_df(captions_0_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


566697.6641666667

In [27]:
temp_df = clean_df(captions_1_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


574250.731111111

In [28]:
temp_df = clean_df(captions_2_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


551560.5

In [29]:
temp_df = clean_df(captions_3_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


572103.6394444444

In [30]:
temp_df = clean_df(captions_4_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


566707.5469444444

In [31]:
temp_df = clean_df(captions_5_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


580833.2383333333

In [32]:
temp_df = clean_df(captions_6_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


569279.4233333333

In [33]:
temp_df = clean_df(captions_7_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


563446.2297222223

In [34]:
temp_df = clean_df(captions_8_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


550060.0961111111

In [35]:
temp_df = clean_df(captions_9_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


562896.7374999999

In [36]:
temp_df = clean_df(captions_10_df)
get_duration_en_only(temp_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


420.2072222222222

## Is number of hours proportional to number of audio-transcript pairs? As in, can you estimate the number of hours from number of audio-transcript pairs?

In [29]:
# calculating total duration from all trainable audio-transcript pairs
df_list = []
total_dur = 0
for parquet in glob.glob("data/440K_metadata/*.parquet"):
    df = pd.read_parquet(parquet)
    df_dur = sum(df["duration"]) / (60 * 60)
    total_dur += df_dur

In [30]:
total_dur

411584.1433333333

In [44]:
# calculating total duration from all audio-transcript pairs subsampled randomly for downloading and training
with open("logs/data/download/sampled_en.txt", "r") as f:
    durs = [int(line.strip().split("\t")[-1]) for line in f]
sum(durs) / (60 * 60)

438202.3872222222

In [45]:
# if 438202 hours correspond to 2448500 audio-transcript pairs, then how many hours correspond to 2300152 audio-transcript pairs?
est_total_dur = (2300152 * 438202.3872222222) / 2448500
print(total_dur, est_total_dur)
# it looks like they're quite close, so it seems that the duration of the subsampled audio-transcript pairs is quite 
# representative of the total duration of all audio-transcript pairs
# but let's check again...

411584.1433333333 411652.8884516924


In [37]:
# getting the video IDs after best filter
with open("logs/data/download/filtered.txt", "r") as f:
    filtered = [line.strip() for line in f]
filtered

['pkp_MM22ivE',
 'q7QPE66HoNY',
 'qHeOWpL7uCI',
 'qJoGRiG_iOQ',
 'qPUT1ovOqjc',
 'qPWWtUwRYlY',
 'qQbULMp2RFo',
 'qS3njdnQ29g',
 'qSMPGxovMi0',
 'qSj6m7ZHUmE',
 'qZfmbaSz_yg',
 'qzaWJXGmDnE',
 'rCahxo8uEIU',
 'rD6Y9Rvef7w',
 'rH09pwsMIYQ',
 'rLu4-1d9Qe4',
 'rR29J6-8b_4',
 'r_JTowo-E1s',
 'sBFsqZW01Ds',
 'sHXDknsO1DA',
 'sMYQCT7GPiI',
 'sSzU11S1e0o',
 'sUw117HHuQM',
 'snod06rCcZc',
 'tBslHCACu20',
 'tFwd_m5U-pw',
 'tG1Ai-oIYB8',
 'tK7bKhFvcP4',
 'tL3nd6ltJf4',
 'tPt5Bc-FYY8',
 'tc6zpF_jXFM',
 'tdkBABj5TJg',
 'tnSOdcWzozM',
 'tunQASBB9FA',
 'tuvrW998tko',
 'tyHAVBBp0iw',
 'u6NfqReEnuc',
 'uQpbEjdCDLQ',
 'ufgd1tgZk5A',
 'v5FXsOqCJhw',
 'v8-YdYtv4hE',
 'vK9B_6PgGek',
 'vLyrce6NdF0',
 'vNogT4RDw3I',
 'vO9Qos87aYg',
 'vYtoQlQIxqU',
 'vaTJzRB7DLg',
 'vfFzwBd6HAs',
 'vkDkfJEWRUY',
 'vkFnM5Xad24',
 'vlUveFSZxUU',
 'vzFYhtQLBpA',
 'w-QuumiO0Fw',
 'w2nhb7Js7lw',
 'w4NX46wjbdQ',
 'w90sTQt6_dQ',
 'wA_2NxFuWM0',
 'wFQu7CFAtX4',
 'wPLW6ZfCUlM',
 'wQephUzleVg',
 'wQqpC4A2xbA',
 'wRjRiLOluk4',
 'whFecU

In [38]:
temp_df_1 = main_df_dedup[main_df_dedup['id'].isin(filtered)]
temp_df_1.shape[0]

1175427

In [39]:
# duration of all audio-transcript pairs after best filter
sum(temp_df_1['duration']) / (60 * 60)

122121.75833333333

In [46]:
# if 411584 hours correspond to 2300152 audio-transcript pairs, then how many hours correspond to 1175427 audioo-transcript pairs?
est_dur = (1175427 * 411584.1433333333) / 2300152
act_dur = sum(temp_df_1['duration']) / (60 * 60)
print(act_dur, est_dur)
# they're quite different, so the duration of the subsampled audio-transcript pairs is not representative of the 
# total duration of all audio-transcript pairs

122121.75833333333 210328.3238872344


## How to systematically calculate how much to subsample from raw data pool and download
- Calculate reduction in data by **hours** after filtering (percentage and hours)
- If this reduction leads to number of hours **<440K hours**, then
    - Using percentage of reduction in data hours, calculate how much might be needed to enable reduction and yield minimum number of data hours to fulfill 440K hours (reduction in data hours)
    - Also account for how much might be removed when trying to download (downloadable) and segmented (trainable)
- Randomly subsample from the raw data pool iteratively until total duration of sample meets requirement

In [56]:
# calculating data hours retained after filtering
hours_kept = sum(temp_df_1['duration']) / (60 * 60)
# calculating the reduction in data hours
hours_reduced = total_dur - sum(temp_df_1['duration']) / (60 * 60)
print(f"{hours_kept=}, {hours_reduced=}")
perc_kept = (hours_kept / 411584.1433333333) * 100
# calculating the reduction in data hours by percentage
perc_reduced = (100 - perc_kept)
print(f"{perc_kept=}, {perc_reduced=}")

hours_kept=122121.75833333333, hours_reduced=289462.385
perc_kept=29.671152378294973, perc_reduced=70.32884762170502


In [58]:
approx_hours_needed = (440000 - hours_kept)
approx_hours_subsample = (100 * approx_hours_needed) / perc_kept
print(f"{approx_hours_needed=}, {approx_hours_subsample=}")

approx_hours_needed=317878.2416666667, approx_hours_subsample=1071337.7007197093


## Randomly subsampling English data for downloading (2nd time) to get sufficient data to filter down to 440K hours

In [16]:
with open("data/metadata/approx_en_only.txt", "r") as f:
    all_en_ids = [line.strip() for line in f]

with open("logs/data/download/440K/440K_download_ids.txt", "r") as f:
    downloaded_ids = [line.strip().split("\t")[0] for line in f]

en_ids = set(all_en_ids) - set(downloaded_ids)
print(len(en_ids))

en_not_downloaded_df = main_df_dedup[main_df_dedup['id'].isin(en_ids)]

27441720


In [17]:
import pandas as pd
import numpy as np

# Parameters
num_samples = 1000000  # Number of values to subsample
subsampled_ids = []
hours_to_subsample = 1080000
en_ids_list = list(en_ids)
duration = 0
rng = np.random.default_rng(42)
while True:
    # Randomly subsample a specified amount of values from 'category' column
    subsampled_values = rng.choice(en_ids_list, num_samples, replace=False)
    subsampled_ids.extend(subsampled_values)
    
    duration += en_not_downloaded_df[en_not_downloaded_df['id'].isin(subsampled_values)]["duration"].sum() / (60 * 60)
    print(f"{duration=}")
    print(f"{len(subsampled_values)=}")
    
    if duration >= hours_to_subsample:
        break
    else:
        en_ids_list = list(set(en_ids_list) - set(subsampled_values))

duration=187233.3025
len(subsampled_values)=1000000
duration=375183.75638888887
len(subsampled_values)=1000000
duration=564000.3927777777
len(subsampled_values)=1000000
duration=752849.7641666667
len(subsampled_values)=1000000
duration=941115.3980555555
len(subsampled_values)=1000000
duration=1129832.358611111
len(subsampled_values)=1000000


In [18]:
len(set(subsampled_ids))

6000000

In [19]:
en_not_downloaded_df["duration"].sum() / (60 * 60)

5174723.14

In [20]:
main_df_dedup[main_df_dedup['id'].isin(downloaded_ids)]["duration"].sum() / (60 * 60)

438202.3872222222

In [21]:
subsampled_ids[:10]

['U6msOTWWOFo',
 'bRow2kcrPmQ',
 'LCFilmhRBwU',
 'Touc8zNtky8',
 'Ow8SVjlrwjE',
 'f7AoNul76v0',
 'a8GYAcbg_1w',
 'N0vBTUkADl4',
 'HDqnzGBAt4A',
 'OUljrTDyBD0']

In [22]:
with open("logs/data/download/1M_en/subsampled_ids.txt", "w") as f:
    for video_id in subsampled_ids:
        f.write(video_id + "\n")

In [31]:
subsampled_df = main_df_dedup[main_df_dedup["id"].isin(subsampled_ids)]

In [35]:
subsampled_df["man_cap_langs_2"] = subsampled_df["manual_caption_languages"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subsampled_df["man_cap_langs_2"] = subsampled_df["manual_caption_languages"]


In [30]:
condition = (
        subsampled_df["man_cap_langs_2"]
        .apply(standardize_dialects_2)
        .apply(lambda lst: set(lst) == {"en"})
)
temp_df_5 = subsampled_df[condition]
list(temp_df_5["manual_caption_languages"].unique())

['en',
 'en-US,en-uYU-mmqFLq8',
 'en-uYU-mmqFLq8',
 'en-eEY6OEpapPo',
 'en-IN',
 'en-GB',
 'en-US',
 'en-cvfXDfbeED0',
 'en-CA',
 'en-zL7EZQmMa4Q',
 'en-IE',
 'en-zL_pKPa_d08',
 'en-cFSbaKj2OKY',
 'en-1oAFzPNiuC0',
 'en-ehkg1hFWq8A',
 'en,en-zL7EZQmMa4Q',
 'en-0hllRZe4s5s',
 'en-rX01gsAdUfU',
 'en-anMIXjicSL4',
 'en-US-YfeEIUII1AU',
 'en,en-GB',
 'en-y01yHln2iAs',
 'en-uYU-mmqFLq8,en-zL_pKPa_d08',
 'en-nP7-2PuUl7o',
 'en-y7ZDYfb4tI8',
 'en,en-US',
 'en-GusEpHUv8yI',
 'en-UMYBvvsXfqo',
 'en,en-CA',
 'en-A8Ln5B_8GBo',
 'en,en-UDggLCECq8g',
 'en-wILyS7txfUA',
 'en,en-uYU-mmqFLq8',
 'en-d6gV6WVcwQQ',
 'en-ZW6bPmS_ILY',
 'en-Uym2X1t3MTA',
 'en-puBa_fWWqOc',
 'en-vP3gpS-n2EU',
 'en-OkkdIt_1FtE',
 'en-US,en-US-Az1uF2nEdyo',
 'en-LUU0EuDKgKo',
 'en-6UJrWS5jR_I',
 'en-JkeT_87f4cc,en-uYU-mmqFLq8',
 'en-FmoQciUtYSc',
 'en-gG13fOM8Dfs',
 'en-LUU0EuDKgKo,en-gG13fOM8Dfs',
 'en-SJ_OorK2o0k',
 'en-bo0751DuF0A',
 'en-6UJrWS5jR_I,en-gG13fOM8Dfs',
 'en-tyWfmtkyBQo',
 'en-6UJrWS5jR_I,en-LUU0EuDKgKo',
 'en

In [36]:
subsampled_df["filtered_langs"] = subsampled_df["man_cap_langs_2"].apply(lambda langs: [lang for lang in langs.split(",") if lang.startswith("en")])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subsampled_df["filtered_langs"] = subsampled_df["man_cap_langs_2"].apply(lambda langs: [lang for lang in langs.split(",") if lang.startswith("en")])


In [40]:
view_lang_cols_2(subsampled_df, 50)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language,filtered_langs
40,D0TBEM3oUtk,"Meet Medical Oncologist, Matthew Kulke, MD",en,en,en,[en]
50,ybPciSw7vyE,"Meet: J. Pieter Noordzij, MD",en,en,en,[en]
93,3Mr8lg5kvIU,🌶🔥 SPICY FRIED CHICKEN RAMEN: The Best In Osak...,en,en,,[en]
115,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en,[en]
118,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en,[en]
149,7CRyjEyWjP4,Schneider Roofing & Remodeling - Show Me St. L...,"en-US,en-uYU-mmqFLq8",en,en,"[en-US, en-uYU-mmqFLq8]"
169,OJEVmtRoDW0,SOPIR TRUK PALING HOROR SERING KETEMU HANTU DI...,en,id,id,[en]
184,TYmnAQT88qw,Ano Novo 1989 e Aniversarios,en-uYU-mmqFLq8,en,,[en-uYU-mmqFLq8]
220,LQ1kAABebes,Amazing Korean Strawberries 🍓😮 | 15 Days Arou...,en,en,,[en]
223,oU2XC49MLEk,Arriving in Seoul ❤️ | 15 Days Around Seoul - ...,en,en,,[en]


In [44]:
subsampled_df.reset_index(drop=True, inplace=True)
subsampled_df.iloc[0]

id                                                                       D0TBEM3oUtk
title                                     Meet Medical Oncologist, Matthew Kulke, MD
duration                                                                          60
view_count                                                                    1179.0
categories                                                            People & Blogs
comment_count                                                                    NaN
like_count                                                                       5.0
channel                                                        Boston Medical Center
channel_id                                                  UCCsieriDUNU-2-nNmXngYAw
channel_follower_count                                                        7460.0
age_limit                                                                          0
upload_date                                                      

In [45]:
subsampled_df.iloc[0].filtered_langs

['en']

In [48]:
rng = np.random.default_rng(42)
subsampled_df["download_lang"] = subsampled_df["filtered_langs"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subsampled_df["download_lang"] = subsampled_df["filtered_langs"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])


In [56]:
view_lang_cols_4(subsampled_df, 50)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language,man_cap_langs_2,filtered_langs,download_lang
0,D0TBEM3oUtk,"Meet Medical Oncologist, Matthew Kulke, MD",en,en,en,en,[en],en
1,ybPciSw7vyE,"Meet: J. Pieter Noordzij, MD",en,en,en,en,[en],en
2,3Mr8lg5kvIU,🌶🔥 SPICY FRIED CHICKEN RAMEN: The Best In Osak...,en,en,,en,[en],en
3,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en,"en,es",[en],en
4,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en,"en,es",[en],en
5,7CRyjEyWjP4,Schneider Roofing & Remodeling - Show Me St. L...,"en-US,en-uYU-mmqFLq8",en,en,"en-US,en-uYU-mmqFLq8","[en-US, en-uYU-mmqFLq8]",en-US
6,OJEVmtRoDW0,SOPIR TRUK PALING HOROR SERING KETEMU HANTU DI...,en,id,id,en,[en],en
7,TYmnAQT88qw,Ano Novo 1989 e Aniversarios,en-uYU-mmqFLq8,en,,en-uYU-mmqFLq8,[en-uYU-mmqFLq8],en-uYU-mmqFLq8
8,LQ1kAABebes,Amazing Korean Strawberries 🍓😮 | 15 Days Arou...,en,en,,en,[en],en
9,oU2XC49MLEk,Arriving in Seoul ❤️ | 15 Days Around Seoul - ...,en,en,,en,[en],en


In [57]:
subsampled_df[["id", "manual_caption_languages", "download_lang"]].to_json("logs/data/download/1M_en/subsampled_ids.json", orient="records", lines=True)

In [61]:
import json

with open("logs/data/download/1M_en/subsampled_ids.json", "r") as f:
    subsampled_dicts = [json.loads(line) for line in f]


In [62]:
rng = np.random.default_rng(42)
rng.shuffle(subsampled_dicts)

In [63]:
subsampled_dicts[:10]

[{'id': 'hahSCT7UYfE',
  'manual_caption_languages': 'en-uYU-mmqFLq8',
  'download_lang': 'en-uYU-mmqFLq8'},
 {'id': 'YOxTr-KmtoU',
  'manual_caption_languages': 'en',
  'download_lang': 'en'},
 {'id': 'hetgPN1Pmmo',
  'manual_caption_languages': 'en-US',
  'download_lang': 'en-US'},
 {'id': 'qettCHgdKAU',
  'manual_caption_languages': 'en-WJwqjBL5cw8,en-uYU-mmqFLq8',
  'download_lang': 'en-WJwqjBL5cw8'},
 {'id': '2-D6zNVqVvY',
  'manual_caption_languages': 'en-uYU-mmqFLq8',
  'download_lang': 'en-uYU-mmqFLq8'},
 {'id': 'W9kG-knT7do',
  'manual_caption_languages': 'en-1weAQujjfYU',
  'download_lang': 'en-1weAQujjfYU'},
 {'id': '7novyN7ZF6w',
  'manual_caption_languages': 'en-q-YP8NOPts0,en-uYU-mmqFLq8',
  'download_lang': 'en-q-YP8NOPts0'},
 {'id': 'JenZpOb_now',
  'manual_caption_languages': 'en-US',
  'download_lang': 'en-US'},
 {'id': 'MEbBC1i3ucI',
  'manual_caption_languages': 'en-GB',
  'download_lang': 'en-GB'},
 {'id': 'qMbGM_F1A3s',
  'manual_caption_languages': 'en-uYU-mmqFLq

In [87]:
subsampled_tuples = [(d["id"], d["download_lang"]) for d in subsampled_dicts]

In [88]:
subsampled_tuples[:10]

[('hahSCT7UYfE', 'en-uYU-mmqFLq8'),
 ('YOxTr-KmtoU', 'en'),
 ('hetgPN1Pmmo', 'en-US'),
 ('qettCHgdKAU', 'en-WJwqjBL5cw8'),
 ('2-D6zNVqVvY', 'en-uYU-mmqFLq8'),
 ('W9kG-knT7do', 'en-1weAQujjfYU'),
 ('7novyN7ZF6w', 'en-q-YP8NOPts0'),
 ('JenZpOb_now', 'en-US'),
 ('MEbBC1i3ucI', 'en-GB'),
 ('qMbGM_F1A3s', 'en-uYU-mmqFLq8')]

In [64]:
with open("logs/data/download/1M_en/shuffled_subsampled_ids.json", "w") as f:
    for d in subsampled_dicts:
        f.write(json.dumps(d) + "\n")

In [74]:
subsampled_batches = [{f"batch_{i // 1000}": subsampled_dicts[i:i + 1000]} for i in range(0, 6000000, 1000)]

In [92]:
subsampled_batches_tuples = [{"videoIds": subsampled_tuples[i:i + 1000], "batchIdx": i // 1000} for i in range(0, 6000000, 1000)]

In [93]:
subsampled_batches_tuples[-1]

{'videoIds': [('1kSG0gDVvmY', 'en'),
  ('_hALGphRLEE', 'en-ehkg1hFWq8A'),
  ('U3xac534mEE', 'en'),
  ('1cqzp47V4Jc', 'en-yGn0zlAHf04'),
  ('UoMIxsepHvY', 'en-US'),
  ('aZOlUO0f6fw', 'en'),
  ('x5bEeaKBkeg', 'en'),
  ('fHWrf-XkRdk', 'en-US'),
  ('c64JrGjYL9c', 'en-RTbB2cpHawQ'),
  ('BL1K7wrA73k', 'en'),
  ('j-9p8INVN5Y', 'en-nP7-2PuUl7o'),
  ('e3YAZSi-uHA', 'en-GB'),
  ('bQrDaIYSOV8', 'en-uYU-mmqFLq8'),
  ('2a-thABZcpM', 'en'),
  ('TfeFv-wBeOc', 'en'),
  ('wyQ1YpLFK5U', 'en-GB'),
  ('4OrZ5OhCxjE', 'en'),
  ('qBbHZMT9Y5U', 'en-uYU-mmqFLq8'),
  ('_XTsTPKaZ8o', 'en'),
  ('ZnDJbAPNVz0', 'en'),
  ('q78z0Axl1MI', 'en'),
  ('hpYUGIL8Ew8', 'en'),
  ('wr4E81B8vnQ', 'en'),
  ('AHpX25o2iH8', 'en'),
  ('jJKLVi5K0VI', 'en'),
  ('bK1icqYTbSQ', 'en'),
  ('Mo-lwPeouN4', 'en'),
  ('glbQlYjwBIE', 'en-US'),
  ('fWOE5FEeMC4', 'en-IN'),
  ('qDkg8pgiwrY', 'en-GB'),
  ('AfvHGHP01BE', 'en'),
  ('QkPN9P5lYZk', 'en'),
  ('Vvvrs3fiB7w', 'en'),
  ('rqcPMoNgb4E', 'en-US'),
  ('iX3X6YXooLQ', 'en'),
  ('HMMPRnK2H3Q',

In [78]:
len(subsampled_batches)

6000

In [79]:
subsampled_batches[-1]

{'batch_5999': [{'id': '1kSG0gDVvmY',
   'manual_caption_languages': 'en',
   'download_lang': 'en'},
  {'id': '_hALGphRLEE',
   'manual_caption_languages': 'de-ehkg1hFWq8A,en-ehkg1hFWq8A,es-ehkg1hFWq8A,ru-ehkg1hFWq8A',
   'download_lang': 'en-ehkg1hFWq8A'},
  {'id': 'U3xac534mEE',
   'manual_caption_languages': 'en',
   'download_lang': 'en'},
  {'id': '1cqzp47V4Jc',
   'manual_caption_languages': 'en-yGn0zlAHf04',
   'download_lang': 'en-yGn0zlAHf04'},
  {'id': 'UoMIxsepHvY',
   'manual_caption_languages': 'en-US',
   'download_lang': 'en-US'},
  {'id': 'aZOlUO0f6fw',
   'manual_caption_languages': 'en',
   'download_lang': 'en'},
  {'id': 'x5bEeaKBkeg',
   'manual_caption_languages': 'en',
   'download_lang': 'en'},
  {'id': 'fHWrf-XkRdk',
   'manual_caption_languages': 'en-US',
   'download_lang': 'en-US'},
  {'id': 'c64JrGjYL9c',
   'manual_caption_languages': 'en-RTbB2cpHawQ',
   'download_lang': 'en-RTbB2cpHawQ'},
  {'id': 'BL1K7wrA73k',
   'manual_caption_languages': 'en',
   '

In [76]:
len(subsampled_batches[1]["batch_1"])

1000

In [None]:
with open("logs/data/download/1M_en/shuffled_subsampled_batches.jsonl", "w") as f:
    for i, batch in enumerate(subsampled_batches):
        f.write(json.dumps(batch) + "\n")

In [94]:
with open("logs/data/download/1M_en/shuffled_subsampled_batches.jsonl", "w") as f:
    for i, batch in enumerate(subsampled_batches_tuples):
        f.write(json.dumps(batch) + "\n")

In [None]:
import json

# code to split up batches into smaller batches from existing JSONL file
new_batches = []
with open("logs/data/download/1M_en/shuffled_subsampled_batches.jsonl", "r") as f:
    for line in f:
        d = json.loads(line.strip())
        batch_idx = int(d["batchIdx"])
        for i in range(0, len(d["videoIds"]), 250):
            new_batches.append({"videoIds": d["videoIds"][i:i + 250], "batchIdx": (batch_idx * 4) + (i // 250)})
            
with open("logs/data/download/1M_en/shuffled_subsampled_quart_batches.jsonl", "w") as f:
    for d in new_batches:
        f.write(json.dumps(d) + "\n")

## Dealing w/ missing pairs (2nd download)

In [29]:
with open("logs/data/download/missing_pairs_ids.txt", "r") as f:
    missing_pairs = [(line.strip().split(" ")[0], int(line.strip().split(" ")[1])) for line in f]
missing_pairs

[('EA8N7g8I_ns', 7922),
 ('wkx9hWqa_ok', 7922),
 ('kmVgCGWoXIA', 7922),
 ('s7DAFyO56mE', 7922),
 ('jU0c6fS3d6U', 7922),
 ('Q5DJgt-XF_k', 7922),
 ('P8MNjOVOxvE', 7922),
 ('NYxyfhoTooE', 7922),
 ('7tNoD9m9qAo', 7922),
 ('yaUjx_JLkyo', 5167),
 ('pitvacBn-Eg', 5167),
 ('d1wtQlaQEuQ', 5167),
 ('KWtkFHDjiiQ', 5167),
 ('51JN86DU-3o', 5167),
 ('3gKx5ng9mgg', 5167),
 ('ajxTT1TKhYI', 5167),
 ('5d8sOQZ8npM', 5167),
 ('aZINEc6YVxI', 5167),
 ('tdvRwbRsLa0', 5167),
 ('kcq73_jFMn0', 5628),
 ('Kpba6f53hds', 5628),
 ('E_70QGzZFfU', 5628),
 ('lDTL8tfv6ts', 5628),
 ('4WIudT6-lco', 5628),
 ('s_6CkYke0BE', 5628),
 ('OuBjMcI7ANU', 5628),
 ('XpDNYy78fiU', 5628),
 ('Dv4B6QrqVUA', 5628),
 ('Qk1ZlDKTyys', 3880),
 ('Ryij01EbQkI', 3880),
 ('N-wDlQkgpKA', 3880),
 ('-ZwpuQEFJjo', 3880),
 ('2NF9eJtwHOs', 3880),
 ('sWEunvKrFUI', 3880),
 ('V0nms7EK4BI', 3880),
 ('y1U89RB5PvU', 7741),
 ('5MvK7wOnDTw', 7741),
 ('9XZF9gJakaU', 7741),
 ('YO1TFY2CY0U', 7741),
 ('U0BQxLAxVqE', 7741),
 ('IKGd7bKTFe4', 7741),
 ('DJDF7BNS9js',

In [38]:
from collections import defaultdict

def group_by_second_element(data):
    grouped_data = defaultdict(list)
    
    # Iterate through each tuple
    for first, second in data:
        grouped_data[second].append(first)
    
    return dict(grouped_data)

# Using the function
batch_idx_ids = group_by_second_element(missing_pairs)
batch_idx_ids

{7922: ['EA8N7g8I_ns',
  'wkx9hWqa_ok',
  'kmVgCGWoXIA',
  's7DAFyO56mE',
  'jU0c6fS3d6U',
  'Q5DJgt-XF_k',
  'P8MNjOVOxvE',
  'NYxyfhoTooE',
  '7tNoD9m9qAo'],
 5167: ['yaUjx_JLkyo',
  'pitvacBn-Eg',
  'd1wtQlaQEuQ',
  'KWtkFHDjiiQ',
  '51JN86DU-3o',
  '3gKx5ng9mgg',
  'ajxTT1TKhYI',
  '5d8sOQZ8npM',
  'aZINEc6YVxI',
  'tdvRwbRsLa0'],
 5628: ['kcq73_jFMn0',
  'Kpba6f53hds',
  'E_70QGzZFfU',
  'lDTL8tfv6ts',
  '4WIudT6-lco',
  's_6CkYke0BE',
  'OuBjMcI7ANU',
  'XpDNYy78fiU',
  'Dv4B6QrqVUA'],
 3880: ['Qk1ZlDKTyys',
  'Ryij01EbQkI',
  'N-wDlQkgpKA',
  '-ZwpuQEFJjo',
  '2NF9eJtwHOs',
  'sWEunvKrFUI',
  'V0nms7EK4BI'],
 7741: ['y1U89RB5PvU',
  '5MvK7wOnDTw',
  '9XZF9gJakaU',
  'YO1TFY2CY0U',
  'U0BQxLAxVqE',
  'IKGd7bKTFe4',
  'DJDF7BNS9js',
  '2_dAybKwP80',
  'Ip9HgJIqqws',
  'msNijolazjU'],
 3605: ['BnbbHFCI9gc',
  '8snvic2AfOU',
  'ghz3sCzNXjw',
  '4Oz_jWXeFLY',
  'nF5U2a3wp0s',
  'DU61hE-j0Bg',
  'OtmvvRRahN0',
  'gvtSKeS4U9k',
  'iPVhhs7XqNw',
  '_gLRoSqQ-uc',
  'MOP0scB8_mQ'],
 6822:

In [30]:
with open("logs/data/download/missing_pairs_ids.txt", "r") as f:
    missing_pairs_ids = [line.strip().split(" ")[0] for line in f]
missing_pairs_ids

['EA8N7g8I_ns',
 'wkx9hWqa_ok',
 'kmVgCGWoXIA',
 's7DAFyO56mE',
 'jU0c6fS3d6U',
 'Q5DJgt-XF_k',
 'P8MNjOVOxvE',
 'NYxyfhoTooE',
 '7tNoD9m9qAo',
 'yaUjx_JLkyo',
 'pitvacBn-Eg',
 'd1wtQlaQEuQ',
 'KWtkFHDjiiQ',
 '51JN86DU-3o',
 '3gKx5ng9mgg',
 'ajxTT1TKhYI',
 '5d8sOQZ8npM',
 'aZINEc6YVxI',
 'tdvRwbRsLa0',
 'kcq73_jFMn0',
 'Kpba6f53hds',
 'E_70QGzZFfU',
 'lDTL8tfv6ts',
 '4WIudT6-lco',
 's_6CkYke0BE',
 'OuBjMcI7ANU',
 'XpDNYy78fiU',
 'Dv4B6QrqVUA',
 'Qk1ZlDKTyys',
 'Ryij01EbQkI',
 'N-wDlQkgpKA',
 '-ZwpuQEFJjo',
 '2NF9eJtwHOs',
 'sWEunvKrFUI',
 'V0nms7EK4BI',
 'y1U89RB5PvU',
 '5MvK7wOnDTw',
 '9XZF9gJakaU',
 'YO1TFY2CY0U',
 'U0BQxLAxVqE',
 'IKGd7bKTFe4',
 'DJDF7BNS9js',
 '2_dAybKwP80',
 'Ip9HgJIqqws',
 'msNijolazjU',
 'BnbbHFCI9gc',
 '8snvic2AfOU',
 'ghz3sCzNXjw',
 '4Oz_jWXeFLY',
 'nF5U2a3wp0s',
 'DU61hE-j0Bg',
 'OtmvvRRahN0',
 'gvtSKeS4U9k',
 'iPVhhs7XqNw',
 '_gLRoSqQ-uc',
 'MOP0scB8_mQ',
 'L6KXJaH0Q6Y',
 'ktYtGMm7QMg',
 '3H1zZNswdf4',
 '8nnqYXX7VDU',
 'oB4aXnc34MI',
 'J3tO5bOUR2s',
 'mDafoS

In [31]:
missing_pairs_df = main_df_dedup[main_df_dedup['id'].isin(missing_pairs_ids)]

In [32]:
missing_pairs_df["man_cap_langs_2"] = missing_pairs_df["manual_caption_languages"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_pairs_df["man_cap_langs_2"] = missing_pairs_df["manual_caption_languages"]


In [33]:
missing_pairs_df["filtered_langs"] = missing_pairs_df["man_cap_langs_2"].apply(lambda langs: [lang for lang in langs.split(",") if lang.startswith("en")])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_pairs_df["filtered_langs"] = missing_pairs_df["man_cap_langs_2"].apply(lambda langs: [lang for lang in langs.split(",") if lang.startswith("en")])


In [34]:
view_lang_cols_2(missing_pairs_df, 30)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language,man_cap_langs_2,filtered_langs
1710,vK-dM5aW1IE,WALA KANG MGA MAGULANG explain | spoken by A...,en,en,,en,[en]
1744,edssqlT3woM,Follow Your God Nudges | Women of Influence,en-uYU-mmqFLq8,en,en,en-uYU-mmqFLq8,[en-uYU-mmqFLq8]
1896,PhOdfxigdTg,GRADUATION IN FILM STUDIES IN INDIA | IF YOU W...,en-IN,en,en,en-IN,[en-IN]
2220,i_a908UrEaE,Up Close & Personal: THE AMERICAS,en,en,en,en,[en]
2859,dUThA-2aJl8,Bedtime Yoga | 15 Min Yoga Stretch - Nighttime...,en,en,en,en,[en]
3604,1169YX_TV40,TREASURE - ASAHI CAM EP 1 REACTION !!,en,id,id,en,[en]
5176,x8XN_kZ64Ug,Meet Miss Universe Malta 2017 Tiffany Pisani,en,en,en,en,[en]
5185,aT2dnECDQgM,"Behind the Scenes of ""Around the World in 80 T...",en,en,en,en,[en]
5755,uQuf8Y_0FF4,"Savitar, the God of Speed • Are you a man or a...","en,fr",en,en,"en,fr",[en]
6358,LbcjEYdwo1U,Signs From The Universe That You Are On The Ri...,en,en,en,en,[en]


In [35]:
missing_pairs_df.reset_index(drop=True, inplace=True)

In [36]:
rng = np.random.default_rng(42)
missing_pairs_df["download_lang"] = missing_pairs_df["filtered_langs"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_pairs_df["download_lang"] = missing_pairs_df["filtered_langs"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])


In [37]:
view_lang_cols_4(missing_pairs_df, 50)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language,man_cap_langs_2,filtered_langs,download_lang
0,vK-dM5aW1IE,WALA KANG MGA MAGULANG explain | spoken by A...,en,en,,en,[en],en
1,edssqlT3woM,Follow Your God Nudges | Women of Influence,en-uYU-mmqFLq8,en,en,en-uYU-mmqFLq8,[en-uYU-mmqFLq8],en-uYU-mmqFLq8
2,PhOdfxigdTg,GRADUATION IN FILM STUDIES IN INDIA | IF YOU W...,en-IN,en,en,en-IN,[en-IN],en-IN
3,i_a908UrEaE,Up Close & Personal: THE AMERICAS,en,en,en,en,[en],en
4,dUThA-2aJl8,Bedtime Yoga | 15 Min Yoga Stretch - Nighttime...,en,en,en,en,[en],en
5,1169YX_TV40,TREASURE - ASAHI CAM EP 1 REACTION !!,en,id,id,en,[en],en
6,x8XN_kZ64Ug,Meet Miss Universe Malta 2017 Tiffany Pisani,en,en,en,en,[en],en
7,aT2dnECDQgM,"Behind the Scenes of ""Around the World in 80 T...",en,en,en,en,[en],en
8,uQuf8Y_0FF4,"Savitar, the God of Speed • Are you a man or a...","en,fr",en,en,"en,fr",[en],en
9,LbcjEYdwo1U,Signs From The Universe That You Are On The Ri...,en,en,en,en,[en],en


In [42]:
import json
batch_dict_list = []
for idx, video_ids in batch_idx_ids.items():
    batch_df = missing_pairs_df[missing_pairs_df['id'].isin(video_ids)]
    id_lang_dicts = batch_df[["id", "download_lang"]].to_dict("records")
    id_lang = [(d["id"], d["download_lang"]) for d in id_lang_dicts]
    batch_dict = {"videoIds": id_lang, "batchIdx": idx}
    batch_dict_list.append(batch_dict)
    
with open("logs/data/download/1M_en/missing_pairs.jsonl", "w") as f:
    for d in batch_dict_list:
        f.write(json.dumps(d) + "\n")

## Downloading text data from pool (estimated to be English)

In [None]:
with open("data/metadata/approx_en_only.txt", "r") as f:
    all_en_ids = [line.strip() for line in f]