## Imports

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import json
os.chdir("/Users/huongn/Desktop/open_whisper")

In [2]:
metadata_files = glob.glob("data/metadata/*.parquet")
metadata_files

['data/metadata/captions-0001.parquet',
 'data/metadata/captions-0008.parquet',
 'data/metadata/captions-0009.parquet',
 'data/metadata/captions-0010.parquet',
 'data/metadata/captions-0000.parquet',
 'data/metadata/captions-0002.parquet',
 'data/metadata/captions-0003.parquet',
 'data/metadata/captions-0006.parquet',
 'data/metadata/captions-0007.parquet',
 'data/metadata/captions-0005.parquet',
 'data/metadata/captions-0004.parquet']

## UDFs

In [3]:
def standardize_dialects(s):
    words = s.split(",")
    transformed_words = [word.split("-")[0] if "-" in word else word for word in words]
    return ",".join(transformed_words)

In [4]:
def standardize_dialects_list(s):
    words = s.split(",")
    transformed_words = [word.split("-")[0] if "-" in word else word for word in words]
    return transformed_words

In [5]:
def get_percent_empty(df):
    return df.apply(lambda col: (col == "").mean() * 100)

In [6]:
def clean_df(df):
    df = df.copy()
    # fill in missing values with empty string
    df["automatic_caption_orig_language"] = df[
        "automatic_caption_orig_language"
    ].fillna("")
    df["language"] = df["language"].fillna("")

    # maybe don't need code above because this takes care of en case as well
    df["manual_caption_languages"] = df["manual_caption_languages"].apply(
        standardize_dialects
    )
    df["automatic_caption_orig_language"] = df["automatic_caption_orig_language"].apply(
        standardize_dialects
    )
    df["language"] = df["language"].apply(standardize_dialects)

    # check if all "en" types have been changed to strictly "en"
    if "-" in df["manual_caption_languages"].unique():
        print("dialects still exists in manual_caption_languages")
    else:
        print(
            "all dialects have been changed to strictly family language in manual_caption_languages"
        )

    if "-" in df["automatic_caption_orig_language"].unique():
        print("dialects still exists in automatic_caption_orig_language")
    else:
        print(
            "all dialects have been changed to strictly family language in automatic_caption_orig_language"
        )

    if "-" in df["language"].unique():
        print("dialects still exists in language")
    else:
        print("all dialects have been changed to strictly family language in language")

    return df

In [7]:
def check_in_col(row, ref_col, tgt_col):
    words = [word.strip() for word in row[tgt_col].split(",")]
    # Check if the second column's value is in the list of words
    return row[ref_col] in words

In [8]:
def get_duration_ml_t(df):
    condition = (
        df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    )
    not_strictly_en_df = df[~condition]
    condition_4 = not_strictly_en_df["manual_caption_languages"].str.contains("en")
    no_en_df = not_strictly_en_df[~condition_4]
    condition_5 = (
        no_en_df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: len(lst) == 1)
    )
    one_man_lang_df = no_en_df[condition_5]
    condition_6 = (
        (
            one_man_lang_df["manual_caption_languages"]
            == one_man_lang_df["automatic_caption_orig_language"]
        )
        | (one_man_lang_df["automatic_caption_orig_language"] == "")
        | (one_man_lang_df["automatic_caption_orig_language"] == "en")
    )
    condition_7 = (
        no_en_df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: len(lst) > 1)
    )
    many_man_lang_df = no_en_df[condition_7]
    condition_8 = many_man_lang_df.apply(
        check_in_col,
        axis=1,
        args=("automatic_caption_orig_language", "manual_caption_languages"),
    )

    temp_2 = many_man_lang_df[condition_8]
    # getting data complement of condition 8
    temp_3 = many_man_lang_df[~condition_8]
    # getting data where automatic_caption_orig_language == ""
    condition_9 = temp_3["automatic_caption_orig_language"] == ""
    temp_4 = temp_3[condition_9][
        temp_3[condition_9].apply(
            check_in_col, axis=1, args=("language", "manual_caption_languages")
        )
    ]
    condition_10 = temp_3["automatic_caption_orig_language"] != ""
    temp_5 = temp_3[condition_10][
        temp_3[condition_10].apply(
            check_in_col, axis=1, args=("language", "manual_caption_languages")
        )
    ]
    condition_11 = (
        condition_4
        & (not_strictly_en_df["automatic_caption_orig_language"] != "en")
        & (not_strictly_en_df["automatic_caption_orig_language"] != "")
    )
    condition_12 = not_strictly_en_df[condition_11].apply(
        check_in_col,
        axis=1,
        args=("automatic_caption_orig_language", "manual_caption_languages"),
    )
    temp_6 = not_strictly_en_df[condition_11][condition_12]
    condition_13 = (
        (not_strictly_en_df["manual_caption_languages"].str.contains("en"))
        & ("" == not_strictly_en_df["automatic_caption_orig_language"])
        & (not_strictly_en_df["language"] != "en")
    )
    condition_14 = (
        (not_strictly_en_df["manual_caption_languages"].str.contains("en"))
        & (not_strictly_en_df["automatic_caption_orig_language"] == "en")
        & (not_strictly_en_df["language"] != "en")
        & (not_strictly_en_df["language"] != "")
    )
    condition_15 = (
        (not_strictly_en_df["manual_caption_languages"].str.contains("en"))
        & ("" == not_strictly_en_df["automatic_caption_orig_language"])
        & (not_strictly_en_df["language"] != "en")
        & (not_strictly_en_df["language"] != "")
    )
    condition_16 = not_strictly_en_df[condition_15].apply(
        check_in_col,
        axis=1,
        args=("language", "manual_caption_languages"),
    )
    temp_7 = not_strictly_en_df[condition_15][condition_16]
    condition_17 = not_strictly_en_df[condition_14].apply(
        check_in_col,
        axis=1,
        args=("language", "manual_caption_languages"),
    )
    temp_8 = not_strictly_en_df[condition_14][condition_17]

    ml_dur = (
        one_man_lang_df[condition_6]["duration"].sum() / (60 * 60)
        + temp_2["duration"].sum() / (60 * 60)
        + temp_4["duration"].sum() / (60 * 60)
        + temp_5["duration"].sum() / (60 * 60)
        + temp_6["duration"].sum() / (60 * 60)
        + temp_7["duration"].sum() / (60 * 60)
        + temp_8["duration"].sum() / (60 * 60)
    )
    t_dur = (
        (not_strictly_en_df[condition_11]["duration"].sum() / (60 * 60))
        + (not_strictly_en_df[condition_13]["duration"].sum() / (60 * 60))
        + (not_strictly_en_df[condition_14]["duration"].sum() / (60 * 60))
    )

    return ml_dur, t_dur

In [9]:
def get_duration_en_only(df):
    # getting data that's strictly en in manual_caption_languages
    condition = (
        df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    )
    manual_en_df = df[condition]

    temp = df[~condition]

    # getting data where en in manual_caption_languages (not exactly "en") (temp) and en == automatic_caption_orig_language
    condition_2 = (
        (temp["manual_caption_languages"].str.contains("en"))
        & (temp["automatic_caption_orig_language"] == "en")
        & ((temp["language"] == "en") | (temp["language"] == ""))
    )

    # getting data where en in manual_caption_languages (not exactly "en") (temp) and "" == automatic_caption_orig_language but en == language
    condition_3 = (
        (temp["manual_caption_languages"].str.contains("en"))
        & ("" == temp["automatic_caption_orig_language"])
        & (temp["language"] == "en")
    )

    # returning total duration
    return (
        (manual_en_df["duration"].sum() / (60 * 60))
        + (temp[condition_2]["duration"].sum() / (60 * 60))
        + (temp[condition_3]["duration"].sum() / (60 * 60))
    )

In [10]:
def view_lang_cols(df, num_rows=5):
    return df[["id", "title"] + [col for col in df.columns if "lang" in col]].head(num_rows)

In [11]:
def get_num_rows(df):
    return df.shape[0]

In [12]:
def subsample_data(df, per_iter_sample: int, hours_to_subsample: int):
    subsampled_ids = []
    ids_list = df["id"].tolist()
    dur = 0
    rng = np.random.default_rng(42)
    while True:
        # Randomly subsample a specified amount of values from 'category' column
        subsampled_values = rng.choice(ids_list, per_iter_sample, replace=False)
        subsampled_ids.extend(subsampled_values)
        
        dur += df[df['id'].isin(subsampled_values)]["duration"].sum() / (60 * 60)
        print(f"{dur=}")
        print(f"{len(subsampled_values)=}")
        
        if dur >= hours_to_subsample:
            break
        else:
            ids_list = list(set(ids_list) - set(subsampled_values))

    return subsampled_ids

In [13]:
def get_subsampled_batches(subsampled_ids_langs, batch_size):
    subsampled_batches = [{f"videoIds": subsampled_ids_langs[i:i + batch_size], "batchIdx": i // batch_size} for i in range(0, len(subsampled_ids_langs), batch_size)]
    return subsampled_batches

## Getting Video IDs

### Loading in data from parquet files

In [14]:
df_list = []
for path in metadata_files:
    df = pd.read_parquet(path)
    df_list.append(df)
main_df = pd.concat(df_list)
main_df.reset_index(drop=True, inplace=True)

In [15]:
len(main_df)

60102381

In [16]:
main_df_dedup = main_df.drop_duplicates(subset=["id"], keep="first")
len(main_df_dedup)

60098151

In [17]:
main_df_dedup["duration"].sum() / (60 * 60)

11933139.694166666

In [18]:
main_df.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license'],
      dtype='object')

### Duplicates in df from joining parquet files (exploration - don't run)

In [63]:
len(main_df)

60102381

In [58]:
len(set(main_df["id"]))

60098151

In [60]:
len(main_df["id"]) - len(set(main_df["id"]))

4230

In [21]:
unique_count_main_df = len(set(main_df["id"]))
unique_count_downloaded = len(set(id_downloaded))
print(unique_count_main_df, unique_count_downloaded)
print(unique_count_main_df - unique_count_downloaded)

60098151 2448500
57649651


In [14]:
main_df_dedup = main_df.drop_duplicates(subset=["id"], keep="first")
len(main_df_dedup)

60098151

#### Confirmation that there are duplicates when joining all parquet files

In [57]:
from collections import defaultdict
d = defaultdict(int)
for id in main_df["id"].tolist():
    d[id] += 1

In [59]:
for id, count in d.items():
    if count > 1:
        print(id)

fyvEDS-PQO8
QIySuiQ3_Yw
-t-J098gF10
QAgrHKTqkIE
5F5oJyZv0Dc
LLIwgjSQnj0
X3-I_5gdZ00
2We5HvMPLFY
EcqahSwiJRo
xqS5zApPnKw
V7gZj9C9OUk
liPcdDFouP0
lqNAbx1nXkc
xFAckAQoIFs
OdJyNoOeYHo
XdC3qMWmap8
xUC88-IuEkM
kNgP0TF-ENQ
Rz8zo1iPr0k
Gm-zwbxzzOQ
LK4r2ePVb1U
wseZE-FzyQw
WG7XUDDBnpM
L6Hi4HSVBDQ
Woa4MU04aVk
slUYJZ1Fm3E
_MjflJq6f7w
I2ADZMmP0Uc
CpxeV9XMvdg
gfv9EjSMQ60
FKvAfEpijPE
ScRAOOK-Vlc
ppAk683jX_0
BnQ_T-rjeJA
EaYAwNY5OHw
8mGu6xP74Hk
dtrfTUw6vKc
XfdnPE7bQ_A
bzWAKI7lFp4
D3Fn3914hOI
s6vH0tYrh9o
L7joInxvxkw
YKXT1oMhCWk
ML_d9o0eQ7g
6lhiTVDJzio
tSuKLGf1jXE
e_MzXz3D8qQ
X80Epyyy450
r3jSMTmdhP8
ew0w8-StOb4
nkesANaBEBc
sD4tmb_VYUU
XKIC5ZAcumE
C42IcopodNE
yHwFqC8k4bE
KH8HYrX-JvY
GvIyZ1kVoTw
3-4RV5d9w4U
7Nt_W_TM1yw
YELcmcSY1eA
SlNh4aRMRT8
_0j9JoWm9Uo
877UTIzw_Oo
I51gsm0FHTg
mdXmMblgj9Y
ojnM5FfSHNY
gmpdNWFCLwk
LIk59pRMU58
F_LESLVzU_M
MlZvjMePLaM
cVnZuq1XBa0
O9npRJ6OVvM
PCyiFADtKHY
xLdrfWIs84Y
KlkqaZuqkoo
5wy9JRzJ7aM
ohc5qPk3y6w
wE6-OYTH7ks
C5SaOTC0tIM
Qd1HedDfZRU
ucg0U4qZ1UU
XKhHI3wMcdc
LRjt2rDtQok
t7li

In [61]:
main_df[main_df["id"] == "fyvEDS-PQO8"]

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
16753897,fyvEDS-PQO8,404 (FILE NOT FOUND),368,19.0,People & Blogs,1.0,2.0,Nubz,UCVlYKhbTiO8TT6fRD-aY6vw,53.0,...,125.77,125.77,vp09.00.50.08,48000.0,2.0,,en-GB,en,,
18029445,fyvEDS-PQO8,404 (FILE NOT FOUND),368,19.0,People & Blogs,1.0,2.0,Nubz,UCVlYKhbTiO8TT6fRD-aY6vw,53.0,...,125.77,125.77,vp09.00.50.08,48000.0,2.0,,en-GB,en,,


### Getting information on remaining to download

#### Getting approximately English data to download (2nd round)
- Conditions used to get approximately English data (union):
    1. manual_caption_languages must only contain `en` or variants of `en`
    2. in the complement set of 1., manual_caption_languages must contain at least `en` or variants of `en`
        
        3. in this set, language == `en`
- UPDATE: 11/5/2024
    - These are conditions that lead to bad eval performance. That could mean there could be more filtering to get higher quality data. For now, we use a subset from this and do more data filtering to train.

In [57]:
with open("logs/data/download/2M_en/2M_download_ids.txt", "r") as f:
    id_downloaded = [line.strip().split("\t")[0] for line in f]
print(id_downloaded[:5])
print(len(id_downloaded))

['05xOF5ubabQ', 'GjTPw_aQmgg', 'T9_3s_yybq8', 'nKZSV-h0SGA', '3993tVeKd-w']
2448500


In [58]:
all_ids = set(main_df_dedup['id'])
downloaded_ids = set(id_downloaded)

not_downloaded_ids = all_ids - downloaded_ids

not_downloaded_df = main_df_dedup[main_df_dedup['id'].isin(not_downloaded_ids)]

In [59]:
not_downloaded_df.reset_index(drop=True, inplace=True)

In [60]:
view_lang_cols(not_downloaded_df)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,,en,en
2,OREjUtUT2Sc,New Intro | I keep losing them,,en,en
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),,sv,sv
4,2XitiLKk6rM,Анонимки в тик ток,ru,sv,ru


In [61]:
print(not_downloaded_df.shape[0])

57649651


In [28]:
clean_not_downloaded_df = clean_df(not_downloaded_df)

all dialects have been changed to strictly family language in manual_caption_languages
all dialects have been changed to strictly family language in automatic_caption_orig_language
all dialects have been changed to strictly family language in language


In [29]:
clean_not_downloaded_df.head()

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
0,LwJv3vSd8no,Free Art Wednesday!,372,378.0,People & Blogs,175.0,70.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,120.177,120.177,vp9,48000.0,2.0,,en,en,,
1,XXjYyBCNJEY,FREE Art Wednesday,293,194.0,People & Blogs,69.0,48.0,Wajas,UCrfdGNBnkDd0rOh1M9mdmaQ,1680.0,...,129.7,129.7,vp9,48000.0,2.0,,en,en,,
2,OREjUtUT2Sc,New Intro | I keep losing them,11,39.0,People & Blogs,3.0,9.0,Sade Royalty,UCE_3_c7b769PCiS1iqc1APw,363.0,...,158.206,158.206,vp9,48000.0,2.0,,en,en,,
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),20,29.0,People & Blogs,,5.0,Overk,UCOxiCFiOYLUgPCtdcDZiK5A,12.0,...,129.293,129.293,vp9,48000.0,2.0,,sv,sv,,
4,2XitiLKk6rM,Анонимки в тик ток,102,578.0,People & Blogs,3.0,17.0,Overk,UCOxiCFiOYLUgPCtdcDZiK5A,12.0,...,121.21,121.21,vp09.00.21.08.01.05.01.06.00,48000.0,2.0,,sv,ru,,


In [30]:
view_lang_cols(clean_not_downloaded_df)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,en,en,
2,OREjUtUT2Sc,New Intro | I keep losing them,en,en,
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),sv,sv,
4,2XitiLKk6rM,Анонимки в тик ток,sv,ru,ru


##### First condition

In [31]:
condition = (
        clean_not_downloaded_df["manual_caption_languages"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    )
only_en_1 = clean_not_downloaded_df[condition]

In [32]:
only_en_1.shape[0]

25507801

In [33]:
not_only_en_1_id = set(clean_not_downloaded_df["id"]) - set(only_en_1["id"])
len(not_only_en_1_id)

32141850

##### Second condition

In [34]:
not_only_en_1 = clean_not_downloaded_df[clean_not_downloaded_df["id"].isin(not_only_en_1_id)]

In [35]:
view_lang_cols(not_only_en_1)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),sv,sv,
4,2XitiLKk6rM,Анонимки в тик ток,sv,ru,ru
5,IPxXLQpOJRU,Marisa Marisa,it,it,
6,ug5TkquQEHo,Marisa marisa,it,it,
7,Ar-XwEnnFPY,Formation des Classe D | Serveur UPRP,fr,fr,fr


In [36]:
get_num_rows(not_only_en_1)

32141850

In [88]:
view_lang_cols(not_only_en_1[not_only_en_1["language"] == "en"], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
75,qVlXu1zanWE,【游玩克拉普】 平常小方块怎么测试声音的,zh,en,en
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
181,AxmibPoLlvo,[𝐏𝐋𝐀𝐘𝐋𝐈𝐒𝐓] 일렁이는 여름 햇살,ko,en,en
280,wboahvmJHyU,Resident Evil Revelations 2 - Trailer Oficial ...,pt,en,en
281,p_GO0YcfN4M,Resident Evil Revelations 2 - Trailer Oficial ...,pt,en,en


In [90]:
get_num_rows(not_only_en_1[not_only_en_1["language"] == "en"])

4070589

In [92]:
view_lang_cols(not_only_en_1[not_only_en_1["manual_caption_languages"].str.contains("en")], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
11,hWQZq0wHIwQ,rimne - First Color / MV,"en,ja",unknown,
19,4w-OpSyV5dw,Sangat Sedap | Semua Pasti Menyukainya!,"en,live_chat,ms",unknown,
21,JiHuFJ9RoHM,DEI UMA PASSADA NO CANADÁ ANTES DE IR PARA O B...,"en,pt",pt,pt
22,YhPHqdGcCQ4,"MINHA COLEÇÃO INTEIRA DA SUPERESTRELA DA NBA, ...","en,pt",pt,pt
23,zwPpeXqHsZs,JOGUEI O NBA 2K23 (PS4) CURRENT GEN PELA PRIME...,"en,pt",unknown,
24,VdlUZgEL1Qs,JOGUEI BASQUETE COM OS GRINGOS EM UMA QUADRA A...,"en,pt",pt,pt
56,9wY84F2-aUQ,DOĞADA HAYAT 44.BÖLÜM | LİFE İN NATURE EPİSOD...,"en,ko,ur",tr,tr
90,so8TfVlMuVs,"COMING SEPT 2021 - NEW MUSIC FROM ""CUSTOM""! - ...","en,live_chat",en,
92,OFZyHn-GP48,Control Z - bad guy,"en,es,pt",unknown,
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en


In [None]:
# second condition set here
only_en_2 = not_only_en_1[not_only_en_1["manual_caption_languages"].str.contains("en")]

In [38]:
only_en_2.shape[0]

9991767

##### Third condition

In [95]:
view_lang_cols(only_en_2[only_en_2["language"] == "en"], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
547,4khB0ZyEhVM,DFAT-CSIRO partnership,"en,vi",en,en
665,stIDKDUA-Xs,Panty Bleaching Gone Viral,"en,live_chat",en,en
696,1G1Fv0oGcpM,Adding subtitles using Chromebook only,"de,en",en,en
750,LeXguFYox0o,"ENTREVISTA | Neil Newbon, dublador de Nicholai...","en,pt",en,en


In [96]:
get_num_rows(only_en_2[only_en_2["language"] == "en"])

1933919

In [98]:
(only_en_2[only_en_2["language"] == "en"]["automatic_caption_orig_language"] == "").sum() / len(only_en_2[only_en_2["language"] == "en"])

0.0011768848643609168

In [97]:
view_lang_cols(only_en_2[only_en_2["automatic_caption_orig_language"] == "en"], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
90,so8TfVlMuVs,"COMING SEPT 2021 - NEW MUSIC FROM ""CUSTOM""! - ...","en,live_chat",en,
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
433,tFtImC8E0wY,My Heart And Seoul 😍 | 15 Days Around Seoul - ...,"en,live_chat",en,
547,4khB0ZyEhVM,DFAT-CSIRO partnership,"en,vi",en,en
665,stIDKDUA-Xs,Panty Bleaching Gone Viral,"en,live_chat",en,en


In [100]:
get_num_rows(only_en_2[only_en_2["automatic_caption_orig_language"] == "en"])

1980299

In [102]:
view_lang_cols(only_en_2[(only_en_2["automatic_caption_orig_language"] == "en") & (only_en_2["language"] == "en")], 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
99,1f14oKd9jx8,Meet Don - Direct Express cardholder & PayPerk...,"en,es",en,en
100,SK6ngqjyp8E,Meet Helen (Part 2) - Financial Capability wit...,"en,es",en,en
101,Wssea1xQJBE,"Meet Helen (Part 1) - Convenience, Safety & Se...","en,es",en,en
102,OjjMLrhBkSk,Meet Ray: Direct Express Cardholder & PayPerks...,"en,es",en,en
103,zicChYIBtIw,Direct Express Cardholder & PayPerks User Prof...,"en,es",en,en
104,6KsSZFDkwJ4,Meet Dawn - Direct Express Cardholder & PayPer...,"en,es",en,en
547,4khB0ZyEhVM,DFAT-CSIRO partnership,"en,vi",en,en
665,stIDKDUA-Xs,Panty Bleaching Gone Viral,"en,live_chat",en,en
696,1G1Fv0oGcpM,Adding subtitles using Chromebook only,"de,en",en,en
750,LeXguFYox0o,"ENTREVISTA | Neil Newbon, dublador de Nicholai...","en,pt",en,en


In [103]:
get_num_rows(only_en_2[(only_en_2["automatic_caption_orig_language"] == "en") & (only_en_2["language"] == "en")])

1930688

In [39]:
lang_en = only_en_2[only_en_2["language"] == "en"]
auto_lang_en = only_en_2[only_en_2["automatic_caption_orig_language"] == "en"]
lang_en_auto_lang_en = only_en_2[(only_en_2["automatic_caption_orig_language"] == "en") & (only_en_2["language"] == "en")]

In [40]:
lang_en.shape[0], auto_lang_en.shape[0], lang_en_auto_lang_en.shape[0]

(1933919, 1980299, 1930688)

In [107]:
temp = set(lang_en["id"]) - set(lang_en_auto_lang_en["id"])
temp_df_2 = lang_en[lang_en["id"].isin(temp)]
view_lang_cols(temp_df_2, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
30378,VHOvLOPVxX8,noc18-me62-Lec 02B-Instrument -II,"bn,en,en,gu,hi,kn,ml,mr,ta,te",,en
38783,M56GMKXRE7c,🥥❤️ Sadece 3 Malzemeyle Yapabileceğiniz En Güz...,"ar,az,bg,bn,bs,cs,da,de,el,en,es,fa,fi,fil,fr,...",,en
59528,34iDTeCNTz4,Simulating color vision deficiencies in the Bl...,"en,nl",nl,en
123890,Wn_1Egqrq9o,Crazy Speed Eating Challenge w/ Rosanna | Copy...,"en,es",,en
123891,RZsb2O2ndCk,Transforming 3 YouTubers into Contortionists |...,"en,es",,en
219529,Np8GMBsUiv8,L'Italien | Policier | Film complet en français,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",fr,en
219532,J-t9WP_Gl_I,"N'embrasse pas la mariée | Comédie, Action | F...","ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",,en
219533,CWKZetMHzO8,Ultime Combat (Action) Film complet en français,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",,en
220580,V9JSRHVuLBE,Petite Princesse | Classique | Film complet en...,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",unknown,en
220584,dZPEVh0PWAU,Last Apocalypse | Action | Film complet en fra...,"ar,de,el,en,es,fa,fil,fr,hi,hu,id,it,iw,ja,ko,...",,en


In [109]:
temp_1 = set(auto_lang_en["id"]) - set(lang_en_auto_lang_en["id"])
temp_df_3 = auto_lang_en[auto_lang_en["id"].isin(temp_1)]
view_lang_cols(temp_df_3, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
90,so8TfVlMuVs,"COMING SEPT 2021 - NEW MUSIC FROM ""CUSTOM""! - ...","en,live_chat",en,
433,tFtImC8E0wY,My Heart And Seoul 😍 | 15 Days Around Seoul - ...,"en,live_chat",en,
865,-2auYFne_l0,Adventures of Zalke West : Tenerife Part II - ...,"en,live_chat",en,
866,dM2FwfPBC2A,Adventures of Zalke West: Tenerife Part I - Teide,"en,live_chat",en,
1511,4cz74eJittA,20 Days in CA Exam | Must Cover these things! ...,"en,live_chat",en,
8014,C3oJo-J3lTo,Setting Up Our Christmas Tree 2021 ft. Rayan &...,"en,live_chat",en,
8015,cMOHF0BWKTw,400 to 1000 Rs Kurti Shopping | Meesho Haul | ...,"en,live_chat",en,
8016,VDWdvB8JYVs,Fun Baking Cookies with Rayan 👩‍🍳😎🧒 | Cooking ...,"en,live_chat",en,
8017,tNpSQure4iA,5000 Rupees For A Lunch 😵 | Most Expensive Res...,"en,live_chat",en,
8018,HPY1tat23pI,Masala Chowk Street Food Review 😋♥️ | Jaipur S...,"en,live_chat",en,


In [111]:
any_lang_en_id = set(lang_en["id"].tolist() + auto_lang_en["id"].tolist() + lang_en_auto_lang_en["id"].tolist())
temp_2 = set(only_en_2["id"]) - any_lang_en_id
temp_df_4 = only_en_2[only_en_2["id"].isin(temp_2)]
view_lang_cols(temp_df_4, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
11,hWQZq0wHIwQ,rimne - First Color / MV,"en,ja",unknown,
19,4w-OpSyV5dw,Sangat Sedap | Semua Pasti Menyukainya!,"en,live_chat,ms",unknown,
21,JiHuFJ9RoHM,DEI UMA PASSADA NO CANADÁ ANTES DE IR PARA O B...,"en,pt",pt,pt
22,YhPHqdGcCQ4,"MINHA COLEÇÃO INTEIRA DA SUPERESTRELA DA NBA, ...","en,pt",pt,pt
23,zwPpeXqHsZs,JOGUEI O NBA 2K23 (PS4) CURRENT GEN PELA PRIME...,"en,pt",unknown,
24,VdlUZgEL1Qs,JOGUEI BASQUETE COM OS GRINGOS EM UMA QUADRA A...,"en,pt",pt,pt
56,9wY84F2-aUQ,DOĞADA HAYAT 44.BÖLÜM | LİFE İN NATURE EPİSOD...,"en,ko,ur",tr,tr
92,OFZyHn-GP48,Control Z - bad guy,"en,es,pt",unknown,
106,SI7nWeZJsaE,अब रोबोट पुलिस करेगी अपराधियों का एनकाउंटर/Now...,"en,hi",hi,hi
107,-ruaKTKQuvM,Ayam Penyet Cheese & Kambing Bakar Madu - Menu...,"en,ms",unknown,


##### Filter fully established here

In [41]:
en_ids = set(only_en_1["id"].tolist() + lang_en["id"].tolist() + lang_en_auto_lang_en["id"].tolist())
len(en_ids)

27441720

In [42]:
approx_en_only = list(en_ids) + id_downloaded
len(approx_en_only)

29890220

In [43]:
with open("data/metadata/approx_en_only.txt", "w") as f:
    for video_id in approx_en_only:
        f.write(video_id + "\n")

In [44]:
en_not_downloaded_df = not_downloaded_df[not_downloaded_df["id"].isin(en_ids)]
view_lang_cols(en_not_downloaded_df, 20)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,en,en,
2,OREjUtUT2Sc,New Intro | I keep losing them,en,en,
8,nmRxk7k3kvU,Meditation Music video-2,en-US,en-US,
14,m18UlcXg3Xg,' YOU LADYBUG SONG' |MIRACULOUS LADYBUG & CAT ...,en-GB,en-GB,
20,y1yb_WnCfUA,Video Editor is. Thastefuel?,en-GB,en-GB,
30,sIKsodQRvlc,MYO Hand Letter Art,en-US,en-US,
31,v8uKybpMJdM,Nature Moment | Common Blue Violet Flower,en,en,en
32,IkiTwxRtQUk,MYO Whipped Coffee,en,en,en
33,VvPE3uB7O1A,Nature Moment | The Holly Plant,en,en,en


In [45]:
en_not_downloaded_df[en_not_downloaded_df["manual_caption_languages"].str.contains("en-")].head(20)

Unnamed: 0,id,title,duration,view_count,categories,comment_count,like_count,channel,channel_id,channel_follower_count,...,abr,vbr,vcodec,asr,audio_channels,chapters,manual_caption_languages,automatic_caption_orig_language,location,license
8,nmRxk7k3kvU,Meditation Music video-2,1295,44.0,Music,1.0,2.0,Meditation music,UCaHJfbRk8gnOZGrJTm5ZI4Q,3.0,...,124.292,124.292,avc1.640028,48000.0,2.0,,en-US,en-US,,
14,m18UlcXg3Xg,' YOU LADYBUG SONG' |MIRACULOUS LADYBUG & CAT ...,157,2433.0,Music,,57.0,miraculous ladybug season 06,UCBgXfZj6cVvu8cz0AX66HZQ,524.0,...,131.965,131.965,vp09.00.31.08.01.05.01.06.00,48000.0,2.0,,en-GB,en-GB,AUSTRALIA,
20,y1yb_WnCfUA,Video Editor is. Thastefuel?,12,25.0,Music,1.0,1.0,FrostyBallz Jr,UC-uKNbvsFlLqBaVI_ysSvig,3.0,...,130.202,130.202,vp9,44100.0,2.0,,en-GB,en-GB,,
30,sIKsodQRvlc,MYO Hand Letter Art,80,24.0,Howto & Style,,1.0,Roanoke County Public Library,UCRDIW4cJil7ls67SLQs9a4Q,217.0,...,138.62,138.62,vp09.00.50.08,48000.0,2.0,,en-US,en-US,,
52,0Yt4JoCW8oU,G Gundam Screenshop Repaint,1233,27.0,Howto & Style,,1.0,TkayArtz,UCHH2EHqaELDc2Lq9Ct5tn5w,34.0,...,139.683,139.683,avc1.640028,48000.0,2.0,,en-US,en,,
54,orWh_OLaRg4,All Goals | Final Copa America | Argentina (1)...,77,17.0,People & Blogs,,0.0,North Direction,UCDWbla2PIao9c6DnQNENkDQ,,...,122.41,122.41,vp09.00.21.08,48000.0,2.0,,"en-JkeT_87f4cc,en-uYU-mmqFLq8",en,,
93,y3o-baXQT0Q,VLOG #1 - Introduction,123,169.0,People & Blogs,7.0,7.0,Rikzaaa,UCkWADUHEgSPEdLlpdsc6pAA,306.0,...,115.805,115.805,avc1.640028,48000.0,2.0,,en-US,en,,
131,zHGwE0ritgk,Let's Play Minecraft: Greatest freakout ever,122,2620.0,Gaming,3.0,14.0,Riftenthecoolish93,UC_eV8_QSpMKm7ouQrs__qew,776.0,...,156.967,156.967,vp09.00.30.08,48000.0,2.0,,en-18WT74-rBWA,en,,
133,1N2Q2-veW8Y,Changing the Roofing Industry by Building Trus...,128,1.0,Howto & Style,,0.0,Schneider Roofing & Remodeling,UCA0DTNFRM-GrtGH_DpH9vYA,4.0,...,115.743,115.743,avc1.640028,48000.0,2.0,,en-US,en,,
134,7CRyjEyWjP4,Schneider Roofing & Remodeling - Show Me St. L...,164,13.0,Howto & Style,,0.0,Schneider Roofing & Remodeling,UCA0DTNFRM-GrtGH_DpH9vYA,4.0,...,103.942,103.942,avc1.640028,48000.0,2.0,,"en-US,en-uYU-mmqFLq8",en,,


In [46]:
en_not_downloaded_df["man_cap_langs_2"] = en_not_downloaded_df["manual_caption_languages"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_not_downloaded_df["man_cap_langs_2"] = en_not_downloaded_df["manual_caption_languages"]


In [47]:
en_not_downloaded_df.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license', 'man_cap_langs_2'],
      dtype='object')

In [48]:
view_lang_cols(en_not_downloaded_df)

Unnamed: 0,id,title,manual_caption_languages,automatic_caption_orig_language,language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,en,en,
2,OREjUtUT2Sc,New Intro | I keep losing them,en,en,
8,nmRxk7k3kvU,Meditation Music video-2,en-US,en-US,
14,m18UlcXg3Xg,' YOU LADYBUG SONG' |MIRACULOUS LADYBUG & CAT ...,en-GB,en-GB,


In [49]:
condition = (
        en_not_downloaded_df["man_cap_langs_2"]
        .apply(standardize_dialects_2)
        .apply(lambda lst: set(lst) == {"en"})
)
temp_df_5 = en_not_downloaded_df[condition]

In [50]:
temp_df_5["manual_caption_languages"].unique()

array(['en', 'en-US', 'en-GB', ..., 'en-CA-UFjxr4OIYz8', 'en-FpQ6dhQAASY',
       'en-LQRErUK_jpw'], dtype=object)

In [51]:
list(temp_df_5["manual_caption_languages"].unique())

['en',
 'en-US',
 'en-GB',
 'en-JkeT_87f4cc,en-uYU-mmqFLq8',
 'en-18WT74-rBWA',
 'en-US,en-uYU-mmqFLq8',
 'en-uYU-mmqFLq8',
 'en-eEY6OEpapPo',
 'en-IN',
 'en-CA',
 'en,en-GB',
 'en-cvfXDfbeED0',
 'en-US-anMIXjicSL4',
 'en-s8TMdDjdYe8',
 'en-IE',
 'en,en-US',
 'en-rX01gsAdUfU',
 'en-nP7-2PuUl7o',
 'en-zL7EZQmMa4Q',
 'en,en-IN',
 'en-imjcPpJerPk',
 'en-zL_pKPa_d08',
 'en-cFSbaKj2OKY',
 'en-anMIXjicSL4',
 'en-UMYBvvsXfqo',
 'en-1oAFzPNiuC0',
 'en-ehkg1hFWq8A',
 'en-vleHeMwJqgw',
 'en-0hllRZe4s5s',
 'en-US-eEY6OEpapPo',
 'en-K3JFmAG0FTI',
 'en,en-zL7EZQmMa4Q',
 'en-GB,en-US',
 'en-PQgNkSoyyBk',
 'en-TW7qhz_uLiI',
 'en-Zixs6prhbeE,en-nPk3V-duwf8',
 'en-US-YfeEIUII1AU',
 'en-y01yHln2iAs',
 'en-uYU-mmqFLq8,en-zL_pKPa_d08',
 'en-y7ZDYfb4tI8',
 'en-GusEpHUv8yI',
 'en-1FAeErPlCAc',
 'en,en-CA',
 'en-fGGqjO4aYg8',
 'en-q_HRRye8iTM',
 'en-nPk3V-duwf8',
 'en-A8Ln5B_8GBo',
 'en,en-UDggLCECq8g',
 'en-lqO-PPJy4Bc',
 'en-qoXxhA4oLYo',
 'en-m9hPnTBjEoU',
 'en-wILyS7txfUA',
 'en-IN,en-US',
 'en-6Effb_CP7

#### Checking if data downloaded from 1st round is a subset of data collection subsampled from and downloaded in 2nd round
It is.

In [43]:
with open("data/metadata/approx_en_relaxed.txt", "r") as f:
    approx_en_only = [line.strip() for line in f]

In [44]:
set(id_downloaded) - set(approx_en_only)

set()

#### Getting approximately English data to download (3rd round)
- This entails the same conditions (to get approx English data) as 1st round

In [62]:
with open("logs/data/download/2M_en/2M_download_ids.txt", "r") as f:
    id_downloaded = [line.strip().split("\t")[0] for line in f]
with open("logs/data/download/6M_en/subsampled_ids.txt", "r") as f:
    id_downloaded.extend([line.strip().split("\t")[0] for line in f])
print(id_downloaded[:5])
print(len(id_downloaded))

['05xOF5ubabQ', 'GjTPw_aQmgg', 'T9_3s_yybq8', 'nKZSV-h0SGA', '3993tVeKd-w']
8448500


In [63]:
all_ids = set(main_df_dedup['id'])
downloaded_ids = set(id_downloaded)

not_downloaded_ids = all_ids - downloaded_ids

not_downloaded_df = main_df_dedup[main_df_dedup['id'].isin(not_downloaded_ids)]

In [64]:
not_downloaded_df.reset_index(drop=True, inplace=True)

In [65]:
print(not_downloaded_df.shape[0])

51649651


In [66]:
not_downloaded_df["std_man_cap_langs"] = not_downloaded_df["manual_caption_languages"]
not_downloaded_df["std_man_cap_langs"] = not_downloaded_df["std_man_cap_langs"].apply(standardize_dialects)
view_lang_cols(not_downloaded_df, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_downloaded_df["std_man_cap_langs"] = not_downloaded_df["manual_caption_languages"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_downloaded_df["std_man_cap_langs"] = not_downloaded_df["std_man_cap_langs"].apply(standardize_dialects)


Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,std_man_cap_langs
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en,en
1,XXjYyBCNJEY,FREE Art Wednesday,,en,en,en
2,OREjUtUT2Sc,New Intro | I keep losing them,,en,en,en
3,cNPxiPUdyvA,(Примьера дораммы 2021!!!),,sv,sv,sv
4,2XitiLKk6rM,Анонимки в тик ток,ru,sv,ru,sv
5,IPxXLQpOJRU,Marisa Marisa,,it,it,it
6,ug5TkquQEHo,Marisa marisa,,it,it,it
7,Ar-XwEnnFPY,Formation des Classe D | Serveur UPRP,fr,fr-FR,fr,fr
8,nmRxk7k3kvU,Meditation Music video-2,,en-US,en-US,en
9,y9v4phX48UA,عودة ملوك العواصف الى سيرفر الجامعة/لعبة الامب...,,ar,ar,ar


In [67]:
condition = (
    not_downloaded_df["std_man_cap_langs"]
    .str.split(",")
    .apply(lambda lst: set(lst) == {"en"})
) & (not_downloaded_df["automatic_caption_orig_language"] == "en")
only_en_man_auto = not_downloaded_df[condition]

In [68]:
only_en_man_auto.shape[0]

12745843

In [69]:
sum(only_en_man_auto["duration"]) / (60 * 60)

2290777.251388889

#### Established conditions for English-only data for training
``` 
df_1["man_cap_langs_2"] = df_1["manual_caption_languages"]
condition = (
        df_1["man_cap_langs_2"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    ) & (df_1["automatic_caption_orig_language"] == "en")
df_2 = df_1[condition]
```
- To download transcript w/ specific language code, just use the 1st language. 

#### Established conditions for English-only data (for statistics and future exploration/filtering)

## Is number of hours proportional to number of audio-transcript pairs? As in, can you estimate the number of hours from number of audio-transcript pairs?
No, it is not. 

In [51]:
# calculating total duration from all trainable audio-transcript pairs
with open("logs/data/download/2M_en/2M_trainable_ids.txt", "r") as f:
    trainable_ids_2M = [line.strip() for line in f]
trainable_ids_2M_df = main_df_dedup[main_df_dedup["id"].isin(trainable_ids_2M)]
total_dur = trainable_ids_2M_df["duration"].sum() / (60 * 60)
print(total_dur)

411584.1433333333


In [52]:
print(len(trainable_ids_2M))

2300152


In [54]:
trainable_2M_dur = total_dur

In [55]:
# calculating total duration from all audio-transcript pairs subsampled randomly for downloading and training
with open("logs/data/download/2M_en/2M_download_ids.txt", "r") as f:
    durs = [int(line.strip().split("\t")[-1]) for line in f]
print(sum(durs) / (60 * 60))
downloadable_2M_dur = sum(durs) / (60 * 60)

438202.3872222222


In [61]:
# if 438202 hours correspond to 2448500 audio-transcript pairs, then how many hours correspond to 2300152 audio-transcript pairs?
est_trainable_2M_dur = (len(trainable_ids_2M) * downloadable_2M_dur) / len(id_downloaded)
print(trainable_2M_dur, est_trainable_2M_dur)
# it looks like they're quite close, so it seems that the duration of the subsampled audio-transcript pairs is quite 
# representative of the total duration of all audio-transcript pairs
# but let's check again...

411584.1433333333 411652.8884516924


In [58]:
# getting the video IDs after best filter
with open("logs/data/download/2M_en/2M_bestfilter_10_12.txt", "r") as f:
    bestfilter_2M = [line.strip() for line in f]
bestfilter_2M[:5]

['pkp_MM22ivE', 'q7QPE66HoNY', 'qHeOWpL7uCI', 'qJoGRiG_iOQ', 'qPUT1ovOqjc']

In [59]:
bestfilter_2M_df = main_df_dedup[main_df_dedup['id'].isin(bestfilter_2M)]
bestfilter_2M_df.shape[0]

1175427

In [63]:
# duration of all audio-transcript pairs after best filter
bestfilter_hours_kept = sum(bestfilter_2M_df['duration']) / (60 * 60)
print(bestfilter_hours_kept)

122121.75833333333


In [62]:
# if 411584 hours correspond to 2300152 audio-transcript pairs, then how many hours correspond to 1175427 audioo-transcript pairs?
est_bestfilter_dur = (bestfilter_2M_df.shape[0] * 411584.1433333333) / 2300152
act_bestfilter_dur = sum(bestfilter_2M_df['duration']) / (60 * 60)
print(act_bestfilter_dur, est_bestfilter_dur)
# they're quite different, so the duration of the subsampled audio-transcript pairs is not representative of the 
# total duration of all audio-transcript pairs

122121.75833333333 210328.3238872344


## How to systematically calculate how much to subsample from raw data pool and download
- Calculate reduction in data by **hours** after filtering (percentage and hours)
- If this reduction leads to number of hours **<440K hours**, then
    - Using percentage of reduction in data hours, calculate how much might be needed to enable reduction and yield minimum number of data hours to fulfill 440K hours (reduction in data hours)
    - Also account for how much might be removed when trying to download (downloadable) and segmented (trainable)
- Randomly subsample from the raw data pool iteratively until total duration of sample meets requirement

### For 2nd round of downloading, when only had 122K hours of filtered data (from 1st round of downloading)
*Note: Using variables from previous portion, don't delete!*

In [64]:
# calculating the reduction in data hours
bestfilter_hours_red = trainable_2M_dur - bestfilter_hours_kept
print(f"{bestfilter_hours_kept=}, {bestfilter_hours_red=}")
perc_kept = (bestfilter_hours_kept / trainable_2M_dur) * 100
# calculating the reduction in data hours by percentage
perc_reduced = (100 - perc_kept)
print(f"{perc_kept=}, {perc_reduced=}")

bestfilter_hours_kept=122121.75833333333, bestfilter_hours_red=289462.385
perc_kept=29.671152378294973, perc_reduced=70.32884762170502


In [65]:
approx_hours_needed = (440000 - bestfilter_hours_kept)
approx_hours_subsample = (100 * approx_hours_needed) / perc_kept
print(f"{approx_hours_needed=}, {approx_hours_subsample=}")

approx_hours_needed=317878.2416666667, approx_hours_subsample=1071337.7007197093


### For 3rd round of downloading, when only had 230K hours of filtered data (from 1st and 2nd round of downloading)

In [66]:
approx_hours_needed = (440000 - 230000)
approx_hours_subsample = (100 * approx_hours_needed) / perc_kept
print(f"{approx_hours_needed=}, {approx_hours_subsample=}")

approx_hours_needed=210000, approx_hours_subsample=707758.1528434976


## Randomly subsample English data for downloading to get sufficient data to filter down to 440K hours

(subject to change for non-English data)

### For 3rd round of downloading

In [70]:
subsampled_ids = subsample_data(df=only_en_man_auto, per_iter_sample=1000000, hours_to_subsample=700000)

dur=180319.0113888889
len(subsampled_values)=1000000
dur=360395.36805555556
len(subsampled_values)=1000000
dur=539037.1127777777
len(subsampled_values)=1000000
dur=719333.1136111111
len(subsampled_values)=1000000


#### Checking subsampled data

In [71]:
subsampled_df = not_downloaded_df[not_downloaded_df["id"].isin(subsampled_ids)]

In [72]:
view_lang_cols(subsampled_df)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,std_man_cap_langs
1,XXjYyBCNJEY,FREE Art Wednesday,,en,en,en
33,VvPE3uB7O1A,Nature Moment | The Holly Plant,en,en,en,en
36,8SnK7mWezWw,"Jobs for Veterans: Janine Hardman RN, BSN, Arm...",en,en,en,en
40,lVBVy0EY0I4,BMC Presents: Donate Life - Bob & Mike featuri...,en,en,en,en
50,0Yt4JoCW8oU,G Gundam Screenshop Repaint,en,en-US,en,en


In [73]:
[lang for lang in subsampled_df["manual_caption_languages"].unique() if "en" not in lang]

[]

#### Getting language code for transcript (random choice)

In [74]:
subsampled_df["man_cap_langs_list"] = subsampled_df["manual_caption_languages"].apply(lambda langs: langs.split(","))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subsampled_df["man_cap_langs_list"] = subsampled_df["manual_caption_languages"].apply(lambda langs: langs.split(","))


In [75]:
rng = np.random.default_rng(42)
subsampled_df["download_lang"] = subsampled_df["man_cap_langs_list"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subsampled_df["download_lang"] = subsampled_df["man_cap_langs_list"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])


In [76]:
view_lang_cols(subsampled_df, 20)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,std_man_cap_langs,man_cap_langs_list,download_lang
1,XXjYyBCNJEY,FREE Art Wednesday,,en,en,en,[en],en
33,VvPE3uB7O1A,Nature Moment | The Holly Plant,en,en,en,en,[en],en
36,8SnK7mWezWw,"Jobs for Veterans: Janine Hardman RN, BSN, Arm...",en,en,en,en,[en],en
40,lVBVy0EY0I4,BMC Presents: Donate Life - Bob & Mike featuri...,en,en,en,en,[en],en
50,0Yt4JoCW8oU,G Gundam Screenshop Repaint,en,en-US,en,en,[en-US],en-US
52,orWh_OLaRg4,All Goals | Final Copa America | Argentina (1)...,en,"en-JkeT_87f4cc,en-uYU-mmqFLq8",en,"en,en","[en-JkeT_87f4cc, en-uYU-mmqFLq8]",en-JkeT_87f4cc
59,P4xDdWmDdsY,DOĞADA HAYAT 39.BÖLÜM ATEŞ KUTUSUNDA SUCUK | ...,,en,en,en,[en],en
76,XuAsD43fbSU,Hokkaido Sweet Corn & Meal Pouch For Travelers...,,en,en,en,[en],en
77,jMT8OdDL_hE,Spicy Miso Ramen In Kyoto! | 15 Days Around Ja...,,en,en,en,[en],en
80,KpHg0kxFtMo,Beef Burger & Cheese Yang SUPER DAHSYAT | Burg...,,en,en,en,[en],en


#### Converting subsampled dataframe to JSONL batches

In [77]:
rng = np.random.default_rng(42)
shuffled_subsampled_ids_langs = subsampled_df[["id", "download_lang"]].values.tolist()
rng.shuffle(shuffled_subsampled_ids_langs)
shuffled_subsampled_ids_langs[:5]

[['CpqFbRHQ4u0', 'en-SUqXwpdmyfE'],
 ['acHsTL-bGE0', 'en-nP7-2PuUl7o'],
 ['XlzA78sVUcw', 'en'],
 ['ZupL8HxoSRA', 'en'],
 ['jCvdxiTm6qM', 'en-uYU-mmqFLq8']]

In [78]:
len(shuffled_subsampled_ids_langs)

4000000

In [79]:
subsampled_batches = get_subsampled_batches(shuffled_subsampled_ids_langs, 250)
print(len(subsampled_batches))
subsampled_batches[:5]

16000


[{'videoIds': [['CpqFbRHQ4u0', 'en-SUqXwpdmyfE'],
   ['acHsTL-bGE0', 'en-nP7-2PuUl7o'],
   ['XlzA78sVUcw', 'en'],
   ['ZupL8HxoSRA', 'en'],
   ['jCvdxiTm6qM', 'en-uYU-mmqFLq8'],
   ['yiWXOwrVITY', 'en-US'],
   ['j9l_OFYuDtM', 'en-uYU-mmqFLq8'],
   ['UoOMKnzYcNU', 'en'],
   ['yx7e8Oi-8dQ', 'en'],
   ['-esQO8EqlGc', 'en'],
   ['yGrXM9v2JLQ', 'en'],
   ['aYMOIrScCg0', 'en-uYU-mmqFLq8'],
   ['BZJ4yMKK96s', 'en'],
   ['9Ud1oYgf-Gk', 'en'],
   ['Zl2_koMI5ns', 'en'],
   ['CSV1teyQKj8', 'en-uYU-mmqFLq8'],
   ['8xBhsHbEfbo', 'en-US'],
   ['2GNcGt9mFQs', 'en-nP7-2PuUl7o'],
   ['A4wjXqyMCtU', 'en'],
   ['5RnJ8Qs7xzM', 'en'],
   ['0t0FfURt0Xs', 'en-US'],
   ['XcOSh2hF1nw', 'en-US'],
   ['i-reXVO2FpY', 'en'],
   ['9WAMMi_yIzI', 'en-uYU-mmqFLq8'],
   ['BYPASyonibk', 'en-US'],
   ['cI1viD2hiLI', 'en-LUU0EuDKgKo'],
   ['37f_qFubHAM', 'en-US'],
   ['rE0kUQwqwv4', 'en'],
   ['cvvoZ9Or0xo', 'en-uYU-mmqFLq8'],
   ['6lBZ4eUUA-w', 'en'],
   ['d8qUCEY__Ys', 'en-uYU-mmqFLq8'],
   ['iyUm25vNQ9A', 'en-US'],
   

In [80]:
print(len(subsampled_batches[0]["videoIds"]))

250


In [81]:
with open("logs/data/download/4M_en/shuffled_subsampled_batches.jsonl", "w") as f:
    for batch in subsampled_batches:
        f.write(json.dumps(batch) + "\n")

#### Getting remaining that isn't subsampled to download only transcript (for text-only filtering, will download paired audio later)

In [82]:
subsampled_ids_4M = subsampled_df["id"].values.tolist()
with open("logs/data/download/4M_en/subsampled_ids.txt", "w") as f:
    for video_id in subsampled_ids_4M:
        f.write(video_id + "\n")

In [84]:
only_en_man_auto_text = set(only_en_man_auto["id"].values.tolist()) - set(subsampled_ids_4M)
only_en_man_auto_text_df = only_en_man_auto[only_en_man_auto["id"].isin(only_en_man_auto_text)]

In [86]:
only_en_man_auto_text_df["man_cap_langs_list"] = only_en_man_auto_text_df["manual_caption_languages"].apply(lambda langs: langs.split(","))
only_en_man_auto_text_df["download_lang"] = only_en_man_auto_text_df["man_cap_langs_list"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])
print(only_en_man_auto_text_df.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_en_man_auto_text_df["man_cap_langs_list"] = only_en_man_auto_text_df["manual_caption_languages"].apply(lambda langs: langs.split(","))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_en_man_auto_text_df["download_lang"] = only_en_man_auto_text_df["man_cap_langs_list"].apply(lambda langs: rng.choice(langs) if len(langs) > 0 else langs[0])


8745843


In [87]:
view_lang_cols(only_en_man_auto_text_df, 20)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,std_man_cap_langs,man_cap_langs_list,download_lang
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en,en,[en],en
2,OREjUtUT2Sc,New Intro | I keep losing them,,en,en,en,[en],en
31,v8uKybpMJdM,Nature Moment | Common Blue Violet Flower,en,en,en,en,[en],en
32,IkiTwxRtQUk,MYO Whipped Coffee,en,en,en,en,[en],en
35,Nj4wCms-Xko,NSFW Shego (Kim Possible) Speedrun #Shorts,en,en,en,en,[en],en
37,t1EwgfdBn-0,"BMC Faces: Sarah Kimball, MD",en,en,en,en,[en],en
38,qjXauRp1uaU,Jobs for Veterans: Joseph Blansfield,en,en,en,en,[en],en
39,7TZ02EP-VaE,BMC Presents: Donate Life - Marta and Brenda's...,en,en,en,en,[en],en
41,ln8Tgp4NsOg,BMC Presents: Donate Life - Dan & Susan's Kidn...,en,en,en,en,[en],en
42,1CDSz94YRGk,"Meet: Gregory Grillone, MD",en,en,en,en,[en],en


In [90]:
rng = np.random.default_rng(42)
shuffled_ids_langs = only_en_man_auto_text_df[["id", "download_lang"]].values.tolist()
rng.shuffle(shuffled_ids_langs)
shuffled_ids_langs[:5]

[['ZigZZ7Cqvfg', 'en'],
 ['RVHmlK3mLv0', 'en-uYU-mmqFLq8'],
 ['vTfXBjO_z1E', 'en-RuL_K9RGzgg'],
 ['JTnAdquASPo', 'en'],
 ['MKof6QkOHgk', 'en']]

In [91]:
shuffled_batches = get_subsampled_batches(shuffled_ids_langs, 250)
print(len(shuffled_batches))
shuffled_batches[:5]

34984


[{'videoIds': [['ZigZZ7Cqvfg', 'en'],
   ['RVHmlK3mLv0', 'en-uYU-mmqFLq8'],
   ['vTfXBjO_z1E', 'en-RuL_K9RGzgg'],
   ['JTnAdquASPo', 'en'],
   ['MKof6QkOHgk', 'en'],
   ['0K1jOM3CHYo', 'en-US'],
   ['9sfF_2Xu7pM', 'en'],
   ['fiK9v1YI3XA', 'en-uYU-mmqFLq8'],
   ['7o8-NOxnYc4', 'en-US'],
   ['8TB_k1cpQaY', 'en-US'],
   ['5oFKx8Vt8Gs', 'en'],
   ['K5evSiB1iPY', 'en-JkeT_87f4cc'],
   ['dYvyUE3zbqs', 'en'],
   ['X5ArokuXesU', 'en-uYU-mmqFLq8'],
   ['isDBXqD0YHQ', 'en'],
   ['pI0kiftW1n8', 'en'],
   ['l2rDYpAos4Q', 'en'],
   ['XC8UxiZJPxo', 'en-US'],
   ['eE6N8MPVhRw', 'en'],
   ['muLPYSChruI', 'en-uYU-mmqFLq8'],
   ['DA4jC82hqFw', 'en-US'],
   ['JRsjuzCaX9o', 'en-uYU-mmqFLq8'],
   ['3nZWCqeQQKw', 'en'],
   ['HWyFwGtv7Tw', 'en'],
   ['QoQ12rjKDWM', 'en'],
   ['UgQ2_bGoHe4', 'en'],
   ['4_U77heiMt8', 'en-US'],
   ['mXG8k5OevHU', 'en'],
   ['dXAhK-nUQHI', 'en-PqWgEQDUTmg'],
   ['BPwI4C9ssTU', 'en-uYU-mmqFLq8'],
   ['DEHgp_OBGc4', 'en'],
   ['9L3QdbVhW1w', 'en-uYU-mmqFLq8'],
   ['rrv1xjd2D9Y',

In [93]:
with open("logs/data/download/8M_en_text_only/shuffled_batches.jsonl", "w") as f:
    for batch in shuffled_batches:
        f.write(json.dumps(batch) + "\n")

In [None]:
with open("logs/data/download/8M_en_text_only/subset_ids.txt", "w") as f:
    for video_id in only_en_man_auto_text_df["id"].values.tolist():
        f.write(video_id + "\n")

## Data Statistics

In [17]:
main_df_dedup["man_cap_langs_2"] = main_df_dedup["manual_caption_languages"]
main_df_dedup["std_man_cap_langs"] = main_df_dedup["manual_caption_languages"]
main_df_dedup["std_man_cap_langs"] = main_df_dedup["std_man_cap_langs"].apply(standardize_dialects)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df_dedup["man_cap_langs_2"] = main_df_dedup["manual_caption_languages"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df_dedup["std_man_cap_langs"] = main_df_dedup["manual_caption_languages"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df_dedup["std_man_cap_langs"] = main_df_dedu

### en-only-v2

In [None]:
condition = (
    main_df_dedup["man_cap_langs_2"]
    .str.split(",")
    .apply(lambda lst: set(lst) == {"en"})
) & (main_df_dedup["automatic_caption_orig_language"] == "en")
df_2 = main_df_dedup[condition]

In [19]:
df_2.shape[0]

9907059

In [20]:
df_2["duration"].sum() / (60 * 60)

1567039.9983333333

In [52]:
condition = (
    main_df_dedup["std_man_cap_langs"]
    .str.split(",")
    .apply(lambda lst: set(lst) == {"en"})
) & (main_df_dedup["automatic_caption_orig_language"] == "en")
df_3 = main_df_dedup[condition]

In [53]:
df_3.shape[0]

18761823

In [54]:
df_3["duration"].sum() / (60 * 60)

3369060.4058333333

In [47]:
with open("logs/data/download/2M_en/2M_download_ids.txt", "r") as f:
    subsampled_ids_2M = [line.strip().split("\t")[0] for line in f]
with open("logs/data/download/4M_en/subsampled_ids.txt", "r") as f:
    subsampled_ids_4M = [line.strip() for line in f]

with open("logs/data/download/6M_en/subsampled_ids.txt", "r") as f:
    subsampled_ids_6M = [line.strip() for line in f]
    
with open("logs/data/download/8M_en/shuffled_batches.jsonl", "r") as f:
    shuffled_batches_8M_10K = [json.loads(line.strip())["videoIds"] for line in f][:10000]
subsampled_ids_8M_10K = [tpl[0] for batch in shuffled_batches_8M_10K for tpl in batch]

In [48]:
print(f"{len(subsampled_ids_2M)=}, {len(subsampled_ids_4M)=}, {len(subsampled_ids_6M)=}, {len(subsampled_ids_8M_10K)=}")

len(subsampled_ids_2M)=2448500, len(subsampled_ids_4M)=4000000, len(subsampled_ids_6M)=6000000, len(subsampled_ids_8M_10K)=2500000


In [50]:
df_2M_mod = df_2[df_2["id"].isin(subsampled_ids_2M)]
print(f"{df_2M_mod.shape[0]=}")
df_4M_mod = df_2[df_2["id"].isin(subsampled_ids_4M)]
print(f"{df_4M_mod.shape[0]=}")
df_6M_mod = df_2[df_2["id"].isin(subsampled_ids_6M)]
print(f"{df_6M_mod.shape[0]=}")
df_8M_10K_mod = df_2[df_2["id"].isin(subsampled_ids_8M_10K)]
print(f"{df_8M_10K_mod.shape[0]=}")

df_2_mod.shape[0]=1278530
df_4M_mod.shape[0]=2116800
df_6M_mod.shape[0]=1886686
df_8M_10K_mod.shape[0]=1322858


In [43]:
with open("logs/data/download/4M_en/4M_trainable_mod.txt", "r") as f:
    trainable_ids_4M = [line.strip() for line in f]
with open("logs/data/download/6M_en/6M_trainable_mod.txt", "r") as f:
    trainable_ids_6M = [line.strip() for line in f]
with open("logs/data/download/8M_en/8M_10K_trainable_mod.txt", "r") as f:
    trainable_ids_8M_10K = [line.strip() for line in f]
with open("logs/data/download/2M_en/2M_trainable_ids.txt", "r") as f:
    trainable_ids_2M = [line.strip() for line in f]
print(f"{len(trainable_ids_4M)=}, {len(trainable_ids_6M)=}, {len(trainable_ids_8M_10K)=}, {len(trainable_ids_2M)=}")
print(f"total length: {len(trainable_ids_4M) + len(trainable_ids_6M) + len(trainable_ids_8M_10K) + len(trainable_ids_2M)}")

len(trainable_ids_4M)=1933350, len(trainable_ids_6M)=1728307, len(trainable_ids_8M_10K)=1208895, len(trainable_ids_2M)=2300152
total length: 7170704


In [44]:
temp = df_2[df_2["id"].isin(trainable_ids_2M)]
temp.shape[0]

1215816

In [46]:
all_trainable = trainable_ids_4M + trainable_ids_6M + trainable_ids_8M_10K + trainable_ids_2M
all_trainable_df = df_2[df_2["id"].isin(all_trainable)]
print(all_trainable_df.shape[0])
print(all_trainable_df["duration"].sum() / (60 * 60))

6086368
943460.7775


In [51]:
all_trainable = trainable_ids_4M + trainable_ids_6M + trainable_ids_8M_10K + trainable_ids_2M
all_trainable_df = main_df_dedup[main_df_dedup["id"].isin(all_trainable)]
print(all_trainable_df.shape[0])
print(all_trainable_df["duration"].sum() / (60 * 60))

7170704
1164319.161388889


In [37]:
len(all_trainable)

7170704

In [26]:
print(2300152 + len(trainable_ids_4M) + len(trainable_ids_6M) + len(trainable_ids_8M_10K))

7170704


In [27]:
with open("logs/data/download/4M_en/seg_fail_4M.txt", "r") as f:
    seg_fail_4M = [line.strip() for line in f]
with open("logs/data/download/6M_en/seg_fail_6M.txt", "r") as f:
    seg_fail_6M = [line.strip() for line in f]
with open("logs/data/download/8M_en/seg_fail_8M_10K.txt", "r") as f:
    seg_fail_8M_10K = [line.strip() for line in f]

In [29]:
df_seg_fail_4M_mod = df_2[df_2["id"].isin(seg_fail_4M)]
print(df_seg_fail_4M_mod.shape[0])
df_seg_fail_6M_mod = df_2[df_2["id"].isin(seg_fail_6M)]
print(df_seg_fail_6M_mod.shape[0])
df_seg_fail_8M_10K_mod = df_2[df_2["id"].isin(seg_fail_8M_10K)]
print(df_seg_fail_8M_10K_mod.shape[0])

55604
1210
33623


In [31]:
1933350 + 55604, 1728307 + 1210, 1208895 + 33623

(1988954, 1729517, 1242518)

### en-only-v1

In [22]:
condition = (
    main_df_dedup["std_man_cap_langs"]
    .str.split(",")
    .apply(lambda lst: set(lst) == {"en"})
)
df_3 = main_df_dedup[condition]

In [23]:
df_3.shape[0]

27956301

In [24]:
not_df_3_id = set(main_df_dedup["id"]) - set(df_3["id"])
len(not_df_3_id)

32141850

In [25]:
not_df_3 = main_df_dedup[main_df_dedup["id"].isin(not_df_3_id)]
not_df_3.shape[0]

32141850

In [26]:
not_df_3.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license', 'man_cap_langs_2', 'std_man_cap_langs'],
      dtype='object')

In [27]:
df_4 = not_df_3[not_df_3["std_man_cap_langs"].str.contains("en")]
df_4.shape[0]

9991767

In [28]:
df_4["language"] = df_4["language"].fillna("")
df_4["std_lang"] = df_4["language"]
df_4["std_lang"] = df_4["std_lang"].apply(standardize_dialects)
df_5 = df_4[df_4["std_lang"] == "en"]
df_5.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4["language"] = df_4["language"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4["std_lang"] = df_4["language"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4["std_lang"] = df_4["std_lang"].apply(standardize_dialects)


1933919

In [29]:
approx_en = set(df_3["id"].values.tolist() + df_5["id"].values.tolist())
len(approx_en)

29890220

In [30]:
with open("data/metadata/approx_en_relaxed.txt", "r") as f:
    temp = [line.strip() for line in f]
set(temp) - set(approx_en)

set()

In [31]:
approx_en_dur = main_df_dedup[main_df_dedup["id"].isin(approx_en)]["duration"].sum() / (60 * 60)

In [32]:
approx_en_dur

5612925.527222223

In [33]:
not_df_4_ids = set(not_df_3["id"]) - set(df_4["id"])
len(not_df_4_ids)

22150083

In [34]:
not_df_5_ids = set(df_4["id"]) - set(df_5["id"])
len(not_df_5_ids)

8057848

In [35]:
approx_non_en = list(not_df_4_ids) + list(not_df_5_ids)
len(approx_non_en)

30207931

In [36]:
approx_non_en_df = main_df_dedup[main_df_dedup["id"].isin(approx_non_en)]

In [37]:
approx_non_en_df.columns

Index(['id', 'title', 'duration', 'view_count', 'categories', 'comment_count',
       'like_count', 'channel', 'channel_id', 'channel_follower_count',
       'age_limit', 'upload_date', 'is_live', 'was_live', 'format',
       'format_id', 'language', 'filesize', 'stretched_ratio', 'width',
       'height', 'fps', 'dynamic_range', 'aspect_ratio', 'abr', 'vbr',
       'vcodec', 'asr', 'audio_channels', 'chapters',
       'manual_caption_languages', 'automatic_caption_orig_language',
       'location', 'license', 'man_cap_langs_2', 'std_man_cap_langs'],
      dtype='object')

In [38]:
approx_non_en_df[['manual_caption_languages', 'automatic_caption_orig_language', 'language']].isnull().mean() * 100

manual_caption_languages            0.000000
automatic_caption_orig_language    39.167300
language                           50.327495
dtype: float64

In [39]:
view_lang_cols(approx_non_en_df, 20)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,man_cap_langs_2,std_man_cap_langs
5,cNPxiPUdyvA,(Примьера дораммы 2021!!!),,sv,sv,sv,sv
6,2XitiLKk6rM,Анонимки в тик ток,ru,sv,ru,sv,sv
7,IPxXLQpOJRU,Marisa Marisa,,it,it,it,it
8,ug5TkquQEHo,Marisa marisa,,it,it,it,it
9,Ar-XwEnnFPY,Formation des Classe D | Serveur UPRP,fr,fr-FR,fr,fr-FR,fr
11,y9v4phX48UA,عودة ملوك العواصف الى سيرفر الجامعة/لعبة الامب...,,ar,ar,ar,ar
12,UMvzlMuBnt0,Sabuwar wakar sadiq saleh darasul Awwal,,ha,ha,ha,ha
13,hWQZq0wHIwQ,rimne - First Color / MV,,"en,ja",unknown,"en,ja","en,ja"
14,IjhDUC8nbXk,A VONTADE DE DEUS- Regis Moreira,pt,pt,pt,pt,pt
15,kKTrQhL6Zq8,"#short,आ चले ,हिंदी कविता ,#feed",,hi,hi,hi,hi


In [None]:
approx_non_en_df['man_cap_langs_count'] = approx_non_en_df['manual_caption_languages'].apply(lambda x: len(x.split(',')))

# Get the distribution of rows with 1 language, 2 languages, etc.
language_count_distribution = approx_non_en_df['man_cap_langs_count'].value_counts().sort_index()

In [41]:
language_count_distribution[:10]

man_cap_langs_count
1     20481762
2      4367971
3      1053415
4       502791
5       554458
6       394308
7       318037
8       251343
9       227189
10      205953
Name: count, dtype: int64

In [42]:
approx_non_en_1_lang_df = approx_non_en_df[approx_non_en_df["man_cap_langs_count"] == 1]

In [43]:
view_lang_cols(approx_non_en_1_lang_df, 20)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,man_cap_langs_2,std_man_cap_langs,man_cap_langs_count
5,cNPxiPUdyvA,(Примьера дораммы 2021!!!),,sv,sv,sv,sv,1
6,2XitiLKk6rM,Анонимки в тик ток,ru,sv,ru,sv,sv,1
7,IPxXLQpOJRU,Marisa Marisa,,it,it,it,it,1
8,ug5TkquQEHo,Marisa marisa,,it,it,it,it,1
9,Ar-XwEnnFPY,Formation des Classe D | Serveur UPRP,fr,fr-FR,fr,fr-FR,fr,1
11,y9v4phX48UA,عودة ملوك العواصف الى سيرفر الجامعة/لعبة الامب...,,ar,ar,ar,ar,1
12,UMvzlMuBnt0,Sabuwar wakar sadiq saleh darasul Awwal,,ha,ha,ha,ha,1
14,IjhDUC8nbXk,A VONTADE DE DEUS- Regis Moreira,pt,pt,pt,pt,pt,1
15,kKTrQhL6Zq8,"#short,आ चले ,हिंदी कविता ,#feed",,hi,hi,hi,hi,1
17,LTSQG7x7e84,רוקט ליג פרק 2 גולים מטורפים!!!!!,,iw,iw,iw,iw,1


In [44]:
approx_non_en_many_lang_df = approx_non_en_df[approx_non_en_df["man_cap_langs_count"] > 1]

In [None]:
approx_non_en_many_lang_df

## Generating CSV file mapping IDs to durations (`dur_dict`)
- This is done to add duration data to text-only JSONLs generated at downloading time

### Retrieving all relevant IDs

In [6]:
# whatever subset don't have set of known trainable ids, use set of known downloadable ids
# 6M_trainable_mod is the subset of the 6M subsampled for the 2nd round of downloading that uses the better quality English metadata conditions
id_files = ["logs/data/download/2M_en/2M_trainable_ids.txt", "logs/data/download/4M_en/subsampled_ids.txt", "logs/data/download/6M_en/6M_trainable_mod.txt", "logs/data/download/8M_en_text_only/subset_ids.txt"]
video_ids = []
for id_file in id_files:
    with open(id_file, "r") as f:
        video_ids_batch = [line.strip() for line in f]
        video_ids.extend(video_ids_batch)

In [7]:
len(video_ids)

16774302

In [27]:
dur_dict_df = main_df_dedup[main_df_dedup["id"].isin(video_ids)]

In [28]:
dur_dict_df.shape[0]

16774302

In [34]:
dur_dict_df[["id", "duration"]].to_parquet("logs/data/download/8M_en_text_only/dur_dict.parquet", index=False)

In [35]:
df = pd.read_parquet('logs/data/download/8M_en_text_only/dur_dict.parquet')

# Ensure there are at least two columns
dur_dict = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))

In [36]:
dur_dict

{'LwJv3vSd8no': 372,
 '3TwThCtZGVI': 313,
 'XXjYyBCNJEY': 293,
 'fG3DqJ_YqAs': 781,
 'OREjUtUT2Sc': 11,
 'sndzyEYbmTA': 112,
 'v8uKybpMJdM': 122,
 'IkiTwxRtQUk': 88,
 'VvPE3uB7O1A': 121,
 'Nj4wCms-Xko': 19,
 '8SnK7mWezWw': 59,
 't1EwgfdBn-0': 58,
 'qjXauRp1uaU': 57,
 '7TZ02EP-VaE': 60,
 'lVBVy0EY0I4': 60,
 'ln8Tgp4NsOg': 60,
 '1CDSz94YRGk': 112,
 'Dvyxal7xrfI': 277,
 'X27WzfT3NGA': 46,
 'ybPciSw7vyE': 55,
 'kWpOGMfmyFI': 831,
 '4H2n-qfwvZ8': 310,
 'vDAana0qFmU': 163,
 '0Yt4JoCW8oU': 1233,
 'orWh_OLaRg4': 77,
 'RHK7gjNfuZ4': 1821,
 'P4xDdWmDdsY': 1453,
 'Iis9VJXgzKI': 1107,
 'o9x7TM75yhM': 57,
 'QC3enpDiB1A': 69,
 'k7lN1DCopXM': 59,
 't224ya1UUz8': 90,
 'ZmrcKeD4Hqk': 667,
 'HqO20KiYMAM': 449,
 'XuAsD43fbSU': 1037,
 '3Iasm0Dv3bc': 616,
 'jMT8OdDL_hE': 503,
 'SVd654PgTp4': 716,
 'RMlWF2KOC_4': 708,
 '3Mr8lg5kvIU': 906,
 'Hd5B3JgJ_w0': 637,
 'KpHg0kxFtMo': 383,
 'qnfwM7F-q_Y': 525,
 '-oyDwlEwxnU': 659,
 '_VxLBZkZRek': 366,
 'EOI4kjC34LQ': 343,
 'gPgI_OsFk0g': 37,
 '7Bj1BbavrMo': 286,
 'y3

In [37]:
view_lang_cols(dur_dict_df, 5)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language
0,LwJv3vSd8no,Free Art Wednesday!,en,en,en
1,3TwThCtZGVI,Etsy Commission,en,en,en
2,XXjYyBCNJEY,FREE Art Wednesday,,en,en
3,fG3DqJ_YqAs,PANGANDARAN SAAT INI | DRONE VIEW | DISAAT MALAM,en,en,en
4,OREjUtUT2Sc,New Intro | I keep losing them,,en,en


In [41]:
dur_dict_df[["id", "duration"]].to_parquet("logs/data/download/8M_en_text_only/dur_dict.parquet", index=False, compression="gzip")

## Debug

### Debugging (part 1)

In [17]:
main_df_dedup["std_man_cap_langs"] = main_df_dedup["manual_caption_languages"]
main_df_dedup["std_man_cap_langs"] = main_df_dedup["std_man_cap_langs"].apply(standardize_dialects)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df_dedup["std_man_cap_langs"] = main_df_dedup["manual_caption_languages"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df_dedup["std_man_cap_langs"] = main_df_dedup["std_man_cap_langs"].apply(standardize_dialects)


In [18]:
condition = (
    main_df_dedup["std_man_cap_langs"]
    .str.split(",")
    .apply(lambda lst: set(lst) == {"en"})
) & (main_df_dedup["automatic_caption_orig_language"] == "en")
only_en_man_auto = main_df_dedup[condition]

In [19]:
with open("logs/data/download/2M_en/2M_download_ids.txt", "r") as f:
    id_downloaded_2M = [line.strip().split("\t")[0] for line in f]

In [20]:
with open("logs/data/download/6M_en/subsampled_ids.txt", "r") as f:
    id_downloaded_6M = [line.strip() for line in f]

In [22]:
all_ids = only_en_man_auto["id"]

In [23]:
set(id_downloaded_2M) - set(all_ids)

set()

In [24]:
with open("logs/data/download/4M_en/subsampled_ids.txt", "r") as f:
    id_downloaded_4M = [line.strip() for line in f]

In [25]:
set(id_downloaded_4M) - set(all_ids)

set()

In [26]:
len(set(id_downloaded_4M) - set(id_downloaded_6M))

4000000

In [27]:
with open("logs/data/download/6M_en/6M_trainable.txt", "r") as f:
    id_trainable_6M = [line.strip() for line in f]

In [28]:
len(id_trainable_6M)

5284868

In [None]:
df_1 = main_df_dedup[main_df_dedup['id'].isin(id_trainable_6M)]

In [30]:
df_1["man_cap_langs_2"] = df_1["manual_caption_languages"]
condition = (
        df_1["man_cap_langs_2"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    ) & (df_1["automatic_caption_orig_language"] == "en")
df_2 = df_1[condition]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1["man_cap_langs_2"] = df_1["manual_caption_languages"]


In [31]:
df_2.shape[0]

1728307

In [32]:
df_1["std_man_cap_langs"] = df_1["manual_caption_languages"]
df_1["std_man_cap_langs"] = df_1["std_man_cap_langs"].apply(standardize_dialects)
condition = (
        df_1["std_man_cap_langs"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    ) & (df_1["automatic_caption_orig_language"] == "en")
df_3 = df_1[condition]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1["std_man_cap_langs"] = df_1["manual_caption_languages"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1["std_man_cap_langs"] = df_1["std_man_cap_langs"].apply(standardize_dialects)


In [33]:
df_3.shape

(3191446, 36)

In [34]:
df_3.shape[0]

3191446

In [17]:
with open("logs/data/download/4M_en/4M_trainable.txt", "r") as f:
    id_trainable_4M = [line.strip().split(" ")[0] for line in f]

In [18]:
df_4 = main_df_dedup[main_df_dedup['id'].isin(id_trainable_4M)]

In [19]:
df_4["man_cap_langs_2"] = df_4["manual_caption_languages"]
condition = (
        df_4["man_cap_langs_2"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    ) & (df_4["automatic_caption_orig_language"] == "en")
df_5 = df_4[condition]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_4["man_cap_langs_2"] = df_4["manual_caption_languages"]


In [20]:
df_5.shape[0]

1933350

In [22]:
df_5["manual_caption_languages"].unique()

array(['en'], dtype=object)

In [23]:
with open("logs/data/download/4M_en/4M_trainable_mod.txt", "w") as f:
    for video_id in df_5["id"].values.tolist():
        f.write(video_id + "\n")

### Debugging (part 2)

In [24]:
with open("logs/data/download/4M_en/4M_trainable_bad.txt", "r") as f:
    id_trainable_4M_bad = [line.strip().split(" ")[0] for line in f]

In [26]:
with open("logs/data/download/4M_en/shuffled_subsampled_batches.jsonl", "r") as f:
    ids_lang_tpl_list = [json.loads(line.strip())["videoIds"] for line in f]

In [27]:
ids_lang_tpl_list[0]

[['CpqFbRHQ4u0', 'en-SUqXwpdmyfE'],
 ['acHsTL-bGE0', 'en-nP7-2PuUl7o'],
 ['XlzA78sVUcw', 'en'],
 ['ZupL8HxoSRA', 'en'],
 ['jCvdxiTm6qM', 'en-uYU-mmqFLq8'],
 ['yiWXOwrVITY', 'en-US'],
 ['j9l_OFYuDtM', 'en-uYU-mmqFLq8'],
 ['UoOMKnzYcNU', 'en'],
 ['yx7e8Oi-8dQ', 'en'],
 ['-esQO8EqlGc', 'en'],
 ['yGrXM9v2JLQ', 'en'],
 ['aYMOIrScCg0', 'en-uYU-mmqFLq8'],
 ['BZJ4yMKK96s', 'en'],
 ['9Ud1oYgf-Gk', 'en'],
 ['Zl2_koMI5ns', 'en'],
 ['CSV1teyQKj8', 'en-uYU-mmqFLq8'],
 ['8xBhsHbEfbo', 'en-US'],
 ['2GNcGt9mFQs', 'en-nP7-2PuUl7o'],
 ['A4wjXqyMCtU', 'en'],
 ['5RnJ8Qs7xzM', 'en'],
 ['0t0FfURt0Xs', 'en-US'],
 ['XcOSh2hF1nw', 'en-US'],
 ['i-reXVO2FpY', 'en'],
 ['9WAMMi_yIzI', 'en-uYU-mmqFLq8'],
 ['BYPASyonibk', 'en-US'],
 ['cI1viD2hiLI', 'en-LUU0EuDKgKo'],
 ['37f_qFubHAM', 'en-US'],
 ['rE0kUQwqwv4', 'en'],
 ['cvvoZ9Or0xo', 'en-uYU-mmqFLq8'],
 ['6lBZ4eUUA-w', 'en'],
 ['d8qUCEY__Ys', 'en-uYU-mmqFLq8'],
 ['iyUm25vNQ9A', 'en-US'],
 ['dPhbUavjSKc', 'en'],
 ['E_VQwBN5XSI', 'en-GB'],
 ['qRU1dWQlO4s', 'en-US'],
 

In [28]:
from itertools import chain
ids_lang_tpl_list = list(chain(*ids_lang_tpl_list))
len(ids_lang_tpl_list)

4000000

In [36]:
id_to_lang = {}
for tpl in ids_lang_tpl_list:
    id_to_lang[tpl[0]] = tpl[1]

In [39]:
id_to_lang_bad_4M = {}
for id in id_trainable_4M_bad:
    id_to_lang_bad_4M[id] = id_to_lang[id]

In [40]:
len(id_to_lang_bad_4M)

1636616

In [41]:
id_to_lang_bad_4M

{'gARhAiWfgQk': 'en-uYU-mmqFLq8',
 'Imj66qr5GKc': 'en-nP7-2PuUl7o',
 '7D2Ar1Sfl90': 'en-uYU-mmqFLq8',
 'OitkJWm2bII': 'en-vlW2OHFw3ww',
 '1CSiCftovHo': 'en-8we0KTnfFCk',
 'r6o_JVqaJgc': 'en-nP7-2PuUl7o',
 'xuUq8srwES0': 'en-US',
 '-IGoZ35FHDE': 'en-uYU-mmqFLq8',
 'WE_pi17usnQ': 'en-uYU-mmqFLq8',
 'IaJYMIeGq9E': 'en-U5ILRQKRVmc',
 '4q-BByqHORA': 'en-DgTNZJhXds8',
 '-7jfsU3Kswo': 'en-nP7-2PuUl7o',
 'VXru3bhJRK4': 'en-nP7-2PuUl7o',
 '8D2m38q5JLU': 'en-uYU-mmqFLq8',
 'i4-1sZUzHJQ': 'en-uYU-mmqFLq8',
 'eDuLdz-dD0Q': 'en-uYU-mmqFLq8',
 'uzlq2TiVz2g': 'en-US',
 'qO7VucjhcAo': 'en-nP7-2PuUl7o',
 '8JUPqtkzR60': 'en-GB',
 'FjKEpQuII5k': 'en-nP7-2PuUl7o',
 'QgoOfCxKkgc': 'en-GB',
 'e8nXqz5D5-I': 'en-nP7-2PuUl7o',
 '_X7CqCNHdY0': 'en-US',
 'b6bd7leQvz4': 'en-US',
 'CZtOR4_F7fA': 'en-uYU-mmqFLq8',
 'PxVj7FnCVnQ': 'en-j3PyPqV-e1s',
 '3GejB11e9Zg': 'en-uYU-mmqFLq8',
 'UB4YY5asVfc': 'en-nP7-2PuUl7o',
 'ige9pq_mTkc': 'en-uYU-mmqFLq8',
 '61oqInPEBPM': 'en-uYU-mmqFLq8',
 'Z2LE_bHQKvs': 'en-CA',
 '66N2zXN

In [43]:
bad_4M_df = df_4[df_4["id"].isin(id_trainable_4M_bad)]
bad_4M_df.shape[0]

1636616

In [45]:
view_lang_cols(bad_4M_df, 30)

Unnamed: 0,id,title,language,manual_caption_languages,automatic_caption_orig_language,man_cap_langs_2
61,orWh_OLaRg4,All Goals | Final Copa America | Argentina (1)...,en,"en-JkeT_87f4cc,en-uYU-mmqFLq8",en,"en-JkeT_87f4cc,en-uYU-mmqFLq8"
106,y3o-baXQT0Q,VLOG #1 - Introduction,en,en-US,en,en-US
146,zHGwE0ritgk,Let's Play Minecraft: Greatest freakout ever,en,en-18WT74-rBWA,en,en-18WT74-rBWA
288,5GPxO1AEAwc,KC & HR COLLEGE 2021 STEP BY STEP ADMISSION PR...,en,en-IN,en,en-IN
498,KWszRt6gKW8,Mr Indestructable,en,en-uYU-mmqFLq8,en,en-uYU-mmqFLq8
677,9qBidHh1weI,Manchester bin collection and street cleaning ...,en,en-GB,en,en-GB
812,xMj6WpM8XeQ,When Time Management,en,en-uYU-mmqFLq8,en,en-uYU-mmqFLq8
814,CnVil18-WLY,He Won't Work on Your Marriage,en,en-uYU-mmqFLq8,en,en-uYU-mmqFLq8
826,3nqTJ-U4kN8,Power of Saying No | 8- Priorities for Life,en,en-uYU-mmqFLq8,en,en-uYU-mmqFLq8
843,6gniitAi2MY,Mirror Intro,,en-cvfXDfbeED0,en,en-cvfXDfbeED0


In [54]:
condition = (bad_4M_df["manual_caption_languages"].str.split(",").apply(lambda lst: len(lst) == 1))
bad_4M_df_2 = bad_4M_df[condition]
bad_4M_df_2.shape[0]

1237771

In [55]:
bad_4M_df_2["manual_caption_languages"].unique().tolist()

['en-US',
 'en-18WT74-rBWA',
 'en-IN',
 'en-uYU-mmqFLq8',
 'en-GB',
 'en-cvfXDfbeED0',
 'en-US-anMIXjicSL4',
 'en-s8TMdDjdYe8',
 'en-nP7-2PuUl7o',
 'en-CA',
 'en-US-YfeEIUII1AU',
 'en-ehkg1hFWq8A',
 'en-anMIXjicSL4',
 'en-1oAFzPNiuC0',
 'en-fGGqjO4aYg8',
 'en-IE',
 'en-rX01gsAdUfU',
 'en-Uym2X1t3MTA',
 'en-8k-bGVCfGYE',
 'en-MPV48gM7bqQ',
 'en-LUU0EuDKgKo',
 'en-6UJrWS5jR_I',
 'en-tyWfmtkyBQo',
 'en-SJ_OorK2o0k',
 'en-gG13fOM8Dfs',
 'en-eEY6OEpapPo',
 'en-zOnwy86eUkk',
 'en-Sxg7wWGnGNg',
 'en-DvHdX2kI9hs',
 'en-fcVymTUPpeI',
 'en-H_h6RMPrfCg',
 'en-jFK5kpDRP3M',
 'en-v9B2qqWVzdQ',
 'en-FetHzM0RQJU',
 'en-dBnIPyQn-7E',
 'en-UMYBvvsXfqo',
 'en-j3PyPqV-e1s',
 'en-qlPKC2UN_YU',
 'en-US-qhidoPi0w8c',
 'en-0GZSgHK00aQ',
 'en-6ebHVfu6BsQ',
 'en-ovhHx95i6Ho',
 'en-0hllRZe4s5s',
 'en-zL7EZQmMa4Q',
 'en-5iccchR9oVs',
 'en-GB-anMIXjicSL4',
 'en-yR6JfUoAWYg',
 'en-xWCTb4ASxWo',
 'en-QLlkAr8DpK0',
 'en-fBJ7WLCgqx8',
 'en-GtDsi6WLVPc',
 'en-VqxSfNneakA',
 'en-yd_sspvDACE',
 'en-6DGfwZopUls',
 'en-Qi

In [57]:
bad_4M_df_2["manual_caption_languages"].value_counts()[:50]

manual_caption_languages
en-US                340387
en-uYU-mmqFLq8       260000
en-GB                143094
en-nP7-2PuUl7o       126966
en-ehkg1hFWq8A        60954
en-IN                 26036
en-CA                 25484
en-eEY6OEpapPo        18269
en-j3PyPqV-e1s        13300
en-1oAFzPNiuC0         8244
en-LUU0EuDKgKo         5574
en-qlPKC2UN_YU         4898
en-hnCuwW0TCiQ         4627
en-rX01gsAdUfU         4440
en-RTbB2cpHawQ         4290
en-qoXxhA4oLYo         3708
en-VjEpQ-RlAYk         3673
en-US-eEY6OEpapPo      2970
en-i7LFIGUQP0g         2840
en-Q8VSUiGLCHE         2537
en-zL7EZQmMa4Q         2496
en-0hllRZe4s5s         2365
en-vlW2OHFw3ww         2329
en-IE                  2296
en-Jyun0WSqg2k         2097
en-US-cvfXDfbeED0      1865
en-6UJrWS5jR_I         1649
en-cvfXDfbeED0         1623
en-anMIXjicSL4         1347
en-dQs7zDoAYDs         1288
en-US-dOaosRHqfaE      1197
en-US-zOFsReYi2c4       993
en-UMYBvvsXfqo          922
en-E3euOu7LhIg          884
en-n7b14AlKMnA         

In [66]:
bad_4M_df_2["manual_caption_languages"].value_counts()[:8]

manual_caption_languages
en-US             340387
en-uYU-mmqFLq8    260000
en-GB             143094
en-nP7-2PuUl7o    126966
en-ehkg1hFWq8A     60954
en-IN              26036
en-CA              25484
en-eEY6OEpapPo     18269
Name: count, dtype: int64

In [65]:
sum(bad_4M_df_2["manual_caption_languages"].value_counts()[:8])

1001190

In [68]:
accepted_langs = {"en-US", "en-uYU-mmqFLq8", "en-GB", "en-nP7-2PuUl7o", "en-ehkg1hFWq8A", "en-IN", "en-CA", "en-eEY6OEpapPo"}
condition = (bad_4M_df["manual_caption_languages"].str.split(",").apply(lambda lst: (len(lst) == 1) & (lst[0] in accepted_langs)))
bad_4M_df_3 = bad_4M_df[condition]
bad_4M_df_3.shape[0]

1001190

In [69]:
with open("logs/data/download/4M_en/4M_trainable_mod_2.txt", "w") as f:
    for video_id in bad_4M_df_3["id"].values.tolist():
        f.write(video_id + "\n")

In [71]:
sum(bad_4M_df_2["manual_caption_languages"].value_counts()[:4])

870447

In [72]:
accepted_langs = {"en-US", "en-uYU-mmqFLq8", "en-GB", "en-nP7-2PuUl7o"}
condition = (bad_4M_df["manual_caption_languages"].str.split(",").apply(lambda lst: (len(lst) == 1) & (lst[0] in accepted_langs)))
bad_4M_df_3 = bad_4M_df[condition]
bad_4M_df_3.shape[0]

870447

In [73]:
with open("logs/data/download/4M_en/4M_trainable_mod_2.txt", "w") as f:
    for video_id in bad_4M_df_3["id"].values.tolist():
        f.write(video_id + "\n")

In [None]:
accepted_langs = {"en-US", "en-GB"}
condition = (bad_4M_df["manual_caption_languages"].str.split(",").apply(lambda lst: (len(lst) == 1) & (lst[0] in accepted_langs)))
bad_4M_df_3 = bad_4M_df[condition]
bad_4M_df_3.shape[0]

508965

In [76]:
with open("logs/data/download/4M_en/4M_trainable_mod_2.txt", "w") as f:
    for video_id in bad_4M_df_3["id"].values.tolist():
        f.write(video_id + "\n")

### Debugging (part 3)

In [77]:
with open("logs/data/download/6M_en/6M_trainable.txt", "r") as f:
    id_trainable_6M = [line.strip() for line in f]

with open("logs/data/download/6M_en/6M_trainable_mod.txt", "r") as f:
    id_trainable_6M_mod = [line.strip() for line in f]

In [78]:
len(id_trainable_6M), len(id_trainable_6M_mod)

(5284868, 1728307)

In [79]:
id_trainable_6M_bad = set(id_trainable_6M) - set(id_trainable_6M_mod)
len(id_trainable_6M_bad)

3556561

In [82]:
trainable_6M_bad_df = main_df_dedup[main_df_dedup["id"].isin(id_trainable_6M_bad)]

In [83]:
condition = (trainable_6M_bad_df["manual_caption_languages"].str.split(",").apply(lambda lst: len(lst) == 1))
trainable_6M_bad_df_2 = trainable_6M_bad_df[condition]
trainable_6M_bad_df_2.shape[0]

2739853

In [85]:
trainable_6M_bad_df_2["manual_caption_languages"].value_counts()[:50]

manual_caption_languages
en                   1019176
en-US                 550734
en-uYU-mmqFLq8        281748
en-GB                 247014
en-nP7-2PuUl7o        139677
en-IN                  84448
en-ehkg1hFWq8A         61016
en-CA                  37492
en-RTbB2cpHawQ         23390
en-eEY6OEpapPo         19557
en-j3PyPqV-e1s         14124
en-1oAFzPNiuC0         10235
en-rX01gsAdUfU          8920
en-LUU0EuDKgKo          7493
en-Q8VSUiGLCHE          6241
en-E3euOu7LhIg          6163
en-qlPKC2UN_YU          5275
en-0hllRZe4s5s          4956
en-qoXxhA4oLYo          4096
en-hnCuwW0TCiQ          4078
en-zL7EZQmMa4Q          3564
en-VjEpQ-RlAYk          3399
en-IE                   3246
en-i7LFIGUQP0g          3086
en-US-eEY6OEpapPo       2890
en-_yASqOZhnrs          2611
en-vlW2OHFw3ww          2327
en-6UJrWS5jR_I          2253
en-anMIXjicSL4          2190
en-Jyun0WSqg2k          1947
en-UMYBvvsXfqo          1927
en-US-cvfXDfbeED0       1874
en-cvfXDfbeED0          1558
en-dQs7zDoAYDs    

In [86]:
condition = (trainable_6M_bad_df["manual_caption_languages"].str.split(",").apply(lambda lst: len(lst) == 1)) & (trainable_6M_bad_df["automatic_caption_orig_language"] == "en")
trainable_6M_bad_df_2 = trainable_6M_bad_df[condition]
trainable_6M_bad_df_2.shape[0]

1106647

In [87]:
trainable_6M_bad_df_2["manual_caption_languages"].value_counts()[:50]

manual_caption_languages
en-US                304049
en-uYU-mmqFLq8       232641
en-GB                127683
en-nP7-2PuUl7o       113267
en-ehkg1hFWq8A        54606
en-IN                 23053
en-CA                 22747
en-eEY6OEpapPo        16221
en-j3PyPqV-e1s        11962
en-1oAFzPNiuC0         7446
en-LUU0EuDKgKo         5024
en-qlPKC2UN_YU         4491
en-hnCuwW0TCiQ         4024
en-rX01gsAdUfU         3899
en-RTbB2cpHawQ         3820
en-VjEpQ-RlAYk         3399
en-qoXxhA4oLYo         3263
en-i7LFIGUQP0g         2610
en-US-eEY6OEpapPo      2565
en-zL7EZQmMa4Q         2326
en-Q8VSUiGLCHE         2263
en-0hllRZe4s5s         2209
en-vlW2OHFw3ww         2124
en-IE                  1985
en-Jyun0WSqg2k         1764
en-US-cvfXDfbeED0      1661
en-6UJrWS5jR_I         1500
en-cvfXDfbeED0         1394
en-dQs7zDoAYDs         1203
en-anMIXjicSL4         1192
en-US-dOaosRHqfaE      1066
en-US-zOFsReYi2c4       920
en-UMYBvvsXfqo          835
en-E3euOu7LhIg          818
en-GHEw9DUond8         

In [89]:
accepted_langs = {"en-US", "en-GB"}
condition = (trainable_6M_bad_df["manual_caption_languages"].str.split(",").apply(lambda lst: (len(lst) == 1) & (lst[0] in accepted_langs)) & (trainable_6M_bad_df["automatic_caption_orig_language"] == "en")) 
trainable_6M_bad_df_3 = trainable_6M_bad_df[condition]
trainable_6M_bad_df_3.shape[0]

431732

In [90]:
with open("logs/data/download/6M_en/6M_trainable_mod_2.txt", "w") as f:
    for video_id in trainable_6M_bad_df_3["id"].values.tolist():
        f.write(video_id + "\n")

### Debugging (part 4)

In [28]:
with open("logs/data/download/8M_en/8M_10K_trainable.txt", "r") as f:
    id_trainable_8M_10K = [line.strip().split("\t")[0] for line in f]
len(id_trainable_8M_10K)

2231027

In [29]:
df_1 = main_df_dedup[main_df_dedup['id'].isin(id_trainable_8M_10K)]

In [30]:
df_1["man_cap_langs_2"] = df_1["manual_caption_languages"]
condition = (
        df_1["man_cap_langs_2"]
        .str.split(",")
        .apply(lambda lst: set(lst) == {"en"})
    ) & (df_1["automatic_caption_orig_language"] == "en")
df_2 = df_1[condition]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1["man_cap_langs_2"] = df_1["manual_caption_languages"]


In [31]:
df_2.shape[0]

1208895

In [32]:
with open("logs/data/download/8M_en/8M_10K_trainable_mod.txt", "w") as f:
    for video_id in df_2["id"].values.tolist():
        f.write(video_id + "\n")