In [5]:
import numpy as np
import pandas as pd
import joblib
import os
import requests
import json
import time
import concurrent.futures
import glob
from tqdm import tqdm
from pytube import YouTube
import cv2

In [6]:
details = glob.glob("details/*.pkl")

In [7]:
data_list = []

In [8]:
for i, pkl in tqdm(enumerate(details), total=len(details)):
    data = joblib.load(pkl)
    data_list.append(data)

100%|██████████| 453/453 [00:02<00:00, 157.42it/s]


In [9]:
df = pd.DataFrame.from_dict(data_list, orient='columns')

In [10]:
df.columns.to_list()

['id',
 'sku',
 'type',
 'sex',
 'color',
 'breed',
 'feed',
 'age_in_month',
 'slides',
 'sale_offer_percentage',
 'feedlot_in_month',
 'teeth',
 'parts_available',
 'height_in_inch',
 'youtube_slug',
 'weight_in_kg',
 'price',
 'is_special',
 'is_dewormed',
 'is_fmd_vaccinated',
 'is_anthrax_vaccinated',
 'is_lumpy_skin_disease',
 'size',
 'thumbnail',
 'created_at',
 'status']

In [11]:
columns = ['sku', 'type', 'sex', 'color', 'breed', 'feed', 'age_in_month', 'feedlot_in_month', 'teeth', 'height_in_inch', 'youtube_slug', 'weight_in_kg', 'price', 'size']
df = df.loc[:, columns]

In [12]:
df.head()

Unnamed: 0,sku,type,sex,color,breed,feed,age_in_month,feedlot_in_month,teeth,height_in_inch,youtube_slug,weight_in_kg,price,size
0,BLF2002,COW,MALE_BULL,NON_RED,LOCAL,"[JUMBOO, LUCERNE, NAPIER, SILAGE, STRAW, CORN,...",2.5 Years,3 Months,4,51.0,I0FFZuejRlo?rel=0,296.0,125000.0,LARGE
1,BLF2003,COW,MALE_BULL,RED,LOCAL,"[JUMBOO, LUCERNE, NAPIER, SILAGE, STRAW, CORN,...",2 Years,3 Months,2,44.0,TZREqa_GhBU,212.0,92000.0,MINIMUM
2,BLF2004,COW,MALE_BULL,RED,LOCAL,"[JUMBOO, LUCERNE, NAPIER, SILAGE, STRAW, CORN,...",2 Years,3 Months,2,43.5,0gab9HKzRSQ,202.0,88000.0,MINIMUM
3,BLF2005,COW,MALE_BULL,NON_RED,LOCAL,"[JUMBOO, LUCERNE, NAPIER, SILAGE, STRAW, CORN,...",2 Years,3 Months,2,45.0,jeBvgtUPias,205.0,88000.0,MEDIUM
4,BLF2008,COW,MALE_BULL,RED,RED_CHITTAGONG,"[JUMBOO, LUCERNE, NAPIER, SILAGE, STRAW, CORN,...",2 Years,3 Months,2,43.2,VsfL3AENFnU,215.0,95000.0,MINIMUM


In [13]:
df_sku_slug = df.loc[:, ["sku", "youtube_slug"]]
df_sku_slug.head()

Unnamed: 0,sku,youtube_slug
0,BLF2002,I0FFZuejRlo?rel=0
1,BLF2003,TZREqa_GhBU
2,BLF2004,0gab9HKzRSQ
3,BLF2005,jeBvgtUPias
4,BLF2008,VsfL3AENFnU


In [14]:
len(df_sku_slug.loc[:, "youtube_slug"].values)

453

In [15]:
sku_slug_list = []
for index, row in df_sku_slug.iterrows():
    sku_slug_list.append((row['sku'], row['youtube_slug']))

In [16]:
sku_slug_list[0]

('BLF2002', 'I0FFZuejRlo?rel=0')

In [36]:
existed = 0
downloaded = 0

In [37]:
apply_tuple = lambda f: lambda args: f(*args)

def sku2filename(sku):
    splitted_list = list(sku)
    file_name = ''.join(splitted_list[:3]) + ' ' + ''.join(splitted_list[3:]) + '.mp4'
    return file_name

@apply_tuple
def yt2imgs(sku, slug):
    url = f'https://www.youtube.com/watch?v={slug}'
    path = f'yt_videos/{sku}'
    if not os.path.exists(path):
        os.mkdir(path)
    if len(os.listdir(path)) == 1 or os.listdir(path)[0] == sku2filename(sku):
        # print(f"Video already existed for sku: {sku}...")
        global existed
        existed += 1
        return None
    try:
        YouTube(url).streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(path)
        # print(f"Downloaded the video for sku: {sku}...")
        global downloaded
        downloaded += 1
    except Exception as e:
        print(f'Got exception for {url}, exception: {e}')


In [38]:
import time
t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(yt2imgs, sku_slug_list)

t2 = time.perf_counter()

print("========================================")
print(f"Finished in {t2-t1} seconds")
print(f"Total downloaded => {downloaded}")
print(f"Total existed => {existed}")
print("========================================")

Finished in 0.05359619999990173 seconds
Total downloaded => 0
Total existed => 449


In [42]:
yt_video_path_list = []
for dirname, _, filenames in os.walk("yt_videos"):
    for filename in filenames:
        yt_video_path_list.append(os.path.join(dirname, filename))

In [44]:
len(yt_video_path_list)

449

In [46]:
yt_video_path_list[:10]

['yt_videos\\BLF2002\\BLF 2002.mp4',
 'yt_videos\\BLF2003\\BLF 2003.mp4',
 'yt_videos\\BLF2004\\BLF 2004.mp4',
 'yt_videos\\BLF2005\\BLF 2005.mp4',
 'yt_videos\\BLF2008\\BLF 2008.mp4',
 'yt_videos\\BLF2009\\BLF 2009.mp4',
 'yt_videos\\BLF2010\\BLF 2010.mp4',
 'yt_videos\\BLF2011\\BLF 2011.mp4',
 'yt_videos\\BLF2012\\BLF 2012.mp4',
 'yt_videos\\BLF2014\\BLF 2014.mp4']