## Mannequin Challenge Dataset Downloader
This notebook downloads the videos of the Mannequin Challenge Dataset. Written by Myeong-Gyu.Lee

* Reference: https://blog.naver.com/PostView.nhn?blogId=skyshin0304&logNo=221620513883&proxyReferer=https:%2F%2Fwww.google.com%2F

Run `pip install pytube3` to install the pytube library.

### Test single video download

In [None]:
from pytube import YouTube
from tqdm import tqdm
import os, cv2, shutil, math, datetime, ast
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [None]:
# Fetch the metadata of one sample video and print a short summary of it.
_url = 'https://www.youtube.com/watch?v=KMtgexdtcGY'
yt = YouTube(_url)

# (label, attribute name) pairs; each attribute is read right before it is
# printed, so the output order matches the order of this table.
_summary_fields = (
    ("영상 제목 :", "title"),
    ("영상 길이 :", "length"),
    ("영상 평점 :", "rating"),
    ("영상 썸네일 링크: ", "thumbnail_url"),
    ("영상 조회수 :", "views"),
    ("영상 설명 :", "description"),
)
for _label, _attr in _summary_fields:
    print(_label, getattr(yt, _attr))

In [None]:
# Print every stream reported for the sample video together with its position.
_yt_streams = yt.streams
print("다운가능한 영상 상세 정보 :")
_all_streams = _yt_streams.all()
for idx in range(len(_all_streams)):
    print(idx, " : ", _all_streams[idx])

### Build the dataframe and sort by `res` to get the highest-resolution video.
Only `video/mp4` rows are kept.

In [None]:
def get_download_info(yt_streams, video_url):
    """Tabulate the streams of one video and pick the highest-resolution mp4.

    The textual representation of every stream (for example
    ``<Stream: itag="18" mime_type="video/mp4" res="360p" ...>``) is split into
    ``key -> value`` pairs, giving one table row per stream. Rows without a
    usable numeric resolution are dropped, the rest is sorted by resolution
    (highest first) and restricted to ``video/mp4``.

    Parameters
    ----------
    yt_streams : stream query object
        Must offer ``get_by_itag(itag)`` and either an ``all()`` method or
        plain iteration. Presumably the ``streams`` attribute of a pytube
        ``YouTube`` object -- confirm against the installed pytube version.
    video_url : str
        URL of the video; the text after the last ``/`` and the last ``=`` is
        used as the video ID and becomes the index of the returned table.

    Returns
    -------
    tuple
        ``(stream_df_global, highest_stream)``: the filtered, sorted DataFrame
        (``itag`` and ``res`` as integers) and whatever
        ``yt_streams.get_by_itag`` returns for the itag of its first row.

    Raises
    ------
    ValueError
        If no stream could be parsed, or if none of them is a ``video/mp4``
        stream with a numeric resolution.
    """
    # Prefer all() when the query object has it; otherwise treat the query
    # object itself as an iterable of streams.
    if hasattr(yt_streams, 'all'):
        streams = yt_streams.all()
    else:
        streams = list(yt_streams)

    records = []
    for stream in streams:
        try:
            stream_str = str(stream)
            stream_elements = (
                stream_str.replace('Stream: ', '')
                .replace('=', ':')
                .replace('<', '')
                .replace('>', '')
                .replace('"', '')
                .split(' ')
            )
            stream_dict = {}
            for element in stream_elements:
                if not element:
                    # Repeated spaces would otherwise create an empty column.
                    continue
                parts = element.split(':')
                stream_dict[parts[0]] = parts[-1]
            records.append(stream_dict)
        except Exception as exc:
            # Best effort: one unreadable stream must not abort the whole video.
            print("Skipping a stream whose description could not be parsed: {!r}".format(exc))

    if not records:
        raise ValueError("No streams found for {}".format(video_url))

    # Build the table once from all records instead of concatenating many
    # one-row frames.
    stream_df_global = pd.DataFrame(records)
    stream_df_global['videoID'] = str(video_url.split('/')[-1].split('=')[-1])
    stream_df_global = stream_df_global.set_index('videoID')

    # Audio-only results have no 'res' column at all.
    missing = {'itag', 'res', 'mime_type'} - set(stream_df_global.columns)
    if missing:
        raise ValueError(
            "No video stream with a resolution found for {}".format(video_url)
        )

    # Copy after filtering so the column assignment below works on an
    # independent frame rather than on a slice of the unfiltered one.
    stream_df_global = stream_df_global[pd.notnull(stream_df_global['res'])].copy()
    res_digits = stream_df_global['res'].str.replace(r'[A-Za-z]', '', regex=True)
    # '360p' -> 360; values without digits (e.g. 'None') become NaN and are dropped.
    stream_df_global['res'] = pd.to_numeric(res_digits, errors='coerce')
    stream_df_global = stream_df_global.dropna(subset=['res'])
    stream_df_global = stream_df_global.astype({'itag': int, 'res': int})
    stream_df_global = stream_df_global.sort_values(by='res', ascending=False)
    stream_df_global = stream_df_global[stream_df_global['mime_type'] == 'video/mp4']

    if stream_df_global.empty:
        raise ValueError("No video/mp4 stream found for {}".format(video_url))

    # Use the itag of the first (highest-resolution) row to fetch the stream.
    highest_stream = yt_streams.get_by_itag(int(stream_df_global.iloc[0]['itag']))

    return stream_df_global, highest_stream

In [None]:
# Run the selection on the sample video; the cell's last expression shows the
# first rows of the resulting table.
stream_df_global, highest_stream = get_download_info(
    yt_streams=_yt_streams,
    video_url=_url,
)
print(f'Highest resolution stream info: {highest_stream}')
stream_df_global.head(n=3)

In [None]:
# Report which stream was selected. The download call itself is left
# commented out in this test cell.
print(f"선택된 stream 다운로드: {highest_stream}")
# highest_stream.download(output_path='D:/MannequinChallenge_Videos', filename=stream_df_global.index[0])

### Read every `.txt` file and download the videos

In [None]:
# Root directory whose sub-directories hold the per-video `.txt` files.
meta_txt_path = "D:/MannequinChallenge"

Read the video URL from the first line of each `.txt` file and build a combined dictionary keyed by sub-directory name.

In [None]:
# Map every sub-directory of `meta_txt_path` to the list of video URLs found
# in it. The URL is the first line of each file in that sub-directory.
entire_video_urls = {}

# sorted() makes the processing order reproducible; os.listdir returns
# entries in arbitrary order.
for path in sorted(os.listdir(meta_txt_path)):
    txt_path = os.path.join(meta_txt_path, path)
    if not os.path.isdir(txt_path):
        # Stray files next to the sub-directories cannot be listed; skip them.
        continue

    path_url_list = []
    for txt in sorted(os.listdir(txt_path)):
        # `with` closes each handle right away; the folders may hold many files.
        with open(os.path.join(txt_path, txt)) as f:
            line = f.readline().replace('\n', '')
        path_url_list.append(line)
    entire_video_urls[path] = path_url_list

Download videos.

In [None]:
# Download the highest-resolution mp4 of every collected URL into one output
# folder per key of `entire_video_urls`. Failures are recorded by video ID and
# the loop carries on with the next URL.
failure_videos = []

for key, value in entire_video_urls.items():
    output_path = os.path.join('D:/MannequinChallenge_Videos', key)
    for video_url in value:
        try:
            os.makedirs(output_path, exist_ok=True)
            yt_streams = YouTube(video_url)
            _yt_str = yt_streams.streams
            stream_df_global, highest_stream = get_download_info(_yt_str, video_url)
            highest_stream.download(output_path=output_path, filename=video_url.split('=')[-1])
        except Exception as exc:
            # `Exception` rather than a bare except, so that Ctrl-C
            # (KeyboardInterrupt) can still stop this long-running loop.
            failed_url = video_url.split('=')[-1]
            print("This stream is not downloadable: {} ({!r})".format(failed_url, exc))
            failure_videos.append(failed_url)

# Count the recorded failures, not the characters of the last failed ID.
print("Done downloading videos! Failure video count: {}".format(len(failure_videos)))