## Mannequin Challenge Dataset Downloader
This notebook is a downloader for the Mannequin Challenge Dataset: it fetches each video from YouTube and saves the annotated frames as images. Written by Myeong-Gyu.Lee

* Reference: https://blog.naver.com/PostView.nhn?blogId=skyshin0304&logNo=221620513883&proxyReferer=https:%2F%2Fwww.google.com%2F

Run `pip install pytube3` to install the pytube library.

### Test single video download

In [1]:
from pytube import YouTube
from tqdm import tqdm
import os, cv2, shutil, math, datetime, ast, warnings
import matplotlib.pyplot as plt
import pandas as pd

# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Probe one known video and print its metadata, one labelled field per line.
_url = 'https://www.youtube.com/watch?v=KMtgexdtcGY'
yt = YouTube(_url)

# (label, attribute) pairs in display order: title, length, rating,
# thumbnail link, view count, description.
_metadata_fields = (
    ("영상 제목 :", 'title'),
    ("영상 길이 :", 'length'),
    ("영상 평점 :", 'rating'),
    ("영상 썸네일 링크: ", 'thumbnail_url'),
    ("영상 조회수 :", 'views'),
    ("영상 설명 :", 'description'),
)
for _label, _attr in _metadata_fields:
    # Attributes are read lazily, one per printed line.
    print(_label, getattr(yt, _attr))

영상 제목 : YouTube
영상 길이 : 203
영상 평점 : 5.0
영상 썸네일 링크:  https://i.ytimg.com/vi/KMtgexdtcGY/maxresdefault.jpg
영상 조회수 : 200
영상 설명 : The FHS Wind Ensemble took on the Mannequin Challenge! Not originally my video but it was too fun NOT to upload, and all credit is given! :)

See if you can spot me!
---------------------------------
Twitter ~ https://twitter.com/JessicaWitXx
Instagram ~ https://www.instagram.com/jesswitxx/
Snapchat ~ jessicawitxx
Vlog Channel ~ https://goo.gl/6ZhRuu
Collab Channel ~ https://goo.gl/rHNw96
---------------------------------
Song 1:
Merry Go Slower by Kevin MacLeod is licensed under a Creative Commons Attribution license (https://creativecommons.org/licenses/by/4.0/)
Source: http://incompetech.com/music/royalty-free/index.html?isrc=USUAN1100732
Artist: http://incompetech.com/
Song 2:
Divertimento K131 by Kevin MacLeod is licensed under a Creative Commons Attribution license (https://creativecommons.org/licenses/by/4.0/)
Source: http://incompetech.com/music/royalty-

In [3]:
# List every stream pytube found for the test video, prefixed with its position.
_yt_streams = yt.streams
print("다운가능한 영상 상세 정보 :")
# Iterate the query object directly instead of calling the deprecated ``.all()``;
# get_download_info() already loops over the same object this way.
for i, stream in enumerate(_yt_streams):
    print(i, " : ", stream)

다운가능한 영상 상세 정보 :
0  :  <Stream: itag="18" mime_type="video/mp4" res="360p" fps="30fps" vcodec="avc1.42001E" acodec="mp4a.40.2" progressive="True" type="video">
1  :  <Stream: itag="22" mime_type="video/mp4" res="720p" fps="30fps" vcodec="avc1.64001F" acodec="mp4a.40.2" progressive="True" type="video">
2  :  <Stream: itag="136" mime_type="video/mp4" res="720p" fps="30fps" vcodec="avc1.4d401f" progressive="False" type="video">
3  :  <Stream: itag="247" mime_type="video/webm" res="720p" fps="30fps" vcodec="vp9" progressive="False" type="video">
4  :  <Stream: itag="135" mime_type="video/mp4" res="480p" fps="30fps" vcodec="avc1.4d4014" progressive="False" type="video">
5  :  <Stream: itag="244" mime_type="video/webm" res="480p" fps="30fps" vcodec="vp9" progressive="False" type="video">
6  :  <Stream: itag="134" mime_type="video/mp4" res="360p" fps="30fps" vcodec="avc1.4d401e" progressive="False" type="video">
7  :  <Stream: itag="243" mime_type="video/webm" res="360p" fps="30fps" vcodec="v

  This is separate from the ipykernel package so we can avoid doing imports until


### Build the DataFrame and sort by `res` to get the highest-resolution video.
Only `video/mp4` rows are kept.

In [4]:
def get_video_info(video_path):
    """Open a video with OpenCV and report its basic properties.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.

    Returns:
        A 5-tuple ``(entire_frame_count, width, height, fps, video)``. The first
        three values are cast to ``int``; ``fps`` is returned exactly as OpenCV
        reports it; ``video`` is the capture object, which this function does
        not release.
    """
    capture = cv2.VideoCapture(video_path)
    frame_count, frame_width, frame_height = (
        int(capture.get(prop))
        for prop in (
            cv2.CAP_PROP_FRAME_COUNT,
            cv2.CAP_PROP_FRAME_WIDTH,
            cv2.CAP_PROP_FRAME_HEIGHT,
        )
    )
    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    return frame_count, frame_width, frame_height, frame_rate, capture

def save_frames(case_name, meta_txt, video, target_frames_path, fps=29.97):
    """Save the frames listed in a metadata file as PNG images.

    Every line of ``meta_txt`` that is not blank and does not contain
    ``'https'`` is expected to start with an integer timestamp (treated as
    microseconds) followed by a space. The timestamp is converted to a frame
    number, the video is positioned there, and the frame is written to
    ``<target_frames_path>/<case_name>/<case_name>_<line index>.png``.

    Args:
        case_name: Name used for the output sub-directory and file prefix.
        meta_txt: Open text file object with the metadata lines; it is always
            closed before this function returns.
        video: Capture object providing ``set`` and ``read`` (as returned by
            ``get_video_info``). It is not released here.
        target_frames_path: Parent directory for the ``case_name`` folder.
        fps: Frame rate used to convert timestamps to frame numbers. The
            default keeps the previously hard-coded 29.97.
    """
    target_frames_path = os.path.join(target_frames_path, case_name)
    os.makedirs(target_frames_path, exist_ok=True)
    try:
        for index, line in enumerate(meta_txt.readlines()):
            row_str = line.strip()
            # Skip blank lines and the URL header line.
            if not row_str or 'https' in row_str:
                continue
            timestamp_us = int(row_str.split(' ')[0])
            frame_number = math.floor(round(timestamp_us / 1000000, 3) * fps)
            # Clamp so a timestamp of 0 does not request frame -1.
            video.set(cv2.CAP_PROP_POS_FRAMES, max(frame_number - 1, 0))
            ret, frame = video.read()
            if not ret or frame is None:
                # The frame could not be decoded (e.g. past the end of the video).
                continue
            frame_path = os.path.join(target_frames_path, case_name + '_' + str(index) + '.png')
            try:
                cv2.imwrite(frame_path, frame)
            except Exception as err:
                # Stay best-effort per frame, but report the failure instead of hiding it.
                print("Could not write frame {}: {}".format(frame_path, err))
    finally:
        meta_txt.close()

def get_download_info(yt_streams, video_url):
    """Tabulate a video's streams and pick the highest-resolution mp4 stream.

    Each stream's text representation (``<Stream: key="value" ...>``) is parsed
    into one table row. Only rows with ``mime_type == 'video/mp4'`` and a
    numeric resolution are kept, sorted by resolution in descending order;
    streams with equal resolution keep their original relative order.

    Args:
        yt_streams: Iterable of stream objects that also provides
            ``get_by_itag`` (e.g. ``YouTube(url).streams``).
        video_url: URL of the video; its ID becomes the table index.

    Returns:
        ``(stream_df_global, highest_stream)``: the filtered, sorted DataFrame
        indexed by ``videoID`` (``itag`` and ``res`` as integers), and the
        stream object whose itag is in the first row.

    Raises:
        ValueError: If no ``video/mp4`` stream with a resolution is available.
    """
    # Function-scope imports: these stdlib modules are not in the notebook's import cell.
    import re
    from urllib.parse import parse_qs, urlparse

    # One dict per stream, built from the key="value" pairs of its repr.
    records = [dict(re.findall(r'([\w-]+)="([^"]*)"', str(stream))) for stream in yt_streams]
    records = [record for record in records if record]

    # Build the frame once instead of concatenating one frame per stream.
    stream_df_global = pd.DataFrame(records)
    if {'itag', 'mime_type', 'res'} - set(stream_df_global.columns):
        raise ValueError("No video stream information found for {}".format(video_url))

    # '720p' -> 720; rows without a usable resolution (audio, res="None") become NaN.
    res_numeric = pd.to_numeric(
        stream_df_global['res'].str.replace(r'[A-Za-z]', '', regex=True),
        errors='coerce',
    )
    is_mp4_video = res_numeric.notna() & (stream_df_global['mime_type'] == 'video/mp4')
    # Copy the filtered rows so the column assignments below do not write into a slice.
    stream_df_global = stream_df_global.loc[is_mp4_video].copy()
    if stream_df_global.empty:
        raise ValueError("No video/mp4 stream with a known resolution for {}".format(video_url))

    stream_df_global['res'] = res_numeric.loc[is_mp4_video].astype(int)
    stream_df_global['itag'] = stream_df_global['itag'].astype(int)
    # A stable sort keeps the result deterministic when resolutions tie.
    stream_df_global = stream_df_global.sort_values(by='res', ascending=False, kind='mergesort')

    # Prefer the ``v`` query parameter so extra parameters (``&t=10s``) do not end
    # up in the ID; otherwise fall back to the text after the last '/' and '='.
    video_id = parse_qs(urlparse(video_url).query).get('v', [''])[0]
    if not video_id:
        video_id = video_url.split('/')[-1].split('=')[-1]
    stream_df_global.index = pd.Index([str(video_id)] * len(stream_df_global), name='videoID')

    # Use the itag of the top row to fetch the highest-resolution stream.
    highest_stream = yt_streams.get_by_itag(int(stream_df_global.iloc[0]['itag']))

    return stream_df_global, highest_stream

# Read each metadata .txt file to get its YouTube URL, download the video, then save the frames listed in that file as individual images.
def dataset_downloader(meta_file_path, target_video_path, target_frames_path):
    """Download every video listed in the metadata files and save its frames.

    ``meta_file_path`` is expected to contain one sub-directory per dataset
    split, each holding ``.txt`` files whose first line is a YouTube URL. For
    every file the highest-resolution mp4 stream is downloaded to
    ``<target_video_path>/<split>/<videoID>.mp4`` and the frames listed in the
    file are written under ``<target_frames_path>/<split>/``. A video whose
    ``.mp4`` already exists is skipped entirely; its frames are not extracted
    again.

    Args:
        meta_file_path: Root directory of the per-split metadata folders.
        target_video_path: Root directory for the downloaded videos.
        target_frames_path: Root directory for the extracted frames.

    Returns:
        Dict mapping each split folder name to the list of video IDs that could
        not be downloaded. Every split gets a key, even when nothing failed.
    """
    failed_video_urls = {}

    for path in os.listdir(meta_file_path):
        txt_path = os.path.join(meta_file_path, path)
        if not os.path.isdir(txt_path):
            # Stray files next to the split folders are not metadata folders.
            continue

        # Register the split up front so it is reported even if every video
        # fails or was already downloaded.
        failure_url_list = []
        failed_video_urls[path] = failure_url_list
        output_path = os.path.join(target_video_path, path)
        output_frame_path = os.path.join(target_frames_path, path)

        for txt in tqdm(os.listdir(txt_path)):
            f = open(os.path.join(txt_path, txt))
            video = None
            try:
                video_url = f.readline().replace('\n', '')
                # Provisional ID taken from the URL, so a failure before the
                # stream lookup is still recorded under this video.
                filename = video_url.split('/')[-1].split('=')[-1]

                try:
                    os.makedirs(output_path, exist_ok=True)

                    # Get the YouTube stream information for this video.
                    yt = YouTube(video_url)
                    stream_df_global, highest_stream = get_download_info(yt.streams, video_url)
                    filename = stream_df_global.index[0]

                    if os.path.exists(os.path.join(output_path, filename+'.mp4')):
                        print("This stream is exist: {}".format(filename+'.mp4'))
                        continue
                    # NOTE(review): assumes the pytube version in use appends the
                    # '.mp4' extension to ``filename`` - confirm when upgrading.
                    highest_stream.download(output_path=output_path, filename=filename)
                except Exception:
                    # ``Exception`` (not a bare except) so Ctrl-C still stops the run.
                    print("This stream is not downloadable: {}".format(filename))
                    failure_url_list.append(filename)
                    continue

                video = get_video_info(os.path.join(output_path, filename+'.mp4'))[-1]

                os.makedirs(output_frame_path, exist_ok=True)
                save_frames(case_name=txt.split('.')[0], meta_txt=f, video=video, target_frames_path=output_frame_path)
            finally:
                # Runs on success, skip and failure alike: no leaked handles.
                f.close()
                if video is not None:
                    video.release()

    return failed_video_urls

In [5]:
# Rank the test video's streams, report the chosen one, and show the top three rows.
stream_df_global, highest_stream = get_download_info(yt_streams=_yt_streams, video_url=_url)
print('Highest resolution stream info:', highest_stream)
stream_df_global.iloc[:3]

Highest resolution stream info: <Stream: itag="22" mime_type="video/mp4" res="720p" fps="30fps" vcodec="avc1.64001F" acodec="mp4a.40.2" progressive="True" type="video">


Unnamed: 0_level_0,itag,mime_type,res,fps,vcodec,acodec,progressive,type,abr
videoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
KMtgexdtcGY,22,video/mp4,720,30fps,avc1.64001F,mp4a.40.2,True,video,
KMtgexdtcGY,136,video/mp4,720,30fps,avc1.4d401f,,False,video,
KMtgexdtcGY,135,video/mp4,480,30fps,avc1.4d4014,,False,video,


In [6]:
# Show which stream was selected for download. The download call itself is left
# commented out, so running this cell only prints.
print("선택된 stream 다운로드:", highest_stream)
# highest_stream.download(output_path='D:/MannequinChallenge_Videos', filename=stream_df_global.index[0])

선택된 stream 다운로드: <Stream: itag="22" mime_type="video/mp4" res="720p" fps="30fps" vcodec="avc1.64001F" acodec="mp4a.40.2" progressive="True" type="video">


### Read whole `.txt` file and download

In [7]:
# Dataset locations. The root keeps its original default but can be redirected
# with the MANNEQUIN_ROOT environment variable; every other path is derived from
# it, so they can no longer drift apart when the root changes.
root_path = os.environ.get('MANNEQUIN_ROOT', 'D:/MannequinChallenge')
meta_file_path = os.path.join(root_path, 'meta_files')                # per-split metadata .txt files
target_video_path = os.path.join(root_path, 'original_videos')       # downloaded .mp4 files
target_frames_path = os.path.join(root_path, 'original_sequences')   # extracted .png frames

In [None]:
# Download the whole dataset; the result maps each split to the video IDs that failed.
failed_video_urls = dataset_downloader(
    meta_file_path=meta_file_path,
    target_video_path=target_video_path,
    target_frames_path=target_frames_path,
)

In [9]:
# Report how many videos failed per split. A split that is absent from the
# results counts as zero instead of raising KeyError.
print("Done downloading videos! \nFailure video summary: \n - train: {}, - val: {}, - test: {}".format(
    len(failed_video_urls.get('train', [])),
    len(failed_video_urls.get('validation', [])),
    len(failed_video_urls.get('test', []))))

Done downloading videos! 
Failure video summary: 
 - train: 1212, - val: 47, - test: 93
