In [1]:
from extract.extractor_manager import ExtractorManager
from extract.youtube.youtube_channel_extractor import YouTubeChannelExtractor
from extract.youtube.youtube_videos_extractor import YouTubePlaylistExtractor, YouTubeVideosExtractor


class YouTubeExtractorManager(ExtractorManager):
    """Extractor manager that holds one YouTubeChannelExtractor per channel."""

    def __init__(self, channels):
        """Build a channel extractor for every entry in ``channels``.

        Args:
            channels: Iterable of channel identifiers; each one is passed
                unchanged to ``YouTubeChannelExtractor``.
        """
        channel_extractors = []
        for channel in channels:
            channel_extractors.append(YouTubeChannelExtractor(channel))
        # The base manager receives the fully built list of extractors.
        super().__init__(channel_extractors)


In [None]:

# Channel handles given to the manager; it creates one channel extractor per handle.
channel_handles = [
    '@NetworkDirection',
    '@WestNetworksLLC',
    '@MobileInternetResourceCenter',
    '@Frontierus',
    '@MobileMustHave',
    '@Technorv',
    '@Peplink',
    '@5Gstore',
]
manager = YouTubeExtractorManager(channel_handles)
manager.fetch_all()


## Introspect

In [2]:
import pandas as pd
from extract.youtube.youtube_channel_extractor import YouTubeChannelExtractor

# Load the stored artifacts. Each entry is indexable: position 0 is read as
# the artifact name here.
dfs = YouTubeChannelExtractor.get_artifacts()

# Keep only the names so they can be listed one per line.
df_names = [artifact[0] for artifact in dfs]

for name in df_names:
    print(name)


8
youtube_video_item_5Gstore__T_20241228_220647.jsonl
youtube_video_item_Frontierus__T_20241228_215952.jsonl
youtube_video_item_MobileInternetResourceCenter__T_20241228_215109.jsonl
youtube_video_item_MobileMustHave__T_20241228_220029.jsonl
youtube_video_item_NetworkDirection__T_20241228_213545.jsonl
youtube_video_item_Peplink__T_20241228_220434.jsonl
youtube_video_item_Technorv__T_20241228_220248.jsonl
youtube_video_item_WestNetworksLLC__T_20241228_213820.jsonl


In [7]:

# Position 1 of every artifact entry is the frame that gets concatenated
# (position 0 is the artifact name).
df_list = [artifact[1] for artifact in dfs]

# Merge all dataframes into a single frame with a fresh 0..n-1 index.
merged_df = pd.concat(df_list, ignore_index=True)

# Show the most frequent `metadata` value. Describing only that column and
# picking the "top" row by label avoids relying on the positional row order
# of describe() and on which other columns the merged frame contains.
print("\nTop row from describe():")
print(merged_df['metadata'].describe().loc['top'])

# Find rows with empty page_content. Missing values (None/NaN) and
# whitespace-only strings carry no content either, so they are counted as
# empty rather than matching only the exact empty string.
page_content = merged_df['page_content']
is_empty = page_content.isna() | (page_content.astype(str).str.strip() == '')
empty_content = merged_df[is_empty]
print(f"\nFound {len(empty_content)} rows with empty page_content:")
print(empty_content)



Top row from describe():
{'id': '6Q5s8WsR0XY', 'kind': 'youtube#video', 'etag': '1tnuQvjNdmBq_q3WuOwrU5NiToA', 'snippet': {'publishedAt': '2024-09-23T19:16:08Z', 'channelId': 'UCPfPFshSC-rH4IbaClXrWSQ', 'title': 'Inseego MiFi X Pro 5G - Unboxing', 'description': 'Watch as we unbox the Inseego MiFi X Pro 5G and highlight its features. \n\nLearn more about the MiFi options available: 5gstore.com/search/mifi\n\nConnect with Us:\nWebsite | https://www.5gstore.com\nFacebook | https://www.facebook.com/the5Gstore\nTwitter | https://twitter.com/3GCustomerSvc\nYouTube | https://www.youtube.com/user/3Gstore\nInstagram | https://www.instagram.com/the5gstore/\nLinkedIn | https://www.linkedin.com/company/5gstore/\nBlog | https://5gstore.com/blog/', 'channelTitle': '5Gstore', 'categoryId': '28', 'liveBroadcastContent': 'none'}, 'contentDetails': {'duration': 'PT3M13S', 'dimension': '2d', 'definition': 'hd', 'caption': 'false', 'licensedContent': False, 'projection': 'rectangular'}, 'status': {'uplo

# Video Extractors

In [None]:

# Explicit list of video IDs for the "extra_videos" extractor.
extra_video_ids = [
    '0PbTi_Prpgs',
    '_IOZ8_cPgu8',
    'oHQvWa6J8dU',
    'k9ZigsW9il0',
    '0j6-QFnnwQk',
]
extra_extractors = YouTubeVideosExtractor("extra_videos", video_ids=extra_video_ids)
extra_extractors.extract()


In [None]:

# Video IDs for the "peplink_training_videos" extractor. Every ID appears
# exactly once: a repeated entry ('GLtjyS4ELAA' was listed twice) would be
# handed to the extractor a second time.
training_video_ids = [
    'mZAr7Z7eL48',
    'GLtjyS4ELAA',
    'Ny5z_4Pjz6c',
    'Ow8sdUEb_eg',
    'KiFrxH46qM0',
    'J1Jcgce7zrQ',
    'iNwVqhp2QtY',
    'fsB5MqE7uOU',
    '1vvm0JiEwww',
    '-ILspN9YRsY',
]
training_extractors = YouTubeVideosExtractor("peplink_training_videos",
                                             video_ids=training_video_ids)
training_extractors.extract()


In [None]:

# Playlist ID given to the "west_peplink_university_playlist" extractor.
west_university_playlist_id = 'PLT8XvvJf-9vgah5id_2tW6GvSCOmV62h6'
uni_extractors = YouTubePlaylistExtractor(
    "west_peplink_university_playlist",
    playlist_id=west_university_playlist_id,
)
uni_extractors.extract()
