In [92]:
class Video:
    """Record holding one video's metadata together with its fetched comments."""

    def __init__(self, id, title, desc, category_id, comments, tags, channel_title, view_count, comment_count) -> None:
        """Store every constructor argument on the instance under the same name."""
        # Identity and descriptive text.
        self.id, self.title, self.desc = id, title, desc
        self.category_id = category_id
        # Collections; __str__ only reports their lengths.
        self.comments, self.tags = comments, tags
        self.channel_title = channel_title
        # Statistics as supplied by the caller.
        self.view_count, self.comment_count = view_count, comment_count

    def __str__(self):
        """Return the video as one comma-separated line.

        The description is cut to its first 10 characters, and the comments and
        tags are represented by how many of each there are. No quoting or
        escaping is applied to any field.
        """
        fields = (
            self.id,
            self.title,
            self.desc[:10],
            self.category_id,
            len(self.comments),
            len(self.tags),
            self.channel_title,
            self.view_count,
            self.comment_count,
        )
        return ','.join(f'{field}' for field in fields)


<a href='https://colab.research.google.com/github/cdwangco/SentimentAnalysisProject/blob/main/MLProjectYTSentimentAnalysis.ipynb' target='_parent'><img src='https://colab.research.google.com/assets/colab-badge.svg' alt='Open In Colab'/></a>

In [93]:
import requests, sys, time, os
# Directory that write_to_file creates (if needed) and writes the per-country CSV files into.
output_dir = 'output/'
# Characters to exclude from text because they are problematic in CSV files
# (a newline ends the row early, a double quote opens a quoted field).
unsafe_characters = ['\n', '"']

# Column names joined with commas to form the first line of each output file; the
# order is hardcoded and mirrors the field order of Video.__str__.
# Note: Video.__str__ emits the *number* of comments and tags in the 'comments'
# and 'tags' columns, not their text.
header = ['video_id', 'title', 'desc', 'categoryId', 'comments', 'tags', 'channel_title',  'view_count', 'comment_count']
# Sent as maxResults on the videos request (videos fetched per country).
MAX_VIDEOS = 10
# Sent as maxResults on the commentThreads request (comments fetched per video).
MAX_COMMENTS = 10

def setup(api_path, code_path):
    with open(api_path, 'r') as file:
        api_key = file.readline()

    with open(code_path) as file:
        country_codes = [x.rstrip() for x in file]

    return api_key, country_codes

def _fetch_comments(video_id):
    """Return the text of up to MAX_COMMENTS top-level comments for one video."""
    # No pageToken is sent: only the first page of comment threads is wanted, and
    # the page token of the videos listing is not a commentThreads token.
    request_url = f'https://www.googleapis.com/youtube/v3/commentThreads?key={api_key}&textFormat=plainText&part=snippet&videoId={video_id}&maxResults={MAX_COMMENTS}'
    comments_page = requests.get(request_url).json()
    # A response without an 'items' key (e.g. an error payload) yields no comments.
    return [
        thread.get('snippet', {}).get('topLevelComment', {}).get('snippet', {}).get('textDisplay', '')
        for thread in comments_page.get('items', [])
    ]


def api_request(page_token, country_code):
    """Fetch one page of most-popular videos for a region, with their comments.

    Args:
        page_token: Raw query-string fragment inserted between the ``part``
            list and ``chart=`` in the videos URL. The caller in this file
            passes ``'&'`` to request the first page.
        country_code: Value sent as ``regionCode``.

    Returns:
        A dict mapping each video id to a ``Video`` built from the response.
        Empty when the response contains no ``items``.

    Raises:
        SystemExit: If the videos request is answered with HTTP 429.
    """
    # Builds the URL and requests the JSON from it
    request_url = f'https://www.googleapis.com/youtube/v3/videos?part=id,statistics,snippet{page_token}chart=mostPopular&regionCode={country_code}&maxResults={MAX_VIDEOS}&key={api_key}'
    request = requests.get(request_url)
    if request.status_code == 429:
        print('Temp-Banned due to excess requests, please wait and continue later')
        sys.exit()

    video_data_page = request.json()
    video_dict = {}
    for item in video_data_page.get('items', []):
        video_id = item['id']
        snippet = item['snippet']
        statistics = item['statistics']
        video_dict[video_id] = Video(
            video_id,
            snippet.get('title', ''),
            snippet.get('description', ''),
            snippet.get('categoryId', ''),
            _fetch_comments(video_id),
            snippet.get('tags', ['[none]']),
            snippet.get('channelTitle', ''),
            statistics.get('viewCount', 0),
            # commentCount may be missing from the statistics object (e.g. when
            # comments are turned off for a video); default like viewCount
            # instead of raising KeyError and losing the whole page.
            statistics.get('commentCount', 0),
        )

    return video_dict

def write_to_file(country_code, country_data):
    """Write one country's rows to a dated CSV-style text file under output_dir.

    The file is named ``<yy.dd.mm>_<country_code>_videos.csv`` (date taken from
    the local clock at call time) and is overwritten if it already exists.

    Args:
        country_code: Region code used in the output file name.
        country_data: Iterable of rows. Each row is converted with ``str()``
            and written on its own line; the caller in this file passes a
            header string followed by ``Video`` objects.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    file_name = f'{time.strftime("%y.%d.%m")}_{country_code}_videos.csv'
    # os.path.join avoids the doubled slash that 'output/' + '/' produced.
    with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as file:
        for row in country_data:
            line = str(row)
            # Drop characters that would split or corrupt a CSV row, such as a
            # newline inside a video description.
            for unsafe in unsafe_characters:
                line = line.replace(unsafe, '')
            file.write(f'{line}\n')


def get_video_data(should_write_to_file=False):
    """Fetch the most-popular videos for every code in ``country_codes``.

    Args:
        should_write_to_file: When True, each country's rows (header line
            first) are also written out through ``write_to_file``.

    Returns:
        A dict mapping video id to ``Video`` covering all requested countries.
        If the same video id is returned for several countries, the entry from
        the later country replaces the earlier one. An empty dict is returned
        when ``country_codes`` is empty.
    """
    all_videos = {}
    for country_code in country_codes:
        country_videos = api_request('&', country_code)
        if should_write_to_file:
            # Only build the row list when it is actually going to be written.
            country_data = [','.join(header)] + list(country_videos.values())
            write_to_file(country_code, country_data)
        # Accumulate every country's videos; previously only the last country's
        # result was returned, and an empty country list raised UnboundLocalError.
        all_videos.update(country_videos)
    return all_videos


# Local input files: the API key (first line) and the region codes (one per line).
key_path = 'api_key.txt'
# Kept under its own name so that `country_codes` only ever holds the list of codes.
country_codes_path = 'country_codes.txt'
# api_request reads `api_key` and get_video_data reads `country_codes` as globals.
api_key, country_codes = setup(key_path, country_codes_path)

# Fetch the most-popular videos and also write one file per country.
videos = get_video_data(True)
for v in videos.values():
    # Summary row followed by the number of comments fetched for that video.
    print(v, len(v.comments), sep=", ")

u3V5KDHRQvk,Marvel Studios’ Guardians of the Galaxy Volume 3 | Official Trailer,It’s time ,24,10,2,Marvel Entertainment,11555783,23191, 10
ZfVYgWYaHmE,Indiana Jones and the Dial of Destiny | Official Trailer,See #India,1,10,23,Lucasfilm,5797648,12282, 10
2IwhkJ0XzRE,Why I've Been Gone,i am excit,24,10,18,MindofRez,1054398,17204, 10
u18be_kRmC0,RM '들꽃놀이 (with 조유진)' Official MV,RM '들꽃놀이 (,10,10,4,HYBE LABELS,7686530,257173, 10
RcTLBsXvzQk,Most Dangerous Animals!,SUBSCRIBE ,24,10,25,Beast Reacts,3589481,3350, 10
EPWrVyyd3U4,Japan vs. Spain Highlights | 2022 FIFA World Cup,Japan and ,17,10,22,FOX Soccer,2110012,5541, 10
mkHQDPch6fo,Latto - FTCU (feat. GloRilla & Gangsta Boo) [Official Video],Stream/Dow,10,10,29,Latto,303638,2464, 10
CKg3FV5gwMc,RM of BTS: Tiny Desk (Home) Concert,Stephen Th,10,10,9,NPR Music,843777,13909, 10
qyi1DaFZzXQ,Morgan Wallen - Tennessee Fan (Lyric Video),Listen to ,10,10,1,Morgan Wallen,133495,554, 10
LtOqU2o81iI,Tesla Semi Delivery Event,,2,10,15,Tesla,611817,77,