In [1]:
import os
import sys
import re
import pandas as pd
import json
import math 

from collections import Counter
import matplotlib.pyplot as plt

In [5]:
country = 'gbr'
news = open(f"seed_channel_list/{country}_news_channel_list.json", 'r', encoding="utf-8")

df = pd.read_json(news)
channels, urls = [], []
for channel, url in df["channels"]:
    channels.append(channel.replace(" ",""))
    urls.append(url)
    
print(len(channels))
len(set(urls))


28


28

In [6]:
pattern = r"(?<=output_).[^_]*(?=_)"
dir_name = f"outputs_{country}_news"


channel_list = set()
files = [f"{dir_name}/{f}" for f in os.listdir(f"{dir_name}/") if f.endswith(".csv")]

for f in files:
    channel_name = re.search(pattern, f)[0]
    channel_list.add(channel_name)


In [7]:
all_df_list = []
for channel in channel_list:
    channel_files = [f for f in files if f.startswith(f"{dir_name}/output_{channel}")]
    channel_df_list = []

    for f in channel_files:
        df_temp = pd.read_csv(f, encoding= 'unicode_escape')
        channel_df_list.append(df_temp)

    df = pd.concat(channel_df_list).drop_duplicates(subset=["video_id"], keep='first').reset_index(drop=True)
    all_df_list.append(df)
    
df = pd.DataFrame()
df = pd.concat(all_df_list).drop_duplicates(subset=["video_id"], keep='first').reset_index(drop=True)
df

Unnamed: 0,video_id,channel_name,video_title,view_count
0,UgOMektm_wU,WalesOnline,"Meet Trevor, the hero pensioner who fought off...",5400000
1,ODtPjTZSTSU,WalesOnline,Jeremy Vine squirms on air as Welsh miner dema...,1000000
2,R1JNHgN9TZQ,WalesOnline,I ate and trained like a professional rugby pl...,1000000
3,M-eyTr0njmo,WalesOnline,Fans in Japan sing the Welsh national anthem |...,822000
4,pytDUR1hMNM,WalesOnline,Adorable moment baby grabs Kate Middletons bag...,431000
...,...,...,...,...
8606,BtSWnWmbHZk,TheEconomist,How are offices changing?,445000
8607,oQWaw5S4b3I,TheEconomist,"The global food crisis, explained",928000
8608,qqMAFtIGaq4,TheEconomist,Black holes: why they matter,412000
8609,PECy4bE9zg4,TheEconomist,Boris Johnson resigns: what happens next?,299000


In [8]:
Counter(df["channel_name"])

Counter({'Janes': 494,
         'SkyNewsUK': 448,
         'LeicesterMercury': 420,
         'ChathamHouse': 420,
         'TheSpectator': 388,
         'InstituteforGovernment': 388,
         'Channel4NewsUK': 360,
         'TheNewArab': 356,
         'LondonEveningStandard': 356,
         'ThomasReutersFoundation': 350,
         'HITCSevens': 328,
         'VideoVoxEconomics': 328,
         'WhatCulture': 328,
         'BylineTV': 325,
         'WalesOnline': 300,
         'Amnesty': 300,
         'ITVNews': 300,
         'UnHerd': 300,
         'FinancialTimes': 300,
         'TheTimesandTheSundayTimes': 300,
         'MiddleEastEye': 300,
         'TheEconomist': 295,
         'EconomicsHelp': 239,
         'ConservativeHome': 179,
         'JOECOUK': 148,
         'DeclassifiedUK': 142,
         'TheNorthernEcho': 112,
         'FullFactUK': 107})

In [10]:
# mapping of number of youtube users per country (DataReportal 2023)
pop_mapping = {
    'irl': 4.0 * 10**6,
    'usa': 246 * 10**6,
    'can': 33.1 * 10**6,
    'aus': 21.3 * 10**6,
    'gbr': 56.2 * 10**6
}

TARGET_VIDEO_COUNT = 1500
VIEW_COUNT_CUT_OFF = 0.001 * pop_mapping[country]

dff = df[df['view_count'] >= VIEW_COUNT_CUT_OFF].reset_index(drop=True)

num_round = round(TARGET_VIDEO_COUNT/len(channel_list))
video_list = pd.DataFrame()

while len(video_list) < TARGET_VIDEO_COUNT:
    for channel in channel_list:
        dft = dff[dff['channel_name'] == channel]
        dft = dft.sample(n=min(len(dft), num_round))
        
        dff.drop(dff[dff['video_id'].isin(dft['video_id'])].index, inplace=True)
            
        video_list = pd.concat([dft, video_list]).reset_index(drop=True)
        video_list.drop_duplicates(subset='video_id', keep="last", inplace=True)

print(len(video_list))
display(Counter(video_list["channel_name"]))
video_list = video_list.sample(frac=1, random_state=2).reset_index(drop=True)
display(video_list)

1767


Counter({'TheEconomist': 108,
         'MiddleEastEye': 108,
         'WhatCulture': 108,
         'TheTimesandTheSundayTimes': 108,
         'FinancialTimes': 108,
         'UnHerd': 108,
         'JOECOUK': 108,
         'HITCSevens': 108,
         'SkyNewsUK': 108,
         'LondonEveningStandard': 108,
         'TheSpectator': 108,
         'ITVNews': 108,
         'Channel4NewsUK': 108,
         'Janes': 86,
         'BylineTV': 78,
         'Amnesty': 50,
         'EconomicsHelp': 42,
         'ThomasReutersFoundation': 30,
         'DeclassifiedUK': 27,
         'ChathamHouse': 23,
         'WalesOnline': 16,
         'TheNewArab': 5,
         'LeicesterMercury': 3,
         'FullFactUK': 2,
         'TheNorthernEcho': 1})

Unnamed: 0,video_id,channel_name,video_title,view_count
0,9KKdIJSkWAQ,Janes,Paris Air Show 2019: Full-scale mock-up of Tur...,131000
1,Xd5l1mZfSZM,TheSpectator,Louise Perry: motherhood in crisis and the fem...,67000
2,nbdtZ--FmyQ,LondonEveningStandard,Universal Credit benefit payments: What change...,365000
3,WtiqjGlH4c4,HITCSevens,7 Football Stadiums Closest Together,354000
4,v0ochp78Uy4,ThomasReutersFoundation,Extreme heat pushed workers to the limits in Q...,60000
...,...,...,...,...
1762,wY-rJu7aHvo,TheSpectator,Trevor Phillips: BLM uses race as 'battering r...,364000
1763,DqU9gHlcpMU,ITVNews,Ukrainians head for Polish border where thousa...,527000
1764,QnzOHS3_D3g,SkyNewsUK,Ukraine Invasion: Hundreds of men sign up to f...,1100000
1765,1AfNYztas2c,SkyNewsUK,President-elect Joe Biden's victory speech in ...,1300000


In [11]:
output_file_name = f"video_list/{country}_news_2024.csv"
df.to_csv(output_file_name, index=False, encoding="utf-8")