## import libraries

In [1]:
import os
import zipfile

from tqdm import tqdm 
from datetime import datetime

import pandas as pd
pd.set_option('display.max_colwidth', None)

import numpy as np

import re

import yxdb

import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns 

import psycopg2

## import helper 

In [2]:
from config_GAM2025 import gam_info

from functions import execute_sql_query
import test_functions

In [3]:
# show which lookup workbook the config points to (it is read from "../../" by the following cell)
gam_info['lookup_file']

'GAM2025_Lookup.xlsx'

In [4]:
# All lookup tables live in one workbook two directories above the notebook.
lookup_path = f"../../{gam_info['lookup_file']}"

# country lookup
country_codes = pd.read_excel(lookup_path, sheet_name='CountryID')

# reporting weeks: both boundary columns are parsed to datetimes
week_tester = pd.read_excel(lookup_path, sheet_name='GAM Period')
for date_col in ('w/c', 'week_ending'):
    week_tester[date_col] = pd.to_datetime(week_tester[date_col])

# social media accounts: the two ID columns are forced to text
dtype_dict = {'Channel ID': 'str',
              'Linked FB Account': 'str'}
socialmedia_accounts = pd.read_excel(lookup_path,
                                     sheet_name='Social Media Accounts new',
                                     dtype=dtype_dict)

# keep only active YouTube accounts and expose 'Excluding UK' under the name 'Channel Group'
is_youtube = socialmedia_accounts['Platform'] == 'Youtube'
is_active = socialmedia_accounts['Status'] == 'active'
socialmedia_accounts = (
    socialmedia_accounts[is_youtube & is_active]
    .rename(columns={'Excluding UK': 'Channel Group'})
)

# distinct channel IDs, as a list and as a single-quoted, comma separated string
channel_ids = socialmedia_accounts['Channel ID'].unique().tolist()
formatted_channel_ids = ', '.join(map("'{}'".format, channel_ids))
socialmedia_accounts.sample()

Unnamed: 0,Platform,Status,Channel ID,Channel Name,Service,ServiceID,Channel Group,Channel URL,Channel Username,Linked FB Account,Year
471,Youtube,active,UCd9maKo3B6jX8pCPzLa2hvA,BBC News မြန်မာ,Burmese,BUR,BBC World Service,,,,GAM2025


# Unique Viewers

## Ingestions 

### automated extracts

In [5]:
# root folder of the automated YouTube exports (one sub-folder per export group)
main_path = f"../data/raw/YouTube/{gam_info['file_timeinfo']}_export/"
#main_path = f"../../../../Research Projects/GAM/Digital GAM/2025/Social Media/"

# collect the names of all sub-folders found directly under main_path
folder_paths = []
for entry in os.listdir(main_path):
    if os.path.isdir(os.path.join(main_path, entry)):
        folder_paths.append(entry)

### TESTING input files ###
# logged as test 1_YT_1; the helper reports per folder whether all weeks are present
test_functions.youtube_test_input_files(
    '1_YT_1',
    folder_paths,
    main_path,
    week_tester,
    test_step='testing automated extracts',
)


100%|█████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 541.54it/s]


All weeks are present in the dataset.


100%|█████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 565.72it/s]


All weeks are present in the dataset.


100%|█████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 572.17it/s]


All weeks are present in the dataset.


100%|█████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 583.89it/s]


All weeks are present in the dataset.
...updating logbook...



In [6]:
# ingest files
# NOTE(review): folder casing differs between cells ("Youtube" here, "YouTube" where the final
#               CSVs are stored) - confirm the real folder name on a case-sensitive file system.
output_csv_path = f"../data/processed/Youtube/{gam_info['file_timeinfo']}_zipfiles_BBC World Service.csv"

# Load the rows extracted by a previous run so their zip files are not opened again
if os.path.exists(output_csv_path):
    combined_df = pd.read_csv(output_csv_path)
else:
    combined_df = pd.DataFrame()

# 'source_path' is the cache key. It is deliberately built WITHOUT a separator between folder
# and file name, exactly as before: changing it would make every cached row look unprocessed.
if 'source_path' in combined_df.columns:
    already_processed = set(combined_df['source_path'].dropna())
else:
    already_processed = set()

# Dynamically get all folders in the main_path
folder_paths = [f for f in os.listdir(main_path) if os.path.isdir(os.path.join(main_path, f))]
print('if files have been previously extracted the unzipping will be skipped')

# file names contain "<start date>_<end date> <channel group>.zip"
file_name_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2}) (.+)')

# New extracts are collected in a list and concatenated once; growing the frame and rewriting
# the whole CSV after every single file is quadratic in the number of files.
new_frames = []
for folder in folder_paths:
    for file_name in tqdm(os.listdir(main_path+folder)):
        if not file_name.endswith('.zip'):
            continue

        source_path = main_path+folder + file_name
        if source_path in already_processed:
            continue

        # TODO next year: advertisement is identified as in each folder is a subfolder total
        # and a second subfolder called Advertisment - to process advertisement that has to be added
        # as a flag to the individual exports here
        with zipfile.ZipFile(os.path.join(main_path+folder, file_name), 'r') as zip_ref:
            for member in zip_ref.namelist():
                if 'Table data.csv' not in member:
                    continue

                with zip_ref.open(member) as table_file:
                    df = pd.read_csv(table_file)

                # Extract start date and channel group from the file name
                match = file_name_pattern.search(file_name)
                if match:
                    df['w/c'] = pd.to_datetime(match.group(1), format='%Y-%m-%d')
                    df['Channel Group'] = match.group(3)
                else:
                    print(f"could not read week / channel group from file name: {file_name}")

                # always record the origin, otherwise a file with an unexpected name would be
                # appended to the cache again on every run
                df['source_path'] = source_path
                new_frames.append(df)

if new_frames:
    cached_frames = [] if combined_df.empty else [combined_df]
    combined_df = pd.concat(cached_frames + new_frames, ignore_index=True)
    combined_df.to_csv(output_csv_path, index=False)

# processing
combined_df['w/c'] = pd.to_datetime(combined_df['w/c'], format='ISO8601')
# snap every export start date back to the Monday of its week
combined_df['w/c'] = combined_df['w/c'] - pd.to_timedelta(combined_df['w/c'].dt.weekday, unit='D')
# 'w/c' is a Monday at this point, so the week ends six days later
combined_df['week_ending'] = combined_df['w/c'] + pd.Timedelta(days=6)

# literal replacement: '.zip' must not be interpreted as a regular expression
combined_df['Channel Group'] = combined_df['Channel Group'].str.replace('.zip', '', regex=False)
combined_df = combined_df.rename(columns={'Channel': 'Channel ID'})

# TODO: confirm what to do with the total (so far it's excluded at the inner join with social media accounts)
# .copy() gives an independent frame so the column assignments below are unambiguous
combined_df = combined_df.loc[combined_df['Channel ID'] != 'Total'].copy()

# confirm dtypes
# Plain column assignment is used on purpose: `df.loc[:, col] = values` writes into the existing
# column and keeps its previous dtype (pandas >= 2.0), so the Int64 casts were silently lost.
combined_df['Unique viewers'] = pd.to_numeric(combined_df['Unique viewers'], errors='raise').astype('Int64')
combined_df['Views'] = pd.to_numeric(combined_df['Views'].fillna(0), errors='raise').astype('Int64')
combined_df['Watch time (hours)'] = pd.to_numeric(combined_df['Watch time (hours)'], errors='raise')

# TODO: find out from Minnie why Impressions and Impression click-through rate (%) is not in this dataset -> can be ignored
# Explicit column checks replace the former bare `except:` blocks: a missing column is still
# tolerated, but unparsable values now raise instead of being hidden (or overwritten with 0).
if 'Impressions' in combined_df.columns:
    combined_df['Impressions'] = pd.to_numeric(combined_df['Impressions'].fillna(0), errors='raise').astype('Int64')
else:
    print('could not change type of impressions - col does not exist')

ctr_col = 'Impression click-through rate (%)'
if ctr_col in combined_df.columns:
    combined_df[ctr_col] = pd.to_numeric(combined_df[ctr_col], errors='raise')
else:
    print('could not change type of impressions click through rate - col does not exist and was created')
    combined_df[ctr_col] = 0
combined_df.sample()

if files have been previously extracted the unzipping will be skipped


100%|████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 6240.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 6652.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 5287.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 5282.00it/s]

could not change type of impressions click through rate - col does not exist and was created





Unnamed: 0,Channel ID,Channel title,Impressions,Unique viewers,Engaged views,Views,Watch time (hours),Average view duration,w/c,Channel Group,source_path,Estimated partner revenue (USD),week_ending,Impression click-through rate (%)
6276,UC67ak4KkY16efmibaVpqIMQ,U&Yesterday,1285746,40337,76933.0,76933.0,10200.8696,0:07:57,2024-12-02,BBC Studios,../data/raw/YouTube/GAM2025_export/03 - STChannel 2024-12-02_2024-12-09 BBC Studios.zip,615.63,2024-12-08,0


In [7]:
# spot check: one channel (titled "BBC News" in the export) in the week commencing 2024-04-01
is_checked_channel = combined_df['Channel ID'] == 'UC16niRr50-MSBwiO3YDb3RA'
is_checked_week = combined_df['w/c'] == '2024-04-01'
combined_df[is_checked_channel & is_checked_week]

Unnamed: 0,Channel ID,Channel title,Impressions,Unique viewers,Engaged views,Views,Watch time (hours),Average view duration,w/c,Channel Group,source_path,Estimated partner revenue (USD),week_ending,Impression click-through rate (%)
2449,UC16niRr50-MSBwiO3YDb3RA,BBC News,135577389,9915589,15096071.0,15096071.0,483216.3855,0:01:55,2024-04-01,BBC Global News,../data/raw/YouTube/GAM2025_export/02 - GNChannel 2024-04-01_2024-04-08 BBC Global News.zip,,2024-04-07,0


### manual extracts
Manually downloaded exports: BBC Media Action and other channel-by-channel (non-CMS) exports.

In [8]:
#TODO: review with minnie for individual exports
#because it contains geography or should we use table instead?

# folder holding the manually downloaded Excel exports (one workbook per channel)
path = f"../data/raw/Youtube/{gam_info['file_timeinfo']}_manual/"
dataframes = []

for filename in os.listdir(path):
    if not filename.endswith('.xlsx'):  # Assuming the files are excel files
        continue

    # the channel label is the file name up to the first '.' and the first ' - '
    channel_label = filename.split('.')[0].split(' - ')[0]
    try:
        df = pd.read_excel(os.path.join(path, filename), sheet_name='Totals')
    except Exception as exc:
        # keep going with the remaining workbooks, but say which file failed and why
        # (a bare `except:` would also trap KeyboardInterrupt and hide the reason)
        print(f"{filename}: {exc!r}")
        continue

    df['Channel ID'] = channel_label
    df['Channel title'] = channel_label
    df['source_path'] = path+filename
    dataframes.append(df)

if not dataframes:
    raise ValueError(f"No readable .xlsx export with a 'Totals' sheet found in {path}")
media_action_df = pd.concat(dataframes)

def get_week_dates(date):
    """Return the week that starts the day after a given Sunday.

    Parameters
    ----------
    date : datetime-like with a ``weekday()`` method
        Must fall on a Sunday (``weekday() == 6``).

    Returns
    -------
    tuple
        ``(from_date, to_date)``: the Monday following ``date`` and the Sunday six days
        after that Monday.

    Raises
    ------
    ValueError
        If ``date`` is not a Sunday.
    """
    sunday = 6
    if date.weekday() == sunday:
        week_start = date + pd.Timedelta(days=1)
        week_end = week_start + pd.Timedelta(days=6)
        return week_start, week_end

    raise ValueError("The input date must be a Sunday.")

media_action_df['Date'] = pd.to_datetime(media_action_df['Date'])

# Derive the week boundaries from each 'Date' (get_week_dates raises if a date is not a Sunday)
media_action_df['w/c'], media_action_df['week_ending'] = zip(*media_action_df['Date'].apply(get_week_dates))

# Sum Views per week and channel (one row per w/c, week_ending, channel and source file)
media_action_df = media_action_df.groupby(['w/c', 'week_ending', 'Channel ID', 'Channel title', 'source_path']).agg({'Views': 'sum'}).reset_index()
media_action_df['Channel Group'] = 'BBC Media Action'

# Map file-name labels to the channel IDs used in the lookup workbook.
# The mapping has its own name on purpose: calling it `channel_ids` overwrote the list of
# active channel IDs built in the lookup cell, which the account test (1_YT_2) relies on.
manual_channel_id_map = {'Aksi Kita Indonesia': 'aksikitaindo', }
media_action_df['Channel ID'] = media_action_df['Channel ID'].replace(manual_channel_id_map)

# Manual exports only carry Views; Unique viewers are estimated with the configured overlap factor
media_action_df['Unique viewers'] = media_action_df['Views'] / gam_info['overlap_viewer_uniqueViever']


In [9]:
# show the configured overlap factor: Views of the manual exports are divided by it to estimate Unique viewers
gam_info['overlap_viewer_uniqueViever']

1.1373

### combine CMS & non CMS

In [10]:
# stack the automated (CMS) extracts on top of the manual ones
full_uv_df = pd.concat([combined_df, media_action_df])
print(full_uv_df.shape) #(7841, 14)

# add service & service code info; the left join keeps rows whose channel is not in the lookup
account_info = socialmedia_accounts[['Channel ID', 'Channel Name',  'Service', 'ServiceID']]
youtube_uv = (
    full_uv_df
    .merge(account_info, on='Channel ID', how='left')
    .drop(columns=['week_ending'])
)
youtube_uv['Unique viewers'] = youtube_uv['Unique viewers'].fillna(0)

# TODO add test to ensure no data is lost with these
#      (and keep on left to make sure we never lose data)
#youtube_uv = youtube_uv.merge(week_tester[['week_ending', 'w/c']], on='week_ending', how='left', indicator=True)
#print(youtube_uv._merge.value_counts())



(7845, 14)


## Test 

In [11]:
################################### Testing ################################### 
# Validation of the combined frame; every helper call receives a test id ("1_YT_n") and the step name.
test_step = 'combine CMS & non CMS'
# test accounts
# NOTE(review): `channel_ids` is expected to be the list of active YouTube channel IDs built from
# the lookup workbook - make sure no cell in between has re-bound the name to something else.
test_functions.test_filter_elements_returned(youtube_uv, channel_ids, 'Channel ID', "1_YT_2", test_step)
# test weeks 
# reports the weeks from week_tester that are missing per Channel ID
test_functions.test_weeks_presence_per_account('w/c', 'Channel ID', youtube_uv, week_tester, "1_YT_3", test_step)
# test duplicates
# key columns handed to the duplicate test
cols= ['Channel ID', 'Channel title', 'Channel Group', 'w/c',]
test_functions.test_duplicates(youtube_uv, cols, '1_YT_4', test_step)

# presumably checks that the left merge kept the row count of full_uv_df - confirm in test_functions
test_functions.test_merge_row_count(youtube_uv, full_uv_df, '1_YT_5', test_step)

################################### Testing ################################### 

youtube_uv.sample()

...testing Channel ID...
Fail - not all elements were retrieved
...updating logbook...

Missing weeks for each group:
     Week Number  YearGAE        w/c week_ending                Channel ID
0             14     2025 2024-04-01  2024-04-07  UC3780MVtSV3Huj4si_CQvlQ
1             15     2025 2024-04-08  2024-04-14  UC3780MVtSV3Huj4si_CQvlQ
2             16     2025 2024-04-15  2024-04-21  UC3780MVtSV3Huj4si_CQvlQ
3             17     2025 2024-04-22  2024-04-28  UC3780MVtSV3Huj4si_CQvlQ
4             18     2025 2024-04-29  2024-05-05  UC3780MVtSV3Huj4si_CQvlQ
..           ...      ...        ...         ...                       ...
314            9     2025 2025-02-24  2025-03-02  UCxzzufxh4ILSk6hW5cMMR2w
315           10     2025 2025-03-03  2025-03-09  UCxzzufxh4ILSk6hW5cMMR2w
316           11     2025 2025-03-10  2025-03-16  UCxzzufxh4ILSk6hW5cMMR2w
317           12     2025 2025-03-17  2025-03-23  UCxzzufxh4ILSk6hW5cMMR2w
318           13     2025 2025-03-24  2025-03-30  UCxzzuf

Unnamed: 0,Channel ID,Channel title,Impressions,Unique viewers,Engaged views,Views,Watch time (hours),Average view duration,w/c,Channel Group,source_path,Estimated partner revenue (USD),Impression click-through rate (%),Channel Name,Service,ServiceID
7324,UCrO5leiEQ2BV2fsy6SWwu7w,BBC Brit South Africa,7.0,0.0,,0.0,,,2024-09-09,BBC Studios,../data/raw/YouTube/GAM2025_export/03 - STChannel 2024-09-09_2024-09-16 BBC Studios.zip,,0.0,BBC Brit South Africa,Studios,WOR


In [17]:
# spot check: list every row held for one specific channel
# NOTE(review): this cell was executed out of order (In [17]) - re-check after Restart & Run All
is_checked_channel = youtube_uv['Channel ID'] == 'UCyL1hGLVGqeZ1ak3DJeik7Q'
youtube_uv[is_checked_channel]

Unnamed: 0,Channel ID,Channel Name,ServiceID,Channel Group,Channel title,Unique viewers,w/c


## Storing

In [12]:
# columns that make up the stored unique-viewer dataset
cols = ['Channel ID', 'Channel Name', 'ServiceID', 'Channel Group',
        'Channel title', 'Unique viewers', 'w/c']
youtube_uv = youtube_uv[cols]

# clean cols: trim the identifier columns and replace missing values with an empty string
for id_col in ('ServiceID', 'Channel ID'):
    youtube_uv[id_col] = youtube_uv[id_col].str.strip().fillna('')
youtube_uv['Unique viewers'] = youtube_uv['Unique viewers'].fillna(0)
youtube_uv['w/c'] = pd.to_datetime(youtube_uv['w/c'])

# version that still includes advertising-driven reach
with_ads_csv_path = f"../data/processed/YouTube/_{gam_info['file_timeinfo']}_uniqueViewer_withAds.csv"
youtube_uv.to_csv(with_ads_csv_path, index=None)

## remove Advertising 

In [13]:
# read in ad dataset and align its column names with youtube_uv
cols = ['Channel', 'Week', '% reach to be removed']
youtube_ads = (
    pd.read_excel(f"../data/raw/Youtube/YouTube_advertising.xlsx", usecols=cols)
    .rename(columns={'Channel': 'Channel ID', 'Week': 'w/c'})
)

# merge datasets: channel-weeks without an advertising entry get a removal share of 0
youtube_uv_withAds = youtube_uv.merge(youtube_ads, how='left', on=['Channel ID', 'w/c'])
youtube_uv_withAds['% reach to be removed'] = youtube_uv_withAds['% reach to be removed'].fillna(0)

# organic dataset: subtract the advertised part of the unique viewers
# NOTE(review): assumes '% reach to be removed' is stored as a fraction (0-1), not 0-100 - confirm
removal_share = youtube_uv_withAds['% reach to be removed']
youtube_uv_organic = youtube_uv_withAds.drop(columns=['% reach to be removed'])
youtube_uv_organic['Unique viewers'] = (
    youtube_uv_organic['Unique viewers'] - youtube_uv_organic['Unique viewers'].mul(removal_share)
)
youtube_uv_organic.head()

Unnamed: 0,Channel ID,Channel Name,ServiceID,Channel Group,Channel title,Unique viewers,w/c
0,UCN7B-QD0Qgn2boVH5Q0pOWg,BBC News Hindi,HIN,BBC World Service,BBC News Hindi,22320780.0,2024-06-03
1,UCiTCB-B_weEmwHk7ifNobQw,BBC News Telugu,TEL,BBC World Service,BBC News Telugu,4292848.0,2024-06-03
2,UC7pluR6rB5KZIbN2IxamzxQ,BBC News Marathi,MAR,BBC World Service,BBC News Marathi,5299293.0,2024-06-03
3,UCb3TZ4SD_Ys3j4z0-8o6auA,BBC News 中文,MAN,BBC World Service,BBC News 中文,1968308.0,2024-06-03
4,UCelk6aHijZq-GJBBB9YpReA,BBC News عربي,ARA,BBC World Service,BBC News عربي,2618078.0,2024-06-03


## Testing 

In [14]:
# Rough per-channel benchmark: mean unique viewers over all rows of a channel, largest first
uv_by_channel = youtube_uv_organic.groupby(['Channel ID', 'ServiceID'])['Unique viewers']
channel_avg_uv = (
    uv_by_channel.mean()
    .reset_index(name='average_UV')
    .sort_values(by='average_UV', ascending=False)
)

## TODO make heatmap

In [15]:
# Total unique viewers per ServiceID and week (w/c)
uv_by_service_week = youtube_uv_organic.groupby(['ServiceID', 'w/c'])['Unique viewers']
sum_uv = uv_by_service_week.sum().reset_index(name='sum_UV')

# Mean unique viewers per ServiceID over all of its rows, largest first
uv_by_service = youtube_uv_organic.groupby(['ServiceID'])['Unique viewers']
avg_uv = (
    uv_by_service.mean()
    .reset_index(name='average_UV')
    .sort_values(by='average_UV', ascending=False)
)

## TODO make heatmap

# storing dataset

In [16]:
# final dataset with the advertising share removed, written without the index column
organic_csv_path = f"../data/processed/YouTube/{gam_info['file_timeinfo']}_uniqueViewer.csv"
youtube_uv_organic.to_csv(organic_csv_path, index=None)