## import libraries

In [1]:
import os
import zipfile

from tqdm import tqdm 
from datetime import datetime

import pandas as pd
pd.set_option('display.max_colwidth', None)

import numpy as np

import re

import yxdb

import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns 

import psycopg2

## import helpers and lookup tables

In [2]:
from config_GAM2025 import gam_info

from functions import execute_sql_query
import test_functions

In [3]:
# Show which lookup workbook the imported GAM config points at.
gam_info['lookup_file']

'GAM2025_Lookup.xlsx'

In [4]:
# Load the lookup sheets used throughout the notebook from the GAM lookup workbook.
# The workbook is opened once and each sheet is parsed from the same handle,
# rather than re-opening the file for every sheet.
lookup_path = f"../../{gam_info['lookup_file']}"

# Read the ID-like columns as text.
dtype_dict = {'Channel ID': 'str',
              'Linked FB Account': 'str'}

with pd.ExcelFile(lookup_path) as lookup_workbook:
    # country
    country_codes = pd.read_excel(lookup_workbook, sheet_name='CountryID')

    # week
    week_tester = pd.read_excel(lookup_workbook, sheet_name='GAM Period')

    # social media accounts
    socialmedia_accounts = pd.read_excel(lookup_workbook, dtype=dtype_dict,
                                         sheet_name='Social Media Accounts new')

week_tester['w/c'] = pd.to_datetime(week_tester['w/c'])
week_tester['week_ending'] = pd.to_datetime(week_tester['week_ending'])

# Keep only the active YouTube accounts.
socialmedia_accounts = socialmedia_accounts[(socialmedia_accounts['Platform'] == 'Youtube')
                                            &
                                            (socialmedia_accounts['Status'] == 'active')]
socialmedia_accounts = socialmedia_accounts.rename(columns={'Excluding UK': 'Channel Group'})

# Drop missing IDs first: a NaN would otherwise be formatted as the literal 'nan'
# inside the SQL IN (...) list built below.
channel_ids = socialmedia_accounts['Channel ID'].dropna().unique().tolist()
formatted_channel_ids = ', '.join(f"'{channel_id}'" for channel_id in channel_ids)
socialmedia_accounts.sample()

Unnamed: 0,Platform,Status,Channel ID,Channel Name,Service,ServiceID,Channel Group,Channel URL,Channel Username,Linked FB Account,Year
518,Youtube,active,UCSNvArbpIG1Kp6EGpx-QeZg,EastEnders,Studios,WOR,BBC Studios,,,,GAM2025


# automated extract (Redshift)

In [5]:
# Per-country 'views' rows for the selected channels, limited to the GAM
# week-ending window taken from the config.
sql_query = f""" 
    SELECT 
        yt_channel_id, 
        yt_subscribed_status, 
        yt_country_code,
        yt_metric_id, 
        yt_metric_period, 
        yt_metric_end_time, 
        yt_metric_value 
    FROM 
        redshiftdb.central_insights.yt_channel_insights 
    WHERE
        yt_metric_id = 'views' 
        AND
        yt_channel_id IN ({formatted_channel_ids})
        AND 
        yt_metric_end_time BETWEEN '{gam_info['weekEnding_start']}' AND '{gam_info['weekEnding_end']}'
    ;
    """

# The raw extract is cached on disk: Redshift is only queried when the CSV for
# this GAM period does not exist yet (delete the file to force a refresh).
# The data is always read back from the CSV so both paths yield the same dtypes.
raw_extract_path = f"../data/raw/Youtube/{gam_info['file_timeinfo']}_YT_geography_redshift_extract.csv"
if not os.path.exists(raw_extract_path):
    execute_sql_query(sql_query).to_csv(raw_extract_path, index=False)
yt_views = pd.read_csv(raw_extract_path)

# Align column names with the lookup tables so they can be joined later.
renaming = {'yt_channel_id': 'Channel ID',
            'yt_country_code': 'YouTube Codes',
            'yt_metric_end_time': 'week_ending',
            }
yt_views = yt_views.rename(columns=renaming)
yt_views['week_ending'] = pd.to_datetime(yt_views['week_ending'])
################################### Testing ################################### 
test_step = 'testing yt_channel_insights return from redshift'

column_name = 'Channel ID'
# test_step is passed here as well, matching the metadata check further down,
# so this test is presumably logged with its step name like the others.
test_functions.test_filter_elements_returned(yt_views, channel_ids, column_name, "1_YT_6", test_step)

test_functions.test_weeks_presence('week_ending', yt_views, week_tester, '1_YT_7', test_step)

test_functions.test_weeks_presence_per_account('week_ending', column_name, yt_views, week_tester, '1_YT_8', test_step)

################################### Testing ################################### 


...testing Channel ID...
Fail - not all elements were retrieved
...updating logbook...

All weeks are present in the dataset.
...updating logbook...

Missing weeks for each group:
     Week Number  YearGAE        w/c week_ending                Channel ID
0             14     2025 2024-04-01  2024-04-07  UCCiLIKu4E0FHkIGR0bqCPiw
1             36     2025 2024-09-02  2024-09-08  UCCiLIKu4E0FHkIGR0bqCPiw
2             49     2025 2024-12-02  2024-12-08  UCCiLIKu4E0FHkIGR0bqCPiw
3              6     2025 2025-02-03  2025-02-09  UCCiLIKu4E0FHkIGR0bqCPiw
4              9     2025 2025-02-24  2025-03-02  UCCiLIKu4E0FHkIGR0bqCPiw
..           ...      ...        ...         ...                       ...
221            9     2025 2025-02-24  2025-03-02  UCtRf5L5EPwrzFpMWS69Nelw
222           10     2025 2025-03-03  2025-03-09  UCtRf5L5EPwrzFpMWS69Nelw
223           11     2025 2025-03-10  2025-03-16  UCtRf5L5EPwrzFpMWS69Nelw
224           12     2025 2025-03-17  2025-03-23  UCtRf5L5EPwrzFpMWS69

In [6]:
# add PlaceID
# Left join keeps every view row; `_merge` records which rows found no PlaceID
# for their YouTube country code.
cols = ['PlaceID', 'YouTube Codes']
yt_views_cleanCountry = yt_views.merge(
    country_codes[cols],
    how='left',
    on=['YouTube Codes'],
    indicator=True,
)

################################### Testing ################################### 
test_step = 'adding country codes GAM'

test_functions.test_inner_join(yt_views, country_codes[cols], ['YouTube Codes'], '1_YT_9', test_step, focus='left')

################################### Testing ################################### 
# TODO add HM to GAM lookup
# List the YouTube codes that have no match in the country lookup.
is_unmatched = yt_views_cleanCountry['_merge'] == 'left_only'
yt_views_cleanCountry.loc[is_unmatched, 'YouTube Codes'].unique()

Inner join test 1_YT_9 failed: Issues found.
Issues with df_left (rows present in df_left but not in df_right)
...updating logbook...



array(['HM'], dtype=object)

In [7]:
# Views per channel, country (PlaceID) and week.
# NOTE(review): groupby drops rows whose PlaceID is NaN by default (e.g. codes
# missing from the lookup), while the all-countries total below still counts
# them, so shares for affected channel-weeks may sum to slightly under 1 -
# confirm this is intended.
per_country_keys = ['Channel ID', 'PlaceID', 'week_ending']
grouped_df_perCountry = (
    yt_views_cleanCountry
    .groupby(per_country_keys)['yt_metric_value']
    .sum()
    .reset_index(name='view_country')
)
display(grouped_df_perCountry.sample())

# Total views per channel and week across all countries.
per_week_keys = ['Channel ID', 'week_ending']
grouped_df_allCountries = (
    yt_views_cleanCountry
    .groupby(per_week_keys)['yt_metric_value']
    .sum()
    .reset_index(name='total_view_country')
)

# Share of each country in the channel's weekly total.
country_proportion = grouped_df_allCountries.merge(
    grouped_df_perCountry,
    how='inner',
    on=per_week_keys,
)
country_proportion['country_%'] = country_proportion['view_country'].div(
    country_proportion['total_view_country']
)

################################### Testing ################################### 
# todo: add a test that sums country % and needs to come to a very very very exact 100% (at least 8 decimals)
# NOTE(review): the ID "1_YT_9" is also used by the country-code join test
# earlier in this notebook - confirm the logbook can tell the two apart.
test_step = 'calculating % country'
cols= ['Channel ID', 'week_ending']
test_functions.test_inner_join(grouped_df_allCountries, grouped_df_perCountry, cols, "1_YT_9", test_step)

test_functions.test_merge_row_count(country_proportion, grouped_df_perCountry, '1_YT_10', test_step)

test_functions.test_percentage(country_proportion,  cols, '1_YT_11', test_step)

test_functions.test_larger_val(country_proportion,  'country_%', '1_YT_12', test_step, val=1)

################################### Testing ################################### 



Unnamed: 0,Channel ID,PlaceID,week_ending,view_country
550947,UC_QuRsbRQpaC4iS8WlxW0NA,ZAM,2025-03-23,528.0


Inner join test 1_YT_9 successful: No issues found.
...updating logbook...

...testing if merge leads to more rows on the metric side
pass! :)
...updating logbook...

...updating logbook...

Pass - No larger than 1 values
...updating logbook...



In [8]:
# Channel/week pairs present in the metadata table for the selected channels,
# limited to the GAM week-ending window taken from the config.
sql_query= f"""
    SELECT 
        yt_channel_id, 
        week_ending
    FROM 
        redshiftdb.central_insights.yt_channel_metadata 
    WHERE 
        yt_channel_id IN ({formatted_channel_ids})
        AND
        week_ending BETWEEN '{gam_info['weekEnding_start']}' AND '{gam_info['weekEnding_end']}'
        ;
"""

# The raw extract is cached on disk: Redshift is only queried when the CSV for
# this GAM period does not exist yet (delete the file to force a refresh).
# The data is always read back from the CSV so both paths yield the same dtypes.
metadata_path = f"../data/raw/Youtube/{gam_info['file_timeinfo']}_YT_geography_redshift_metadata.csv"
if not os.path.exists(metadata_path):
    execute_sql_query(sql_query).to_csv(metadata_path, index=False)
metadata = pd.read_csv(metadata_path)


renaming = {'yt_channel_id': 'Channel ID', }
metadata = metadata.rename(columns=renaming)
metadata['week_ending'] = pd.to_datetime(metadata['week_ending'])

################################### Testing ################################### 

test_step ='testing metadata return from redshift'

# Pass the list of expected channel IDs and the column to look in, mirroring the
# insights check. The previous call passed the string 'Channel ID' as the
# expected elements and relied on `column_name` left over from an earlier cell.
# NOTE(review): the recorded "Fail" output predates this change - re-run to confirm.
test_functions.test_filter_elements_returned(metadata, channel_ids, 'Channel ID', '1_YT_13', test_step)

test_functions.test_weeks_presence('week_ending', metadata, week_tester, '1_YT_14', test_step)

test_functions.test_weeks_presence_per_account('week_ending', 'Channel ID', metadata, week_tester, '1_YT_15', test_step)

################################### Testing ################################### 


...testing Channel ID...
Fail - not all elements were retrieved
...updating logbook...

All weeks are present in the dataset.
...updating logbook...

All weeks are present in the dataset for each group.
...updating logbook...



In [9]:
# Keep only channel-weeks that also appear in the metadata extract, attach the
# week-commencing date from the GAM period lookup, and reduce to the output columns.
country_cols = ['w/c', 'Channel ID', 'PlaceID', 'country_%', ]
automated_country = (
    country_proportion
    .merge(metadata, how='inner', on=['Channel ID', 'week_ending'])
    .merge(week_tester[['w/c', 'week_ending']], how='left', on=['week_ending'])
    [country_cols]
)
################################### Testing ################################### 
test_step = 'combining country metric and metadata'

test_functions.test_inner_join(country_proportion, metadata, ['Channel ID', 'week_ending'], '1_YT_16', test_step)

test_functions.test_merge_row_count(country_proportion, automated_country, '1_YT_17', test_step)
################################### Testing ################################### 




Inner join test 1_YT_16 failed: Issues found.
Issues with df_right (rows present in df_right but not in df_left)
...updating logbook...

...testing if merge leads to more rows on the metric side
pass! :)
...updating logbook...



# manual exports

## import media action

In [10]:
# YouTube country code -> PlaceID mapping, read from the CountryID lookup sheet.
country_sheet = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='CountryID')
country_map = country_sheet[['PlaceID', 'YouTube Codes']]

In [11]:
#TODO: review with minnie for individual exports
#because it contains geography or should we use table instead?

# Read the 'Chart data' sheet of every .xlsx export in the manual folder and
# stack the results into one frame.
path = f"../data/raw/Youtube/{gam_info['file_timeinfo']}_manual/"
dataframes = []

# sorted() makes the load order independent of the filesystem's listing order.
for filename in sorted(os.listdir(path)):
    if not filename.endswith('.xlsx'):  # Assuming the files are excel files
        continue

    try:
        df = pd.read_excel(os.path.join(path, filename), sheet_name='Chart data')
    except Exception as exc:
        # Best effort: report the file that could not be read (and why) and move
        # on. Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        print(filename, repr(exc))
        continue

    # The text before the first ' - ' in the file name is used for both the
    # channel ID and the channel title.
    channel_label = filename.split('.')[0].split(' - ')[0]
    df['Channel ID'] = channel_label
    df['Channel title'] = channel_label
    df['source_path'] = path+filename

    dataframes.append(df)
media_action_df = pd.concat(dataframes)

def get_week_dates(date):
    """Return the Monday-to-Sunday week that starts the day after ``date``.

    Parameters
    ----------
    date : datetime-like
        Must fall on a Sunday (``date.weekday() == 6``).

    Returns
    -------
    tuple
        ``(from_date, to_date)``: the Monday one day after ``date`` and the
        Sunday seven days after ``date``.

    Raises
    ------
    ValueError
        If ``date`` is not a Sunday.
    """
    sunday = 6
    if date.weekday() != sunday:
        raise ValueError("The input date must be a Sunday.")

    week_start = date + pd.Timedelta(days=1)
    week_end = date + pd.Timedelta(days=7)
    return week_start, week_end

media_action_df['Date'] = pd.to_datetime(media_action_df['Date'])

# Derive the week-commencing / week-ending dates from each row's Date
# (get_week_dates raises if a Date is not a Sunday).
media_action_df['w/c'], media_action_df['week_ending'] = zip(*media_action_df['Date'].apply(get_week_dates))
media_action_df = media_action_df.rename(columns={'Geography': 'YouTube Codes'})
media_action_df = media_action_df.merge(country_map, on='YouTube Codes', how='left')
# Sum Views per week, channel, country (PlaceID) and source file.
# NOTE(review): rows whose YouTube code has no PlaceID in the lookup are dropped
# here, because groupby excludes NaN keys by default - confirm that is acceptable.
media_action_df = media_action_df.groupby(['w/c', 'week_ending', 'Channel ID', 'Channel title', 'PlaceID', 'source_path']).agg({'Views': 'sum'}).reset_index()

media_action_df['Channel Group'] = 'BBC Media Action'

# Map the label taken from the file name to the channel ID used downstream.
# Kept under its own name so the `channel_ids` list of YouTube channel IDs
# defined near the top of the notebook is not overwritten by this dict.
media_action_channel_ids = {'Aksi Kita Indonesia': 'aksikitaindo', }
media_action_df['Channel ID'] = media_action_df['Channel ID'].replace(media_action_channel_ids)

In [12]:
# Weekly total views per channel, used as the denominator of each country's share.
weekly_channel_keys = ['w/c', 'Channel ID']
_ma_global = media_action_df.groupby(weekly_channel_keys)['Views'].sum().reset_index()

# Attach the weekly total to every country row, compute the share and keep the
# same columns (in the same order) as the automated extract.
ma_country_df = (
    media_action_df
    .merge(_ma_global, how='left', on=weekly_channel_keys,
           suffixes=['_country', '_global'])
    .assign(**{'country_%': lambda frame: frame['Views_country'] / frame['Views_global']})
    [automated_country.columns]
)


In [13]:
# Stack the automated (Redshift) shares and the manual Media Action shares.
country_frames = [automated_country, ma_country_df]
youtube_country = pd.concat(country_frames)

In [23]:
# Preview the first rows of the combined automated + manual country shares.
youtube_country.head()

Unnamed: 0,w/c,Channel ID,PlaceID,country_%
0,2024-04-01,UC0JypFmHP-9wh5A3JjJLpcA,AFG,7e-06
1,2024-04-01,UC0JypFmHP-9wh5A3JjJLpcA,ALB,7e-06
2,2024-04-01,UC0JypFmHP-9wh5A3JjJLpcA,ALG,1.4e-05
3,2024-04-01,UC0JypFmHP-9wh5A3JjJLpcA,ANG,1e-06
4,2024-04-01,UC0JypFmHP-9wh5A3JjJLpcA,ARG,4e-06


## import Serbian & Sinhala Dataset

In [17]:
# Load the Serbian/Sinhala sheet, rename its columns to the names used in this
# notebook, attach PlaceID via the YouTube country code and keep the output columns.
ser_sin_df = (
    pd.read_excel("../data/raw/Youtube/serbian sinhala youtube.xlsx",
                  sheet_name='SERSIN')
    .rename(columns={'Geography': 'YouTube Codes',
                     'Channel': 'Channel ID',
                     'Total': 'country_%'})
    # join country codes
    .merge(country_codes[['YouTube Codes', 'PlaceID']],
           how='left', on=['YouTube Codes'], indicator=True)
    [country_cols]
)

ser_sin_df.columns

Index(['w/c', 'Channel ID', 'PlaceID', 'country_%'], dtype='object')

In [18]:
# Find rows of ser_sin_df that are NOT already in youtube_country.
# Rows are compared as whole tuples, i.e. all four columns must match exactly.
# NOTE(review): this includes the float 'country_%' value, so the same
# week/channel/country with a different share counts as a new row - confirm intended.
existing_row_tuples = youtube_country.apply(tuple, axis=1)
candidate_row_tuples = ser_sin_df.apply(tuple, axis=1)
additional_rows = ser_sin_df[~candidate_row_tuples.isin(existing_row_tuples)]

# Append the new rows to youtube_country under a fresh 0..n-1 index.
youtube_country_2 = pd.concat([youtube_country, additional_rows], ignore_index=True)

In [25]:
# Preview the first rows of the Serbian/Sinhala country shares.
ser_sin_df.head()

Unnamed: 0,w/c,Channel ID,PlaceID,country_%
0,2024-04-01,UCCrAKchnDFMhrKeXSYWXjGA,UAE,0.0
1,2024-04-01,UCCrAKchnDFMhrKeXSYWXjGA,ALB,0.000799
2,2024-04-01,UCCrAKchnDFMhrKeXSYWXjGA,ARG,0.0
3,2024-04-01,UCCrAKchnDFMhrKeXSYWXjGA,OST,0.047364
4,2024-04-01,UCCrAKchnDFMhrKeXSYWXjGA,AUS,0.017587


# store dataset

In [16]:
# Write the combined country shares for this GAM period to the processed folder.
# index=False (rather than None) matches the documented bool parameter and the
# other to_csv calls in this notebook; the row index is not written.
processed_path = f"../data/processed/Youtube/{gam_info['file_timeinfo']}_country.csv"
youtube_country_2.to_csv(processed_path, index=False)