In [None]:
# Install Packages
!pip install google-api-python-client google-cloud-bigquery-storage pycountry



In [None]:
# --- 1. SETUP AND AUTHENTICATION ---

from googleapiclient.discovery import build
from google.cloud import bigquery
from datetime import datetime
import pandas as pd
import pycountry
import time
import os

# Initialize the YouTube Data API client
# os.getenv returns None when the variable is unset; stop here with a clear
# message instead of handing None to the client and reporting success.
youtube_api_key = os.getenv("YOUTUBE_API_KEY")
if not youtube_api_key:
    raise RuntimeError("YOUTUBE_API_KEY environment variable is not set")

youtube = build("youtube", "v3", developerKey=youtube_api_key)

print("✅ YouTube client initialized successfully")

✅ YouTube client initialized successfully


In [None]:
# --- 2. DEFINE REQUEST PARAMETERS ---

# --- Define African regions with all 54 countries ---
# Keys are region labels; values are two-letter country codes that are later
# passed as `regionCode` to the API and looked up via pycountry (alpha_2).
regions = {
    "East Africa": ["KE", "UG", "TZ", "RW", "BI", "ET", "SO", "DJ", "ER", "SC", "MG", "MU", "SS", "KM"],
    "West Africa": ["NG", "GH", "CI", "SN", "TG", "SL", "LR", "GM", "BF", "BJ", "NE", "ML", "GN", "GW", "CV", "MR"],
    "North Africa": ["EG", "DZ", "MA", "TN", "LY", "SD"],
    "Southern Africa": ["ZA", "NA", "BW", "MZ", "ZW", "ZM", "LS", "SZ", "AO", "MW"],
    "Central Africa": ["CM", "CD", "CG", "GA", "GQ", "CF", "TD", "ST"]
}

# The API returns results in pages; this keeps track of which page to fetch next
next_page_token = None

In [None]:
# --- Initialize ---
# One dict per (country, trending video); turned into a DataFrame later.
videos = []

for region, countries in regions.items():
    print(f"Collecting data for {region} ✅")

    for country in countries:
        # --- Get full country name ---
        # An unknown code may come back as None (AttributeError on `.name`)
        # or raise a lookup error; either way fall back to a placeholder.
        try:
            country_name = pycountry.countries.get(alpha_2=country).name
        except (AttributeError, LookupError):
            country_name = "Unknown"

        try:
            # --- Create a category map ---
            # Fetched once per country (not once per page), using the country
            # currently being processed as the region code.
            category_response = youtube.videoCategories().list(
                part="snippet",
                regionCode=country
            ).execute()

            category_map = {
                item["id"]: item["snippet"]["title"]
                for item in category_response.get("items", [])
            }

            # Start from the first page for every country.
            next_page_token = None
            while True:
                request = youtube.videos().list(
                    part="snippet,statistics",
                    chart="mostPopular",
                    regionCode=country,
                    maxResults=50,
                    pageToken=next_page_token
                )

                response = request.execute()

                for item in response.get("items", []):
                    snippet = item.get("snippet", {})
                    stats = item.get("statistics", {})

                    # Get category info
                    category_id = snippet.get("categoryId")
                    category_name = category_map.get(category_id, "Unknown")

                    videos.append({
                        "country_code": country,
                        "country_name": country_name,
                        "region": region,
                        "video_id": item.get("id"),
                        "title": snippet.get("title"),
                        "description": snippet.get("description"),
                        "channel_title": snippet.get("channelTitle"),
                        "category_id": category_id,
                        "category_name": category_name,
                        "published_at": snippet.get("publishedAt"),
                        "tags": snippet.get("tags", []),
                        # Missing statistics default to 0.
                        "view_count": int(stats.get("viewCount", 0)),
                        "like_count": int(stats.get("likeCount", 0)),
                        "comment_count": int(stats.get("commentCount", 0))
                    })

                # No token in the response means the last page was reached.
                next_page_token = response.get("nextPageToken")
                if not next_page_token:
                    break

        except Exception as e:
            # Best effort: keep going with the remaining countries, but say why
            # this one was skipped so real failures are not hidden.
            print(f"  ⚠️ Skipping {country} — Not supported or error occurred: {e}")
            continue

print(f"Total videos retrieved across Africa: {len(videos)} ✅")

✅ Total videos retrieved: 200


In [None]:
# Build the frame from the collected records, keep the first occurrence of
# each (video, country, region) combination, and renumber the rows from 0.
dedupe_columns = ["video_id", "country_code", "country_name", "region"]
bigdata = (
    pd.DataFrame(videos)
    .drop_duplicates(subset=dedupe_columns, keep="first")
    .reset_index(drop=True)
)

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
bigdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_id       200 non-null    object
 1   title          200 non-null    object
 2   description    200 non-null    object
 3   channel_title  200 non-null    object
 4   category_id    200 non-null    object
 5   category_name  200 non-null    object
 6   published_at   200 non-null    object
 7   tags           200 non-null    object
 8   view_count     200 non-null    int64 
 9   like_count     200 non-null    int64 
 10  comment_count  200 non-null    int64 
dtypes: int64(3), object(8)
memory usage: 17.3+ KB


In [None]:
# Display the collected records as the cell output.
bigdata

Unnamed: 0,video_id,title,description,channel_title,category_id,category_name,published_at,tags,view_count,like_count,comment_count
0,ws9CaAY6oww,BAHATI - BYE BYE BABA (Raila Odinga Tribute Song),#bahati #bahatikenya #byebyebaba #tribute #rai...,Bahati Kenya,10,Music,2025-10-16T07:00:07Z,"[Bahati, Bahati kenya, kenya, baba, bye bye ba...",736076,58828,5071
1,tWi6Hp0ValI,TWIST OF JUSTICE ( Official Trailer ),,KADUSKO Tv,23,Comedy,2025-10-04T07:00:11Z,"[KADUSKO Tv, KADUSKO, Comedy]",114836,677,57
2,Gn7k2rDy7yo,Why Did Mikey family Leave the Village !?,"Today, Mikey's family has suddenly left! But w...",Maizen,20,Gaming,2025-10-11T11:00:24Z,"[Minecraft, Maizen]",2108367,11719,809
3,VrSC_SBgPf4,PRINCE INDAH - TRIBUTE TO RT. HON RAILA ODINGA,Artist : Prince Indah\nTitle : Tribute To Rt. ...,"Prince Indah, OGW",10,Music,2025-10-16T03:31:00Z,"[Prince Indah, Tribute To Raila Odinga, Raila,...",389082,27398,3350
4,R4wiXj9NmEE,Send Help | Official Trailer | In Theaters Jan 30,Rachel McAdams. Dylan O’Brien. SEND HELP.\n\nD...,20th Century Studios,1,Film & Animation,2025-10-14T16:00:36Z,[Trailer],2586229,51760,3662
...,...,...,...,...,...,...,...,...,...,...,...
195,RjtIm83MQdk,Masicka - Keys (Official Video),Stream/Download: https://masicka.lnk.to/Keys\n...,Masicka,10,Music,2025-10-10T16:00:07Z,"[Masicka, Dancehall, Jamaica, Jamaican, Danceh...",515276,25061,2173
196,WFha8pWTNac,Adopted by 67 KID FAMILY in Minecraft!,Drift is saved by the 67 KID FAMILY in Minecra...,Drift,20,Gaming,2025-10-05T02:00:35Z,"[drift, minecraft, minecraft mod, Playing As R...",1175536,20315,5237
197,PtU5SVopt4g,LALA SALAMA BABA RAILA AMOLO ODINGA BY KINYAMB...,#princeindah #odongoswagg #railaodinga,Kinyambu Boys Band,10,Music,2025-10-16T20:13:43Z,[],1667,333,44
198,FOufoMTfZdI,"Plants Vs Brainrots, But It's MODDED..",Today we played modded plants vs brainrots in ...,FoltynPlays,20,Gaming,2025-10-10T19:12:59Z,"[Foltyn, foltynplays, roblox, roblox gaming, f...",754095,13552,984


In [None]:
# Count how many collected rows fall into each category name.
bigdata['category_name'].value_counts()

category_name
Music               86
Gaming              80
Entertainment       15
People & Blogs       7
Film & Animation     3
Sports               3
Comedy               2
Education            2
Unknown              2
Name: count, dtype: int64

In [None]:
# Create the BigQuery client bound to the project that holds the dataset
project_id = 'project-adrian-aluoch'
client = bigquery.Client(project=project_id)

In [None]:
# Convert each row's tag list to its string form so the column can be loaded as text.
bigdata['tags'] = bigdata['tags'].astype(str)

In [None]:
# Define Table ID
table_id = 'project-adrian-aluoch.youtube_trends_ke.trends'

# Export Data to BigQuery
job = client.load_table_from_dataframe(bigdata, table_id)

# result() blocks until the load job finishes and raises if the job failed;
# polling job.state alone would report DONE for a failed job as well.
job.result()
print(job.state)

DONE


In [None]:
# Define SQL Query to Retrieve YouTube Trends Data from Google Cloud BigQuery
# Adjacent string literals are concatenated as-is, so the separating space
# must be part of the first literal.
sql = (
    'SELECT * '
    'FROM `project-adrian-aluoch.youtube_trends_ke.trends`'
)

# Run SQL Query
data = client.query(sql).to_dataframe()

In [None]:
# Deduplicate per video *and* country (matching how the freshly collected
# frame is deduplicated) so a video trending in several countries keeps one
# row per country. Fall back to video_id alone if the stored table has no
# country_code column.
dedupe_keys = ['video_id', 'country_code'] if 'country_code' in data.columns else ['video_id']

# Check Total Number of Duplicate Records
duplicated = data.duplicated(subset=dedupe_keys).sum()
print(f"Duplicate records found: {duplicated}")

# Remove Duplicate Records
data = data.drop_duplicates(subset=dedupe_keys)

# Delete Original Table
# Done only after the in-memory cleanup succeeded; not_found_ok makes the
# cell safe to re-run once the table is already gone.
client.delete_table(table_id, not_found_ok=True)
print("Table deleted successfully.")

Table deleted successfully.


In [None]:
# Cast tags to string before the reload; the table schema declares tags as STRING.
data['tags'] = data['tags'].astype(str)

In [None]:
# Define the dataset ID and table ID
dataset_id = 'youtube_trends_ke'
table_id = 'trends'

# Define the BigQuery schema for YouTube trending videos
# Field names mirror the keys of the records collected from the API.
schema = [
    bigquery.SchemaField("country_code", "STRING"),
    bigquery.SchemaField("country_name", "STRING"),
    bigquery.SchemaField("region", "STRING"),
    bigquery.SchemaField("video_id", "STRING"),
    bigquery.SchemaField("title", "STRING"),
    bigquery.SchemaField("description", "STRING"),
    bigquery.SchemaField("channel_title", "STRING"),
    bigquery.SchemaField("category_id", "STRING"),
    bigquery.SchemaField("category_name", "STRING"),
    bigquery.SchemaField("published_at", "STRING"),
    bigquery.SchemaField("tags", "STRING"),  # tag lists are stored in string form
    bigquery.SchemaField("view_count", "INTEGER"),
    bigquery.SchemaField("like_count", "INTEGER"),
    bigquery.SchemaField("comment_count", "INTEGER")
]

# Define the table reference
# Built from an explicit DatasetReference in the client's project, replacing
# the deprecated client.dataset() helper.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

# Create the table object
table = bigquery.Table(table_ref, schema=schema)

try:
    # Create the table in BigQuery
    table = client.create_table(table)
    print(f"Table {table.table_id} created successfully.")
except Exception as e:
    # Report the reason instead of discarding it; execution continues so the
    # remaining cells can still be run.
    print(f"Table {table.table_id} failed: {e}")

Table trends created successfully.


In [None]:
# Define the BigQuery table ID
table_id = 'project-adrian-aluoch.youtube_trends_ke.trends'

# Load the data into the BigQuery table
job = client.load_table_from_dataframe(data, table_id)

# Wait for the job to complete
# result() blocks until the job finishes and raises if it failed, so a broken
# reload is not mistaken for a finished one.
job.result()
print(job.state)

DONE
