In [1]:
import json 
import csv
import pandas as pd
import os

# Go up one dir

if not os.path.exists("ads/"):
    os.chdir("..")


In [2]:
import warnings
warnings.filterwarnings("ignore")

run_ids = ["20230518_1957", "20230526_1750", "20230527_2230",
               "20230530_1949", "20230529_2213", "20230601_1751",
               "20230606_0332", "20230606_1947", "20230615_2208"]
        
def get_tagging(info, database):
    # info is a tuple (video_url, site)
    
    url, site = info
    
    url_db = database[database["preroll_ad_video_url"]==url]
    if len(url_db) != 0:
        return url_db.iloc[0]["tag"], url_db.iloc[0]["is_scam"]
    
    site_db = database[database["preroll_ad_site"]==site]
    if len(site_db) != 0:
        return site_db.iloc[0]["tag"], site_db.iloc[0]["is_scam"]
    
    return None, None


# build the database
def build_database():
    tag_header = ["preroll_ad_video_url", "preroll_ad_site", "preroll_ad_advertiser",
                  "preroll_ad_location", "id", "tag", "is_scam"]

    db = pd.read_excel("ads/preroll_ads_tagging.xlsx", sheet_name = run_ids)
    dfs = []

    for run_id in run_ids:
        df = db.get(run_id)
        df = df[df['preroll_ad_video_url'].notna()]
        df['run_id'] = run_id
        dfs.append(df)

    database = pd.concat(dfs, ignore_index=True)
    return database

In [3]:
database = build_database()
database

Unnamed: 0,preroll_ad_video_url,preroll_ad_site,preroll_ad_advertiser,preroll_ad_location,tag,is_scam,Notes,run_id,df_index,id
0,https://www.youtube.com/watch?v=LCTqyxEUmHU,parasitesnomore.com,PureHealth Research LLC,United States,Health Products,1.0,,20230518_1957,,
1,https://www.youtube.com/watch?v=DcD1tOMiIvk,pacificrimathletics.com/online-reg,Pacific Rim Athletics,United States,Fitness,1.0,"athletic training, 10k for a short course",20230518_1957,,
2,https://www.youtube.com/watch?v=jR-gYFNLVxs,click.doodly.com,Voomly LLC,United States,Software Services,0.0,sketchy hidden subscription,20230518_1957,,
3,https://www.youtube.com/watch?v=PahO1czqaqM,go.masterclass-piano.com,Creators Secrets Inc.,United States,Educational Services,1.0,,20230518_1957,,
4,https://www.youtube.com/watch?v=kuQIm72_H3o,cf.spybriefing.com,,,Fitness,1.0,"self-defense training, very sketchy",20230518_1957,,
...,...,...,...,...,...,...,...,...,...,...
1322,https://www.youtube.com/watch?v=Niyh4CkH7RY,waldenu.edu,,,For-profit University,,,20230615_2208,,4927.0
1323,https://www.youtube.com/watch?v=znjwdCCguBw,hiddenvalley.com,,,Food/Drinks,,,20230615_2208,,4944.0
1324,https://www.youtube.com/watch?v=vVmVBwkqMoU,onetrust.com,"OneTrust, LLC",United States,Software Services,,,20230615_2208,,4964.0
1325,https://www.youtube.com/watch?v=QL_3IL1wnqE,modelousa.com,"Constellation Brands, Inc",United States,Food/Drinks,,,20230615_2208,,4976.0


In [4]:
from collections import Counter


info = ["video_url",
          "preroll_ad_video_url", "preroll_ad_site","preroll_ad_info",
          "preroll_ad2_video_url", "preroll_ad2_site","preroll_ad2_info",]

header = ["id"] + info + ["tag", "is_scam", "tag2", "is_scam2"]
df_data = []
preroll_tags, preroll2_tags, preroll_scam, preroll2_scam = [], [], [], []

for id in run_ids:

    ad_json_file = open(f"ads/ads_{id}.json", "r", encoding="utf-8")

    for line in ad_json_file:
        json_data = json.loads(line)

        if json_data['preroll_ad_video_url'] and json_data['preroll_ad2_video_url']:
            if json_data['preroll_ad_video_url'] != json_data['video_url'] and json_data['preroll_ad2_video_url'] != json_data['video_url']:
                
                video_url, site = json_data["preroll_ad_video_url"], json_data["preroll_ad_site"]
                video_url2, site2 = json_data["preroll_ad2_video_url"], json_data["preroll_ad2_site"]
                tag, is_scam = get_tagging((video_url, site), database)
                tag2, is_scam2 = get_tagging((video_url2, site2), database)
                
                preroll_tags.append(tag)
                preroll2_tags.append(tag2)
                if is_scam:
                    preroll_scam.append(tag)
                if is_scam2:
                    preroll2_scam.append(tag2)
                
                data = [id] + [json_data[cate] for cate in info] + [tag, is_scam, tag2, is_scam2]
                df_data.append(data)
    
df = pd.DataFrame(df_data, columns=header)
df

Unnamed: 0,id,video_url,preroll_ad_video_url,preroll_ad_site,preroll_ad_info,preroll_ad2_video_url,preroll_ad2_site,preroll_ad2_info,tag,is_scam,tag2,is_scam2
0,20230518_1957,https://www.youtube.com/watch?v=6v4SLn-_GeE,https://www.youtube.com/watch?v=BuIz-f6uUcM,subscribe.theepochtimes.com,"[MARKETFUEL SUBSCRIPTION SERVICES, United States]",https://www.youtube.com/watch?v=T8fGIon1JZM,smartinvestorsdaily.com,"[Powderday Digital, LLC, United States]",News,0.0,Financial,1.0
1,20230518_1957,https://www.youtube.com/watch?v=HC05gzknxAI,https://www.youtube.com/watch?v=3l7BbB4KDAs,betterhelp.com,"[BetterHelp, Inc., United States]",https://www.youtube.com/watch?v=c0ljp5YB1OE,wellness-hub.life,"[None, None]",Health Services,0.0,Health Products,1.0
2,20230518_1957,https://www.youtube.com/watch?v=EY5uZrnAJ2Y,https://www.youtube.com/watch?v=1B9v3GbE2G4,temu.com,"[None, None]",https://www.youtube.com/watch?v=akLmxtIAoC4,livingproof.com,"[Living Proof inc., United States]",Electronics,0.0,Beauty,0.0
3,20230518_1957,https://www.youtube.com/watch?v=4r-4mqy4F7Y,https://www.youtube.com/watch?v=kEMXZu2scKE,grammarly.com,"[Grammarly, Inc., United States]",https://www.youtube.com/watch?v=0PoMVDor5tY,wellness-hub.life,"[Kuyami Inc, United States]",Software Services,0.0,Health Products,1.0
4,20230518_1957,https://www.youtube.com/watch?v=e54vOGWJIqs,https://www.youtube.com/watch?v=aEIZBfWF_jA,grammarly.com,"[Grammarly, Inc., United States]",https://www.youtube.com/watch?v=MopERx1yXvQ,go.overwolf.com/free-download,"[Overwolf LTD, Israel]",Software Services,0.0,Games,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
312,20230615_2208,https://www.youtube.com/watch?v=iOJwhtqyfMg,https://www.youtube.com/watch?v=NmDtVQidvms,brooksrunning.com,"[None, None]",https://www.youtube.com/watch?v=7X9F_bTD3rc,benefiber.com/fiber-supplement-p...,"[None, None]",Clothing,,Health Products,0.0
313,20230615_2208,https://www.youtube.com/watch?v=CCDUJ54sCIM,https://www.youtube.com/watch?v=Kz8np8t7PIw,usa.philips.com/dental-professio...,"[Philips North America LLC, United States]",https://www.youtube.com/watch?v=LsDsmVSI0hI,youtube.com,"[UNIVERSITY OF CINCINNATI, United States]",Health Services,0.0,Nonprofit University,
314,20230615_2208,https://www.youtube.com/watch?v=LBXYLKCveBk,https://www.youtube.com/watch?v=ke3N8IMESW4,dc.com,"[Warner Brothers Entertainment Inc., United St...",https://www.youtube.com/watch?v=-67OAjLlCe8,macys.com,"[None, None]",Entertainment,,Beauty,0.0
315,20230615_2208,https://www.youtube.com/watch?v=oKTNyB2aCOU,https://www.youtube.com/watch?v=h3vdA3jIvHU,tums.com/,"[None, None]",https://www.youtube.com/watch?v=FNwDZc0MNGU,dc.com,"[Warner Brothers Entertainment Inc., United St...",Health Products,0.0,Entertainment,


In [13]:
import matplotlib.pyplot as plt

counter = Counter(preroll_tags).most_common()
counter2 = Counter(preroll2_tags)
scam_counter = Counter(preroll_scam)
scam_counter2 = Counter(preroll2_scam)

cate = [k for k, v in counter]
preroll_ad_count = [v for k,v in counter]
preroll_ad2_count = [counter2[i] for i in cate]
preroll_ad_scam = [scam_counter[k] if k in scam_counter.keys() else 0 for k in cate]
preroll_ad2_scam = [scam_counter2[k] if k in scam_counter2.keys() else 0 for k in cate]
count_dict = {
    "category": cate, 
    "preroll_ad": preroll_ad_count, 
    "preroll_ad2": preroll_ad2_count,
    "preroll_ad_scam": preroll_ad_scam,
    "preroll_ad2_scam": preroll_ad2_scam,
}
df_count = pd.DataFrame(count_dict)
display(df_count)
total = sum(counter2.values())
print(f"Number of pairs: {total}")
print(f"Number of scam ads in preroll 1: {sum(preroll_ad_scam)}/{total} = {sum(preroll_ad_scam)/total * 100:.1f}%")
print(f"Number of scam ads in preroll 2: {sum(preroll_ad2_scam)}/{total} = {sum(preroll_ad2_scam)/total * 100:.1f}%")

Unnamed: 0,category,preroll_ad,preroll_ad2,preroll_ad_scam,preroll_ad2_scam
0,Games,83,104,1,1
1,Software Services,79,56,3,1
2,Utility/Services,25,19,2,3
3,Food/Drinks,15,8,1,0
4,Entertainment,14,12,5,1
5,Home Goods,11,6,1,4
6,Financial,9,13,5,9
7,Health Products,9,21,2,15
8,Beauty,9,3,1,0
9,Consumer Hardware,8,7,0,2


Number of pairs: 317
Number of scam ads in preroll 1: 26/317 = 8.2%
Number of scam ads in preroll 2: 69/317 = 21.8%
