In [1]:
import json
import csv
import pandas as pd
import os

# Every input below is opened through a relative "ads/..." path. When the
# current working directory has no "ads/" entry, step up one directory
# (presumably the notebook was launched from a subfolder of the project --
# TODO confirm). Nothing checks that "ads/" exists after the move.

if not os.path.exists("ads/"):
    os.chdir("..")

In [2]:
import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Identifiers of the crawl runs. Each one is used both as a sheet name in the
# tagging workbooks and inside the "ads/ads_<id>.json" file name.
conspiracy_ids = [
    "20230527_2230",
    "20230530_1949",
    "20230615_2208",
    "20230518_1957",
    "20230526_1750",
]

mainstream_ids = [
    "20230529_2213",
    "20230606_0332",
    "20230601_1751",
    "20230606_1947",
]

# Conspiracy runs first, then mainstream runs.
run_ids = [*conspiracy_ids, *mainstream_ids]
        
def get_preroll_tag(info, database):
    """Look up the manual tag of a pre-roll ad.

    Parameters
    ----------
    info : tuple
        ``(video_url, site)`` of the ad.
    database : pandas.DataFrame
        Tagging table with the columns ``preroll_ad_video_url``,
        ``preroll_ad_site``, ``tag`` and ``is_scam``.

    Returns
    -------
    tuple
        ``(tag, is_scam)`` taken from the first row whose
        ``preroll_ad_video_url`` equals the ad's video URL; if no such row
        exists, from the first row whose ``preroll_ad_site`` equals the ad's
        site; ``(None, None)`` when neither lookup matches.
    """
    video_url, ad_site = info

    # Ordered by priority: an exact video match wins over a site match.
    lookups = (
        ("preroll_ad_video_url", video_url),
        ("preroll_ad_site", ad_site),
    )
    for column, wanted in lookups:
        hits = database[database[column] == wanted]
        if len(hits) > 0:
            first = hits.iloc[0]
            return first["tag"], first["is_scam"]

    return None, None


def get_side_tag(info, database):
    """Look up the manual tag of a side (banner) ad.

    Parameters
    ----------
    info : tuple
        ``(side_ad_img, site)`` of the ad.
    database : pandas.DataFrame
        Tagging table with the columns ``side_ad_img``, ``side_ad_site``,
        ``tag`` and ``is_scam``.

    Returns
    -------
    tuple
        ``(tag, is_scam)`` of the first row matching the image; failing
        that, of the first row matching the site; ``(None, None)`` when
        there is no match at all.
    """
    ad_img, ad_site = info

    # dicts keep insertion order, so the image lookup is tried before the site.
    lookups = {"side_ad_img": ad_img, "side_ad_site": ad_site}
    for column, wanted in lookups.items():
        candidates = database.loc[database[column] == wanted]
        if len(candidates) == 0:
            continue
        row = candidates.iloc[0]
        return row["tag"], row["is_scam"]

    return None, None


# build the database
def build_preroll_database():
    """Load the tagged pre-roll ads of every run into a single DataFrame.

    Reads one sheet per entry of the module-level ``run_ids`` from
    ``ads/preroll_ads_tagging.xlsx``, drops the rows that have no
    ``preroll_ad_video_url``, adds a ``run_id`` column naming the sheet each
    row came from, and concatenates the sheets with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        All kept rows of all sheets, in ``run_ids`` order.
    """
    # With a list for sheet_name, read_excel returns a dict {sheet name: frame}.
    sheets = pd.read_excel("ads/preroll_ads_tagging.xlsx", sheet_name=run_ids)

    dfs = []
    for run_id in run_ids:
        sheet = sheets[run_id]
        tagged = sheet[sheet["preroll_ad_video_url"].notna()]
        # assign() returns a new frame, so nothing is written into a filtered
        # slice of the sheet (which is what triggers SettingWithCopyWarning).
        dfs.append(tagged.assign(run_id=run_id))

    return pd.concat(dfs, ignore_index=True)

def build_side_database():
    """Load the tagged side ads of every run into a single DataFrame.

    Reads one sheet per entry of the module-level ``run_ids`` from
    ``ads/side_ads_tagging.xlsx``, drops the rows that have no
    ``side_ad_img``, adds a ``run_id`` column naming the sheet each row came
    from, and concatenates the sheets with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        All kept rows of all sheets, in ``run_ids`` order.
    """
    # With a list for sheet_name, read_excel returns a dict {sheet name: frame}.
    sheets = pd.read_excel("ads/side_ads_tagging.xlsx", sheet_name=run_ids)

    dfs = []
    for run_id in run_ids:
        sheet = sheets[run_id]
        tagged = sheet[sheet["side_ad_img"].notna()]
        # assign() returns a new frame, so nothing is written into a filtered
        # slice of the sheet (which is what triggers SettingWithCopyWarning).
        dfs.append(tagged.assign(run_id=run_id))

    return pd.concat(dfs, ignore_index=True)

In [3]:
# Build both tagging tables once; get_reasons() reads them as module-level
# globals when it classifies each ad.
preroll_database = build_preroll_database()
side_database = build_side_database()

In [4]:

def get_reasons(id):
    """Count the ads of one run and collect their targeting reasons.

    Reads ``ads/ads_<id>.json`` as JSON Lines (one JSON object per line).
    Each ad found is classified with ``get_preroll_tag`` / ``get_side_tag``
    against the module-level ``preroll_database`` / ``side_database``; its
    ``*_reasons`` list is appended to the scam or the non-scam bucket.

    Which ads are looked at, per line:

    * lines with a falsy ``preroll_ad_video_url`` are skipped entirely;
    * the first and second pre-roll ads are counted when their video URL is
      set and differs from the line's ``video_url``;
    * the side ad is counted when ``side_ad_img`` is set.

    NOTE(review): because of the first rule, a side ad on a line without a
    pre-roll ad is never counted. This mirrors the original nesting and is
    kept so existing results do not change -- confirm it is intended.

    NOTE(review): an ad that is missing from the tagging table yields
    ``is_scam = None`` and is therefore counted as non-scam.

    Parameters
    ----------
    id : str
        Run identifier used in the file name.

    Returns
    -------
    tuple
        ``(scam_count, main_count, scam_reasons, main_reasons)``: number of
        scam / non-scam ads and the concatenated reason lists of each group.
    """
    scam_count, main_count = 0, 0
    scam_reasons, main_reasons = [], []

    def record(is_scam, reasons):
        # Single place for the bookkeeping that used to be repeated per ad
        # slot; extend() avoids rebuilding the whole list for every ad.
        nonlocal scam_count, main_count
        if is_scam:
            scam_count += 1
            scam_reasons.extend(reasons)
        else:
            main_count += 1
            main_reasons.extend(reasons)

    # The context manager closes the file even if a line fails to parse.
    with open(f"ads/ads_{id}.json", "r", encoding="utf-8") as ad_json_file:
        for line in ad_json_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            json_data = json.loads(line)

            if not json_data['preroll_ad_video_url']:
                continue

            for slot in ("preroll_ad", "preroll_ad2"):
                ad_url = json_data[f"{slot}_video_url"]
                # Skip entries whose "ad" URL is the watched video itself.
                if ad_url and ad_url != json_data['video_url']:
                    _, is_scam = get_preroll_tag(
                        (ad_url, json_data[f"{slot}_site"]), preroll_database
                    )
                    record(is_scam, json_data[f"{slot}_reasons"])

            if json_data['side_ad_img']:
                _, is_scam = get_side_tag(
                    (json_data["side_ad_img"], json_data["side_ad_site"]),
                    side_database,
                )
                record(is_scam, json_data["side_ad_reasons"])

    return scam_count, main_count, scam_reasons, main_reasons


In [6]:
from collections import Counter


def _collect_reasons(ids):
    """Sum the get_reasons() results over several runs.

    Returns ``(scam_count, main_count, scam_reasons, main_reasons)``
    accumulated over every run id in ``ids``.
    """
    scam_total, main_total = 0, 0
    scam_all, main_all = [], []
    for run_id in ids:
        scam_count, main_count, scam_reasons, main_reasons = get_reasons(run_id)
        scam_total += scam_count
        main_total += main_count
        scam_all.extend(scam_reasons)
        main_all.extend(main_reasons)
    return scam_total, main_total, scam_all, main_all


def _share(mentions, ad_total):
    """Percentage (one decimal) of ads citing a reason; NaN when there are no ads."""
    if not ad_total:
        return float("nan")
    return round(mentions / ad_total * 100, 1)


# Naming: <channel group>_<ad class>_..., where "main" as the ad class means
# non-scam ads.
consp_scam_count, consp_main_count, consp_scam_reasons, consp_main_reasons = (
    _collect_reasons(conspiracy_ids)
)
main_scam_count, main_main_count, main_scam_reasons, main_main_reasons = (
    _collect_reasons(mainstream_ids)
)

# most_common() turns the first counter into a list of (reason, count) pairs
# sorted by descending count; it fixes the row order of the table. Reasons
# that never occur for mainstream non-scam ads get no row.
main_main_counter = Counter(main_main_reasons).most_common()
main_scam_counter = Counter(main_scam_reasons)
consp_main_counter = Counter(consp_main_reasons)
consp_scam_counter = Counter(consp_scam_reasons)

cate = [reason for reason, _ in main_main_counter]

# Each value is "mentions of the reason / number of ads in the group". The
# reason lists of all ads are pooled, so a column need not sum to 100.
main_main_list = [_share(mentions, main_main_count) for _, mentions in main_main_counter]
main_scam_list = [_share(main_scam_counter[reason], main_scam_count) for reason in cate]
consp_main_list = [_share(consp_main_counter[reason], consp_main_count) for reason in cate]
consp_scam_list = [_share(consp_scam_counter[reason], consp_scam_count) for reason in cate]

count_dict = {
    "category": cate,
    "mainstream_nonscam (%)": main_main_list,
    "mainstream_scam (%)": main_scam_list,
    "conspiracy_nonscam (%)": consp_main_list,
    "conspiracy_scam (%)": consp_scam_list
}

df_count = pd.DataFrame(count_dict)
display(df_count)

# The table view truncates long labels; print them in full.
print(cate)

Unnamed: 0,category,mainstream_nonscam (%),mainstream_scam (%),conspiracy_nonscam (%),conspiracy_scam (%)
0,The time of day or your general location (like...,45.7,46.6,52.8,39.7
1,The time of day,45.7,53.4,47.2,60.3
2,Your general location (like your country or city),45.7,53.4,47.2,60.3
3,The video you're watching,34.0,17.8,13.2,6.4
4,Your age,25.2,31.5,33.5,28.3
5,Google's estimation of your interests,15.5,6.8,5.0,3.4
6,Your gender,8.2,6.8,5.0,6.1
7,"Google's estimation of your interests, based o...",5.9,2.7,27.4,16.8
8,The advertiser’s interest in reaching new cust...,1.0,4.1,1.1,0.7
9,"Your activity, while you were signed in to Google",0.9,16.4,5.8,10.4


['The time of day or your general location (like your country or city)', 'The time of day', 'Your general location (like your country or city)', "The video you're watching", 'Your age', "Google's estimation of your interests", 'Your gender', "Google's estimation of your interests, based on your activity while you were signed in to Google", 'The advertiser’s interest in reaching new customers who haven’t bought something from them before', 'Your activity, while you were signed in to Google', 'Household Income range', 'Parental Status', 'Relationship status']
