# App Threads Automated Tagging V1

In [None]:
!pip install deep-translator

In [None]:
import re
import time
import datetime as dt
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score
import pyspark.sql.functions as f
from pyspark.sql.types import *

import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go

from deep_translator import GoogleTranslator

In [None]:
# Session-level Spark settings, applied once before any table is read or written.
# NOTE(review): presumably needed for reading the source tables and for date handling in
# the SQL below -- confirm which of these are still required.
spark.conf.set('spark.databricks.delta.formatCheck.enabled', False)
spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')
spark.conf.set('spark.sql.shuffle.partitions', 'auto')
spark.conf.set("spark.databricks.adaptive.autoOptimizeShuffle.enabled", "true")

# Create Base Table

In [None]:
# Inclusive lower bound for publish_start_date; passed to thread_text_input() as start_date.
start_dt = '2022-06-01'

In [None]:
@udf(returnType=StringType())
def strip_html(text):
  """
  Removes HTML-style tags (every non-greedy ``<...>`` match) from a string.

    Parameters:
      text (str): raw text, possibly containing markup; may be None for a null cell

    Returns:
      str: text with every ``<...>`` match removed, or None when the input is None
  """
  # Spark hands null cells to Python UDFs as None; re.sub would raise TypeError on it.
  if text is None:
    return None
  return re.sub(r'<.*?>', '', text)


def thread_text_input(start_date):
  """
  Builds the thread-level text table that the tagging logic runs on.

  Steps, in order:
    1. Query content.content_dimension (left-joined to cms.cms_action on CARD_KEY), keep only
       rows whose thread version rank and card version rank are both 1, and aggregate to one
       row per (thread_id, thread_card_name, content_language).
    2. Query the plan-execution tags (cms.cms_thread_external_attributes with
       domain = 'control_plane', joined to cms.cms_thread and content_planning.plan_executions)
       and reduce them to one row per thread_id with max().
    3. Left-join the tags onto the thread rows and derive the `header` and `template` text columns.

    Parameters:
      start_date (str): lower bound for publish_start_date. It is interpolated into the SQL text
        with str.format, so only pass trusted values.

    Returns:
      thread_df: pyspark dataframe with the aggregated copy/metadata columns, the four
        plan-execution tag columns, `header` and `template`
  """

  thread_df = spark.sql(
    r'''
    select
      thread_id,
      thread_card_name,
      content_language,
      array_join(collect_set(card_title), ', ') as card_title,
      array_join(collect_set(card_subtitle), ', ') as card_subtitle,
      array_join(collect_set(card_body), ', ') as card_body,
      array_join(collect_list(action_text), ', ') as action_text,
      array_join(collect_list(destination_type), ', ') as action_destination_type,
      -- char(round(avg(thread_card_count))) as thread_card_count,
      max(brand) as brand,
      max(targetted_interest_labels) as targetted_interest_labels,
      max(brand_offerings_l1) as brand_offerings_l1,
      max(brand_offerings_l2) as brand_offerings_l2,
      max(split(marketing_creative_type, ',') [0]) as marketing_type,
      max(thread_card.marketing_category) as marketing_category,
      max(
        regexp_replace(
          regexp_replace(
            regexp_extract(marketing_category, '(,|^)[a-zA-Z \\\']+\\(Category, Dimension\\)', 0),
            '\\(Category, Dimension\\)', ''
            ),
          '^[,| ]+|[,| ]+$', ''
          )
      ) as sport_dimension,
      max(
        regexp_replace(
          regexp_replace(
            regexp_extract(marketing_category, '(,|^)[a-zA-Z \\\']+\\(Category, Construct\\)', 0),
            '\\(Category, Construct\\)', ''
          ),
          '^[,| ]+|[,| ]+$', ''
        )
      ) as construct,
      max(
        regexp_replace(
          regexp_replace(
            regexp_extract(marketing_category, '(,|^)[a-zA-Z \\\']+\\(Field of Play\\)', 0),
            '\\(Field of Play\\)', ''
          ),
          '^[,| ]+|[,| ]+$', ''
        )
      ) as field_of_play
    from
      (
        select
          distinct thread_id,
          CARD_KEY,
          thread_card_name,
          card_subtype,
          card_title,
          card_subtitle,
          card_body,
          thread_card_count,
          content_marketplace,
          content_language,
          brand,
          targetted_interest_labels,
          brand_offerings_L1 as brand_offerings_l1,
          brand_offerings_L2 as brand_offerings_l2,
          marketing_creative_type,
          marketing_category,
          RANK() OVER (PARTITION BY thread_id, content_marketplace, content_language ORDER BY CONTENT_THREAD_VERSION DESC) AS thread_version_rank,
          RANK() OVER (PARTITION BY THREAD_ID, CARD_KEY ORDER BY CARD_VERSION DESC) AS card_version_rank
        from
          content.content_dimension
        where
          thread_sub_type = 'thread'
          -- and card_language in ('en', 'en-GB')
          and not thread_id is null
          and publish_start_date >= '{}'
      ) thread_card
      left join (
        select
          distinct card_key,
          action_id,
          action_text,
          destination_type
        from
          cms.cms_action
      ) action on thread_card.CARD_KEY = action.CARD_KEY
    where
      thread_version_rank = 1
      and card_version_rank = 1
    group by
      thread_id,
      thread_card_name,
      content_language
    '''.format(start_date)
  )

  # Plan-execution tags: one row per (thread, plan execution) at this point.
  airtable_tags = spark.sql("""
  WITH published_airtable_thread_keys AS (
    SELECT
      thread_key,
      id
    FROM
      cms.cms_thread_external_attributes
    WHERE
      domain = 'control_plane'
    GROUP BY
      thread_key,
      id
  ),
  cms_thread_keys AS (
    SELECT DISTINCT
      thread_key,
      thread_id
    FROM cms.cms_thread
  )
  SELECT
    b.thread_id,
    c.name as execution_name,
    c.season,
    c.primary_gender_construct,
    c.target_gender,
    c.primary_sport_segment,
    c.fields_of_play
  FROM
    published_airtable_thread_keys a
    JOIN cms_thread_keys b ON a.thread_key = b.thread_key
    JOIN content_planning.plan_executions c ON a.id = c.id
  """)

  # Collapse to one row per thread_id. max() picks a single value per column when a thread
  # has several executions; execution_name and season are selected above but not kept here.
  airtable_tags = (
    airtable_tags
    .groupBy('thread_id')
    .agg(
      f.max('primary_gender_construct').alias('primary_gender_construct'),
      f.max('target_gender').alias('target_gender'),
      f.max('primary_sport_segment').alias('primary_sport_segment'),
      f.max('fields_of_play').alias('fields_of_play'),
    )
  )

  # header   = thread_card_name, card_title, card_subtitle joined by newlines
  # template = header + card_body, with <...> tags stripped, trimmed, the literal text
  #            "&#8232;" replaced by a newline, and an empty result turned into null
  thread_df = (
    thread_df
    .join(airtable_tags, on='thread_id', how='left')
    .fillna("", subset=['thread_card_name', 'card_title', 'card_subtitle', 'card_body'])
    .withColumn('header', f.concat_ws("\n", f.col('thread_card_name'), f.col('card_title'), f.col('card_subtitle')))
    .withColumn('template', f.concat_ws("\n", f.col('header'), f.col('card_body')))
    .withColumn('template', f.trim(strip_html(f.col('template'))))
    .withColumn('template', f.regexp_replace(f.col('template'), "&#8232;", "\n"))
    .withColumn('template', f.when(f.col('template') == "", None).otherwise(f.col('template')))
  )

  return thread_df

In [None]:
"""
thread_df = thread_text_input(start_dt)

(
  thread_df
  .write.format('delta')
  .mode('overwrite')
  .option('overwriteSchema', 'True')
  .option('url', 's3://ngap2-user-data/gck/glbl_marsci_sandbox/owned/mchu10/thread_copy_nikeapp')
  .saveAsTable(f'mchu10.thread_copy_nikeapp')
)
print(f'mchu10.thread_copy_nikeapp saved')
"""

In [None]:
# Load the previously saved base table and print its row count.
thread_df = spark.sql("SELECT * FROM mchu10.thread_copy_nikeapp")
print(thread_df.count())

In [None]:
# Pull the table into pandas so each row's text can be passed to the translator below.
thread_pdf = thread_df.toPandas()

# Shared translator instance used by translate_to_en (source 'auto', target 'en').
translator = GoogleTranslator(source='auto', target='en')

def translate_to_en(txt):
  """
  Translates one text value to English with the module-level `translator`.

  Translation is best effort: a failed call yields None instead of stopping the whole
  `.apply` over the dataframe.

    Parameters:
      txt: text to translate; non-string values (e.g. None for a missing template) are
        returned as None without calling the translation service

    Returns:
      translation: the translated text, or None if the input is not a string or the call failed
  """
  # Missing values cannot be translated, so skip the service round-trip for them.
  if not isinstance(txt, str):
    return None

  try:
    # time.sleep(0.3) # sleep to avoid exceeding API limit
    translation = translator.translate(txt)
  except Exception:
    # `Exception` rather than a bare `except:` so that interrupting the cell
    # (KeyboardInterrupt) is not swallowed row after row.
    translation = None
  return translation

# Translate row by row; rows whose translation fails get None (see translate_to_en).
thread_pdf['header_en'] = thread_pdf['header'].apply(translate_to_en)
thread_pdf['template_en'] = thread_pdf['template'].apply(translate_to_en)

# Convert back to a Spark dataframe, now including header_en / template_en.
thread_df = spark.createDataFrame(thread_pdf)

# Disabled write (bare string literal, not executed): would overwrite mchu10.thread_copy_nikeapp.
"""
(
  thread_df
  .write.format('delta')
  .mode('overwrite')
  .option('overwriteSchema', 'True')
  .option('url', 's3://ngap2-user-data/gck/glbl_marsci_sandbox/owned/mchu10/thread_copy_nikeapp')
  .saveAsTable(f'mchu10.thread_copy_nikeapp')
)
"""

# NOTE(review): this message is printed even while the write above is disabled.
print(f'mchu10.thread_copy_nikeapp saved')

In [None]:
# Reload the saved table and show it for manual inspection.
thread_df = spark.sql("select * from mchu10.thread_copy_nikeapp")
display(thread_df)

In [None]:
# Row counts per primary_gender_construct value.
thread_df.groupBy('primary_gender_construct').count().display()

In [None]:
# Total number of rows in the table.
thread_df.count()

# String Matching Logic

In [None]:
# Text patterns that flag a thread as internal test / QA content.
# Each list is OR-joined into one regex by add_text_tags.
test_qa_dict = {
  'test_qa': [
    'qa',
    'test_',
    'test lamb',
    'proof test',
    'proof_',
    'internal seed',
  ],
}

# Text patterns that flag promotional copy. Each list is OR-joined into one regex by add_text_tags.
promo_dict = {
  'promo': [
    # Promotion wording ("sale" is ignored when it is part of "pre-sale" / "presale").
    'code', r'(?<!.pre|pre-)sale', r'\bpromo\b', 'promotion', 'discount',
    'clearance', 'markdown',
    # "<amount> off"
    r'[\d]+ off', r'[\d]+\$ off', r'[\d]+% off', r'[\d]+€ off',
    r'[\d]+£ off', r'[\d]+ yen off', r'[\d]+ yuan off', r'[\d]+ pesos off',
    # "save ... <amount>", with the symbol either before or after the digits.
    # The euro/pound pairs used to repeat the symbol-first form twice, so "save 20€" was missed.
    r'save.+\$[\d]+', r'save.+[\d]+\$', r'save.+%[\d]+', r'save.+[\d]+%',
    r'save.+\€[\d]+', r'save.+[\d]+€', r'save.+\£[\d]+', r'save.+[\d]+£',
    r'save.+[\d]+ yen', r'save.+[\d]+ yuan', r'save.+[\d]+ pesos',
    r'deduct.+[\d]+',
  ],
}

# Miscellaneous theme / program / moment tags. Each non-empty list is OR-joined into one
# regex by add_text_tags; an empty list means the tag gets no text patterns here
# ('trigger' and 'campaign' receive theirs from header_dict).
other_dict = {
  'nrc': [r'\bnrc\b', 'nike run club'],
  'ntc': [r'\bntc\b', 'nike training club'],
  'snkrs': [],
  'nikebyyou': [],
  'lifestyle': ['lifestyle'],
  'sustainability': [
    'sustainab', 'green', 'climate change', 'recycle',
    'zero waste', 'move to zero', 'trash', 'space hippie',
    'earth day',
  ],
  'trail': [r'\btrail\b', r'\bacg\b', 'all condition'],
  'exclusive': ['exclusive', 'first access', 'get it first'],
  'holiday': [
    'holiday', 'valentine', 'galentine',
    "mother's day", 'mothers day',
    "father's day", 'fathers day',
    'christmas', 'boxing day', 'thanksgiving',
    'new year', r'\bcny\b', 'family day',
  ],
  'blackfriday': ['black friday', 'cyber monday', 'cyber week'],
  'boxingday': ['boxing day'],
  'birthday': ['birthday', 'bday'],
  'backtoschool': ['back to school'],
  'memberdays': ['member days'],
  'sneakersoftheweek': ['sneakers of the week'],
  'newarrival': ['new arrival'],
  'weeklyoffense': ['weekly offense'],
  'mentalhealth': ['mental', 'meditat', 'headspace'],
  'holisticfitness': ['holistic fitness'],
  'trigger': [],
  'campaign': [],
}

# Brand tags. 'nike' has no text patterns of its own.
brand_dict = {
  'nike': [],
  'jordan': [
    'jordan',
    r'aj[\d]',
    'aj ',
  ],
  'converse': ['converse'],
  'hurley': ['hurley'],
}

# Gender-construct tags. Each non-empty list is OR-joined into one regex by add_text_tags.
# Every pattern using \b must be a raw string: in a normal string literal "\b" is the
# backspace character, so the pattern would never match real text.
construct_dict = {
  'mens': [r'\bmen\b', r'\bman\b', r'\bmens\b', r"\bmen's\b"],
  'womens': ['women', 'woman'],
  'dualgender': ['dual gender', 'unisex'],
  'kids': [],
}

# Sport-dimension tags. Each non-empty list is OR-joined into one regex by add_text_tags.
# 'americanfootball' has no text patterns; category_map derives it from 'nfl' / 'odellbeckhamjr'.
sport_dict = {
  'running': [
    # "running" as the sport, not as part of phrases such as "running out of time" / "running errands".
    r'(?<!.in the |time is )running(?! out of time| back| errands)',
    'runner',
    'marathon',
  ],
  'fitness': ['fitness', 'workout', r'\bgym\b', 'strength training', 'cardio'],
  'globalfootball': ['football', 'soccer'],
  'basketball': [
    'basketball', 'bball', 'b-ball',
    'streetball', 'street ball', 'hooper',
  ],
  'tennis': ['tennis'],
  'golf': ['golf'],
  'baseball': ['baseball', 'softball'],
  'nikesb': ['nikesb', 'nike sb', 'skateboard'],
  'nikedance': [
    r'\bdance\b', r'\bdancer\b', r'\bdancing\b',
    'ballerina',
  ],
  'yoga': ['yoga'],
  'americanfootball': [],
  'lacrosse': ['lacrosse'],
  'trackandfield': ['track and field', 'track & field'],
}

# Product-franchise tags. Each list is OR-joined into one regex by add_text_tags.
franchise_dict = {
  'drifitcotton': ['drifit', 'dri fit', 'dri-fit', 'df cotton'],
  'legend': ['legend essential'],
  'nikepro': [r'nike pro\b'],
  'nikeone': ['nike one'],
  'miler': ['miler'],
  'airforce1': ['air force 1', 'air force i', 'af1', 'af-1'],
  'spotlight': ['spotlight hoodie'],
  'victory': ['victory print'],
  'flex': ['nike flex'],
  'free': ['nike free'],
  'tech': [r'nike tech\b'],
  'windrunner': ['windrunner', 'wind runner'],
  'tempo': ['nike tempo'],
  'therma': ['therma'],
  'mercurial': ['mercurial'],
  'phantom': [r'phantom(?! run)'],
  'waffle': ['waffle'],
  'airmax': ['airmax', 'air max', r'\bam[\d]', r'\bam [\d]'],
  'indy': [r'\bindy\b'],
  'alpha': [r'\balpha\b'],
  # The spelled-out forms end in \b (like the aj1 forms) instead of a literal trailing space,
  # so the name is still matched before a newline, punctuation or the end of the text while
  # "air jordan 11" / "air jordan iii" stay excluded.
  'airjordan1': [r'air jordan 1\b', r'air jordan i\b', r'aj1\b', r'aj 1\b', r'aji\b', r'aj i\b'],
  'pegasus': ['pegasus'],
  'dunk': [r'(?<!slam )dunk'],
  'blazer': [r'\bblazer\b'],
  'revolution': ['nike revolution'],
  'cortez': ['cortez'],
  'metcon': ['metcon'],
  'vapormax': ['vapormax', 'vapourmax', 'vapor max', 'vapour max', 'vapor-max', 'vapour-max'],
  'alate': ['alate'],
  'vomero': ['vomero'],
  'winflo': ['winflo'],
  'structure': ['zoom structure'],
  'vaporfly': ['vaporfly', 'vapourfly', 'vapor fly', 'vapour fly'],
  'cosmicunity': ['cosmic unity'],
  'infinityrun': ['infinity run'],
  'invinciblerun': ['invincible run', 'invincible3', 'invincible 3'],
  'courtlegacy': ['court legacy'],
  'courtvision': ['court vision'],
  'huarache': ['huarache'],
  'stadium': ['jordan stadium'],
  'victori': [r'victori\b'],
  'phoenix': ['phoenix fleece'],
  'superrep': ['superrep', 'super rep'],
  'challenger': ['challenger'],
  'courtroyale': ['court royale'],
  'presto': ['presto'],
  'zoomfly': ['zoomfly', 'zoom fly'],
  'motiva': [r'\bmotiva\b'],
}

# Athlete tags. Each list is OR-joined into one regex by add_text_tags.
athlete_dict = {
  # basketball
  'michaeljordan': ['michael jordan', r'\bmj\b'],
  'lebronjames': ['lebron', r'\bbron\b', 'lbj', 'king james'],
  'kevindurant': ['durant', r'\bkd\b', 'kdx'],
  'kobebryant': ['kobe', 'mamba'],
  'kyrieirving': ['kyrie', 'irving', 'uncle drew'],
  'giannis': ['giannis', 'antetokounmpo', 'zoom freak'],
  'russellwestbrook': ['russell westbrook'],
  'paulgeorge': [r'\bpg\b', 'paul george'],
  'chrispaul': ['cp3', 'chris paul'],
  'zionwilliamson': ['zion', 'williamson'],
  'lukadoncic': [r'\bluka\b', 'doncic'],
  'jaysontatum': ['jayson', 'tatum'],
  # "ja" must be a whole word: without the closing \b the pattern matched every word that
  # starts with "ja" (jacket, january, ...), which category_map then turned into a
  # basketball tag. "ja wilson" is still excluded.
  'jamorant': [r'\bja\b(?! wilson)', 'morant'],
  'ruihachimura': ['rui hachimura'],
  'pennyhardaway': ['penny hardaway'],
  # global football
  'cristianoronaldo': ['cristiano', 'ronaldo', 'cr7'],
  'kylianmbappe': ['kylian', 'mbappe'],
  'kevindebruyne': ['de bruyne', 'debruyne'],
  'virgilvandijk': ['van dijk', 'vandijk'],
  'adahegerberg': ['hegerberg'],
  # american football
  'odellbeckhamjr': ['odell beckham', r'\bobj\b'],
  # tennis
  'rafaelnadal': ['rafa', 'nadal'],
  'rogerfederer': ['federer'],
  'serenawilliams': ['serena.+williams'],
  'naomiosaka': ['naomi.+osaka'],
  'mariasharapova': ['sharapova'],
  # golf
  'tigerwoods': ['tiger woods'],
  # skateboarding
  'nyjahhuston': ['nyjah', 'huston'],
  'ishodwair': ['ishod wair'],
  'leobaker': ['leo baker', 'lacey baker'],
  # running
  'eliudkipchoge': ['eliud', 'kipchoge'],
  # fitness
  'mathewfraser': ['mathew fraser', 'mat fraser'],
}

# Product-division tags. Each list is OR-joined into one regex by add_text_tags.
division_dict = {
  'apparel': [
    'apparel',
    'clothing',
    'clothes',
    'garment',
  ],
  'footwear': [
    'footwear',
    'shoe',
    'sneaker',
    'kicks',
    'sole',
  ],
  'equipment': [
    'equipment',
    'accessor',
  ],
}

# Merchandise-class tags. Each list is OR-joined into one regex by add_text_tags.
# Every pattern using \b must be a raw string: in a normal string literal "\b" is the
# backspace character, so the pattern would never match real text.
merchclass_dict = {
  'bras': [r'\bbra\b', r'\bbras\b'],
  'leggings': ['legging'],
  'fleece': ['fleece'],
  'tees': [r'\btees\b', 'tshirt', 't shirt', 't-shirt'],
  'sweatshirts': ['sweatshirt'],
  'shorts': ['shorts', 'volley short'],
  'jerseys': [r'jersey(?! liner)'],
  'jackets': ['jacket'],
  'pants': [r'\bpant\b', 'trouser'],
  'polos': [r'\bpolo\b'],
  'tights': ['tights'],
  'outerwear': ['outerwear'],
  'socks': [r'sock(?!liner)'],
  'baselayer': ['baselayer', 'base layer', 'base-layer'],
  'midlayer': ['midlayer', 'mid layer', 'mid-layer'],
  'skirts': ['skirt'],
  'hoodie': ['hoodie', 'hoody'],
  'vest': [r'\bvest\b'],
  'longsleeve': ['long sleeve', 'long-sleeve'],
  'shortsleeve': ['short sleeve', 'short-sleeve'],
  'sleeveless': ['sleeveless', r'\btank\b'],
  'scarf': ['scarf', 'scarves'],
  'gloves': ['glove'],
  'hat': [r'\bhat\b', 'beanie', r'\bcap\b'],
  'cleats': ['cleat'],
  'boots': [r'\bboot\b', r'\bboots\b'],
  'slides': ['slides', 'sliders', 'sandal', 'flip flops', 'flip-flops', 'slippers'],
}

# League / team / event tags. Each non-empty list is OR-joined into one regex by add_text_tags.
# 'worldcup' has no text patterns here.
league_dict = {
  'ncaa': ['ncaa'],
  'fiba': ['fiba'],
  'nba': [r'\bnba\b'],
  'wnba': ['wnba'],
  'nfl': [r'\bnfl\b'],
  'mlb': ['mlb'],
  'fifa': ['fifa'],
  'laliga': ['laliga', 'la liga'],
  'premierleague': ['premier league'],
  'ligue1': ['ligue1', 'ligue 1'],
  'lalakers': ['lakers'],
  'laclippers': ['clippers'],
  'clevelandcavaliers': ['cavaliers'],
  'chicagobulls': ['chicago bulls'],
  'bostonceltics': ['celtic'],
  'goldenstatewarriors': ['golden state', 'warriors'],
  'barcelona': ['fc barcelona', r'f\.c\. barcelona', 'club barcelona', 'barcelona club'],
  'chelsea': ['fc chelsea', r'f\.c\. chelsea'],
  'manchestercity': ['manchester city', 'man city'],
  'manchesterunited': ['manchester united', 'man united'],
  'liverpool': ['liverpool'],
  # The same alternative was listed twice; one copy matches exactly the same text.
  # NOTE(review): the duplicate may have been meant as a second spelling/nickname -- confirm.
  'tottenhamhotspur': ['tottenham'],
  'arsenal': ['arsenal', 'gunner'],
  'intermilan': ['intermilan', 'inter milan'],
  'parissaintgermain': ['paris saint-germain', 'paris saint germain', 'psg'],
  'olympic': ['olympic'],
  'grandslam': ['grand slam'],
  'superbowl': ['super bowl'],
  'worldcup': [],
  'marchmadness': ['march madness', 'final four', 'elite eight', 'sweet sixteen'],
}

In [None]:
# Patterns matched against the `header` column only (see add_header_tags). A match switches
# the corresponding "<key>_tag" on in addition to whatever the copy-based matching found.
header_dict = {
  'test_qa': [
    'airship', 'ncp', 'cp code', 'cpcd',
    r'(?<!la)test',
    'proof',
  ],
  'mens': [r'\bm\b', r'\bmn\b'],
  'womens': [
    r'\bw\b', r'\bwo\b',
    r'\bwn\b', r'\bwmn\b',
  ],
  'dualgender': [
    r'\bdg\b', r'\bmw\b',
    r'\bmwk\b', r'\bm/w\b',
  ],
  'kids': [
    'kid', r'\bbg\b', 'boy', 'girl',
    'child', 'toddler', 'baby',
  ],
  'lifestyle': ['nsw'],
  'globalfootball': ['gbf', 'gfb', 'ftbl'],
  'baseball': ['baseb'],
  'trigger': ['trigger'],
  'campaign': ['campaign'],
}

# Parent tag -> child tags that imply it (consumed by map_tags).
# Entry order matters: map_tags walks this dict top to bottom and updates the dataframe as it
# goes, so a parent that is itself a child further down (e.g. 'nba' under 'basketball',
# 'bras' under 'apparel' and 'womens') must be listed before the entry that uses it.
category_map = {
  # League
  'ncaa': ['marchmadness'],
  'nba': [
    'lalakers', 'laclippers', 'clevelandcavaliers', 'chicagobulls', 'bostonceltics',
    'goldenstatewarriors',
  ],
  'laliga': ['barcelona'],
  'premierleague': [
    'chelsea', 'manchestercity', 'manchesterunited', 'liverpool',
    'tottenhamhotspur', 'arsenal',
  ],
  'ligue1': ['parissaintgermain'],
  'nfl': ['superbowl'],
  # Sport dimension
  'running': [
    'miler', 'flex', 'pegasus', 'revolution', 'vomero', 'winflo', 'structure',
    'vaporfly', 'infinityrun', 'invinciblerun', 'zoomfly', 'motiva', 'eliudkipchoge',
  ],
  'fitness': ['nikepro', 'nikeone', 'metcon', 'superrep', 'mathewfraser'],
  'globalfootball': [
    'mercurial', 'phantom', 'cristianoronaldo', 'kylianmbappe', 'kevindebruyne',
    'virgilvandijk', 'adahegerberg', 'fifa', 'laliga', 'premierleague', 'ligue1',
    'intermilan', 'parissaintgermain', 'worldcup',
  ],
  'basketball': [
    'cosmicunity', 'michaeljordan', 'lebronjames', 'kevindurant', 'kobebryant',
    'kyrieirving', 'giannis', 'russellwestbrook', 'paulgeorge', 'chrispaul',
    'zionwilliamson', 'lukadoncic', 'jaysontatum', 'ruihachimura', 'pennyhardaway',
    'jamorant', 'ncaa', 'fiba', 'nba', 'wnba',
  ],
  'tennis': [
    'rafaelnadal', 'rogerfederer', 'serenawilliams', 'naomiosaka', 'mariasharapova',
    'grandslam',
  ],
  'golf': ['tigerwoods'],
  'baseball': ['mlb'],
  'nikesb': ['nyjahhuston', 'ishodwair', 'leobaker'],
  'americanfootball': ['nfl', 'odellbeckhamjr'],
  # Merch class
  'bras': ['indy', 'alate'],
  'leggings': ['nikeone'],
  # Division
  'apparel': [
    'drifitcotton', 'miler', 'nikepro', 'phoenix', 'bras', 'leggings',
    'fleece', 'tees', 'sweatshirts', 'shorts', 'jerseys', 'jackets',
    'pants', 'polos', 'tights', 'outerwear', 'baselayer', 'midlayer',
    'skirts', 'hoodie', 'vest', 'longsleeve', 'shortsleeve', 'sleeveless',
  ],
  'footwear': [
    'airforce1', 'airjordan1', 'dunk', 'airmax', 'blazer', 'pegasus', 'revolution',
    'phantom', 'mercurial', 'legend', 'cortez', 'metcon', 'vapormax', 'vomero',
    'winflo', 'structure', 'vaporfly', 'cosmicunity', 'infinityrun', 'invinciblerun',
    'courtlegacy', 'courtvision', 'huarache', 'victori', 'superrep',
    'courtroyale', 'presto', 'cleats', 'boots', 'slides',
  ],
  'equipment': ['socks', 'scarf', 'gloves', 'hat'],
  # Construct
  'womens': ['bras', 'leggings', 'skirts'],
}

# Tag category -> the individual tags that roll up into it (consumed by add_tag_categories).
# Most categories are simply the keys of the matching pattern dictionary, in dictionary order.
tag_cat_dict = {
  'test_qa': list(test_qa_dict),
  'promo': list(promo_dict),
  'other': list(other_dict),
  'brand': list(brand_dict),
  'franchise': list(franchise_dict),
  'athlete': list(athlete_dict),
  'league': list(league_dict),
  'sport': list(sport_dict),
  'merchclass': list(merchclass_dict),
  'division': list(division_dict),
  'construct': list(construct_dict),
  # Fields of Play tags are not pattern-based; they are created by add_fields_of_play_tags.
  'fieldsofplay': [
    'kids_play_all_day', 'kids_play_sport',
    'womens_running', 'womens_holistic_fitness', 'womens_lifestyle', 'womens_team_sports',
    'mens_running', 'mens_basketball', 'mens_football', 'mens_lifestyle',
    'jordan_basketball', 'jordan_streetwear',
    'dual_gender_running', 'dual_gender_football', 'dual_gender_basketball',
    'dual_gender_lifestyle',
  ],
}

# Create Automated Tagging Table

In [None]:
# Helper functions
def add_text_tags(df, text_col, text_dict):
  """
  Adds one integer "<key>_tag" column per dictionary entry by regex-matching the text copy.

  For every key, the patterns in its list are OR-joined into a single regex:
    - column does not exist yet: it is created as the match result (0 when the list is empty)
    - column already exists: it stays 1 only where it was 1 AND the text matches;
      with an empty list the existing column is left untouched

    Parameters:
      df: pyspark dataframe
      text_col (str): name of text copy column
      text_dict (dict): string matching dictionary (tag name -> list of regex alternatives)

    Returns:
      df: modified pyspark dataframe
  """

  for tag_name, patterns in text_dict.items():
    tag_col = tag_name + "_tag"
    column_exists = tag_col in df.columns

    if not patterns:
      # Nothing to match on: only make sure the column exists.
      if not column_exists:
        df = df.withColumn(tag_col, f.lit(0).cast('int'))
      continue

    matches_text = f.col(text_col).rlike("|".join(patterns))
    if column_exists:
      # Narrow an existing tag rather than widening it.
      matches_text = (f.col(tag_col) == 1) & matches_text

    df = (
      df
      .withColumn(tag_col, matches_text)
      .withColumn(tag_col, f.col(tag_col).cast('int'))
    )

  return df


# Eventually want to re-write rest of these functions in pyspark
def add_tags_from_metadata(df, tag_name, metadata_col, metadata_vals):
  """
  Adds tags from metadata.

  A row is tagged when it was already tagged OR its metadata value contains any of
  `metadata_vals`. Matching is a regex search (substring semantics), not an exact
  comparison, so e.g. 'mens' also matches 'womens'.

    Parameters:
      df: pandas dataframe; must already contain the "<tag_name>_tag" column
      tag_name (str): name of tag
      metadata_col (str): name of metadata column
      metadata_vals (list): values in metadata column to be tagged on; they are OR-joined
        into a regex. An empty list adds no tags.

    Returns:
      df: modified pandas dataframe
  """

  tag_col = tag_name + "_tag"
  already_tagged = df[tag_col] == 1

  if metadata_vals:
    # na=False: rows with missing metadata never match.
    in_metadata = df[metadata_col].str.contains("|".join(metadata_vals), na=False)
    df[tag_col] = (already_tagged | in_metadata).astype('int')
  else:
    # An empty list would join to "", a pattern that matches every non-null value and
    # would therefore tag the whole table.
    df[tag_col] = already_tagged.astype('int')

  return df


def add_header_tags(df, header_dict):
  """
  Adds tags by string matching on the header.

  For every key, the patterns are OR-joined into one regex and searched in the `header`
  column; a match switches "<key>_tag" on, and existing tags are never switched off.

    Parameters:
      df: pandas dataframe; must contain `header` and a "<key>_tag" column for every key
      header_dict (dict): string matching dictionary (tag name -> list of regex alternatives).
        A key with an empty list adds no tags, mirroring add_text_tags.

    Returns:
      df: modified pandas dataframe
  """

  for tag_name, patterns in header_dict.items():
    tag_col = tag_name + "_tag"
    already_tagged = df[tag_col] == 1

    if patterns:
      # na=False: rows without a header never match.
      in_header = df['header'].str.contains("|".join(patterns), na=False)
      df[tag_col] = (already_tagged | in_header).astype('int')
    else:
      # An empty list would join to "", a pattern that matches every non-null header.
      df[tag_col] = already_tagged.astype('int')

  return df


def map_tags(df, map_dict):
  """
  Infers parent tags from child tags (word relationships/hierarchies).

  A parent tag becomes 1 when it is already 1 or when any of its child tags is 1.
  Entries are applied in dictionary order and each one updates `df` immediately, so a
  parent computed earlier can act as a child of a later entry.

    Parameters:
      df: pandas dataframe holding "<name>_tag" columns for every parent and child
      map_dict (dict): parent tag name -> list of child tag names

    Returns
      df: modified pandas dataframe
  """

  for parent, children in map_dict.items():
    parent_col = parent + "_tag"
    # The parent's own column is included so an existing 1 is kept.
    source_cols = [child + "_tag" for child in children] + [parent_col]

    hits_per_row = df[source_cols].sum(axis=1)
    df[parent_col] = hits_per_row.clip(upper=1)

  return df


def exclude_tags(df, tags_to_toggle, tags_to_exclude):
  """
  Changes tags_to_toggle to 0 if any of tags_to_exclude is 1.

    Parameters:
      df: pandas dataframe
      tags_to_toggle (list): tag column names that may be switched off
      tags_to_exclude (list): tag column names; a 1 in any of them switches the toggled tags off

    Returns:
      df: modified pandas dataframe
  """

  for toggle_tag in tags_to_toggle:
    df[toggle_tag] = ((df[toggle_tag] == 1) & (df[tags_to_exclude].sum(axis=1).clip(upper=1) == 0)).astype('int')

  return df


def add_fields_of_play_tags(df):
  """
  Adds Fields of Play tags using construct and sport dimension tags.

  Order of operations:
    1. Base tags per construct (kids / womens / mens / dual gender / jordan) from the sport tags.
    2. Rows carrying a lifestyle franchise lose their performance Fields of Play and gain the
       lifestyle Field of Play of their construct (added on top of the lifestyle tags from step 1).
    3. Rows tagged jordan lose every non-Jordan Field of Play.

    Parameters:
      df: pandas dataframe with the sport, merch-class, construct, jordan and franchise
        "<name>_tag" columns referenced below

    Returns:
      df: modified pandas dataframe with the Fields of Play "<name>_tag" columns added
  """

  # Fields of Play sport dimension categories
  play_sports = ['running_tag', 'basketball_tag', 'globalfootball_tag', 'fitness_tag', 'yoga_tag', 'golf_tag',
                 'tennis_tag', 'baseball_tag']
  team_sports = ['basketball_tag', 'globalfootball_tag', 'golf_tag', 'tennis_tag', 'baseball_tag', 'lacrosse_tag',
                 'trackandfield_tag']
  holistic_fitness = ['fitness_tag', 'yoga_tag', 'holisticfitness_tag', 'bras_tag', 'leggings_tag', 'tights_tag']
  performance = ['running_tag', 'basketball_tag', 'globalfootball_tag', 'fitness_tag', 'yoga_tag', 'golf_tag',
                'tennis_tag', 'baseball_tag', 'lacrosse_tag', 'trackandfield_tag']

  # Kids Fields of Play
  df['kids_play_all_day_tag'] = ((df[play_sports].sum(axis=1).clip(upper=1) == 0) & (df['kids_tag'] == 1)).astype('int')
  df['kids_play_sport_tag'] = ((df[play_sports].sum(axis=1).clip(upper=1) == 1) & (df['kids_tag'] == 1)).astype('int')

  # Womens Fields of Play
  df['womens_running_tag'] = ((df['running_tag'] == 1) & (df['womens_tag'] == 1)).astype('int')
  df['womens_holistic_fitness_tag'] = ((df[holistic_fitness].sum(axis=1).clip(upper=1) == 1) & (df['womens_tag'] == 1)).astype('int')
  df['womens_team_sports_tag'] = ((df[team_sports].sum(axis=1).clip(upper=1) == 1) & (df['womens_tag'] == 1)).astype('int')
  df['womens_lifestyle_tag'] = ((df[performance].sum(axis=1).clip(upper=1) == 0) & (df['womens_tag'] == 1)).astype('int')

  # Mens Fields of Play
  df['mens_running_tag'] = ((df['running_tag'] == 1) & (df['mens_tag'] == 1)).astype('int')
  df['mens_basketball_tag'] = ((df['basketball_tag'] == 1) & (df['mens_tag'] == 1)).astype('int')
  df['mens_football_tag'] = ((df['globalfootball_tag'] == 1) & (df['mens_tag'] == 1)).astype('int')
  df['mens_lifestyle_tag'] = ((df[performance].sum(axis=1).clip(upper=1) == 0) & (df['mens_tag'] == 1)).astype('int')

  # Dual Gender Fields of Play
  df['dual_gender_running_tag'] = ((df['running_tag'] == 1) & (df['dualgender_tag'] == 1)).astype('int')
  df['dual_gender_basketball_tag'] = ((df['basketball_tag'] == 1) & (df['dualgender_tag'] == 1)).astype('int')
  df['dual_gender_football_tag'] = ((df['globalfootball_tag'] == 1) & (df['dualgender_tag'] == 1)).astype('int')
  df['dual_gender_lifestyle_tag'] = ((df[performance].sum(axis=1).clip(upper=1) == 0) & (df['dualgender_tag'] == 1)).astype('int')

  # Jordan Fields of Play
  # NOTE(review): jordan_basketball is driven by ANY performance sport, not basketball_tag alone -- confirm.
  df['jordan_basketball_tag'] = ((df[performance].sum(axis=1).clip(upper=1) == 1) & (df['jordan_tag'] == 1)).astype('int')
  df['jordan_streetwear_tag'] = ((df[performance].sum(axis=1).clip(upper=1) == 0) & (df['jordan_tag'] == 1)).astype('int')

  # Remove performance Fields of Play if it contains a lifestyle franchise
  # NOTE(review): dual_gender_basketball_tag is not in this list although mens_basketball_tag is -- confirm.
  lifestyle_franchises = ['airforce1_tag', 'dunk_tag', 'airmax_tag', 'tech_tag']
  non_lifestyle_fop = ['womens_running_tag', 'womens_holistic_fitness_tag', 'womens_team_sports_tag', 'mens_running_tag',
                       'mens_basketball_tag', 'mens_football_tag', 'dual_gender_running_tag', 'dual_gender_football_tag']
  df = exclude_tags(df, non_lifestyle_fop, lifestyle_franchises)

  # Add lifestyle franchises to lifestyle Fields of Play.
  # This is OR-ed into the lifestyle tags computed above; plain assignment would discard the
  # "no performance sport" lifestyle tags for every row without a lifestyle franchise.
  has_lifestyle_franchise = df[lifestyle_franchises].sum(axis=1).clip(upper=1) == 1
  df['womens_lifestyle_tag'] = ((df['womens_lifestyle_tag'] == 1) | (has_lifestyle_franchise & (df['womens_tag'] == 1))).astype('int')
  df['mens_lifestyle_tag'] = ((df['mens_lifestyle_tag'] == 1) | (has_lifestyle_franchise & (df['mens_tag'] == 1))).astype('int')
  df['dual_gender_lifestyle_tag'] = ((df['dual_gender_lifestyle_tag'] == 1) | (has_lifestyle_franchise & (df['dualgender_tag'] == 1))).astype('int')

  # Remove gendered Fields of Play if Jordan
  non_jordan_fop = ['kids_play_all_day_tag', 'kids_play_sport_tag', 'womens_running_tag', 'womens_holistic_fitness_tag',
                    'womens_team_sports_tag', 'womens_lifestyle_tag', 'mens_running_tag', 'mens_basketball_tag',
                    'mens_football_tag', 'mens_lifestyle_tag', 'dual_gender_running_tag', 'dual_gender_basketball_tag',
                    'dual_gender_football_tag', 'dual_gender_lifestyle_tag']
  df = exclude_tags(df, non_jordan_fop, ['jordan_tag'])

  return df


def add_tag_categories(df, tag_cat_dict):
  """
  Adds one roll-up "<category>_tag" column per tag category.

  A category column is 1 when at least one of its member tags is 1, otherwise 0.

    Parameters:
      df: pandas dataframe holding a "<name>_tag" column for every member tag
      tag_cat_dict (dict): dictionary describing which tags belong in each tag category

    Returns:
      df: modified pandas dataframe
  """

  for category, members in tag_cat_dict.items():
    member_cols = [member + "_tag" for member in members]
    members_set = df[member_cols].sum(axis=1)
    df[category + "_tag"] = members_set.clip(upper=1)

  return df


def manual_overwrite(df, comm_id_list, tags_list, tag_val=1, overwrite_all=False, id_col='comm_id'):
  """
  Manually overwrites the tags of the given ids.

  Ids that do not occur in the dataframe are ignored; previously each unknown id silently
  appended a new, otherwise empty row.

    Parameters:
      df: pandas dataframe
      comm_id_list (list): list of ids (values of `id_col`) to be overwritten
      tags_list (list): list of tag names (without the "_tag" suffix) to be overwritten
      tag_val (int): 0 or 1, what value to overwrite tag with
      overwrite_all (bool): if True, first sets every column whose name contains "_tag" to 0
        for the given ids
      id_col (str): name of the id column; defaults to 'comm_id', pass e.g. 'thread_id'
        for thread-level tables

    Returns:
      df: modified pandas dataframe; `id_col` becomes the first column
  """

  df = df.set_index(id_col)

  # Only touch ids that exist, so no rows are created by accident.
  present_ids = [comm_id for comm_id in comm_id_list if comm_id in df.index]

  if present_ids:
    if overwrite_all:
      tag_cols = [c for c in df.columns if "_tag" in c]
      if tag_cols:
        df.loc[present_ids, tag_cols] = 0

    for tag in tags_list:
      df.loc[present_ids, tag + "_tag"] = tag_val

  # The index carries the id column's name, so reset_index restores it as a regular column.
  df = df.reset_index()

  return df

In [None]:
# Main function
def create_thread_auto_tag_tbl(env='dev'):
  """
  Builds the automated tagging table for Nike App threads.

  Reads mchu10.thread_copy_nikeapp, normalises the text and metadata columns,
  applies the copy, header and metadata tagging rules, resolves brand and
  construct conflicts, adds Fields of Play tags and tag categories, and turns
  the result back into a Spark dataframe.

  NOTE: the write step ('dev' -> mchu10 schema, 'prod' -> glbl_marsci_sandbox
  schema) currently lives in the bare string literal that follows this
  function, so as written `env` is not used and no table is written.
  """

  thread_copy = spark.sql("SELECT * FROM mchu10.thread_copy_nikeapp")

  # Lower-case every column that is matched against keywords later on.
  lowercase_cols = ['brand', 'targetted_interest_labels', 'brand_offerings_l2', 'marketing_category',
                    'construct', 'sport_dimension', 'field_of_play', 'target_gender',
                    'primary_gender_construct', 'header', 'template']
  for col_name in lowercase_cols:
    thread_copy = thread_copy.withColumn(col_name, f.lower(f.col(col_name)))

  # Strip apostrophes / spaces so the construct fields share one spelling, then count words in the copy.
  thread_copy = (
    thread_copy
    .withColumn('construct', f.regexp_replace(f.col('construct'), "'", ""))
    .withColumn('target_gender', f.regexp_replace(f.col('target_gender'), " ", ""))
    .withColumn('primary_gender_construct', f.regexp_replace(f.col('primary_gender_construct'), " ", ""))
    .withColumn('word_count', f.size(f.split(f.col('template'), " ")))
  )

  # First non-null construct field, trimmed; empty strings are treated as missing.
  construct_metadata = f.trim(f.coalesce(f.col('construct'), f.col('target_gender'), f.col('primary_gender_construct')))
  thread_copy = thread_copy.withColumn(
    'construct_metadata',
    f.when(construct_metadata == "", None).otherwise(construct_metadata)
  )

  # Add tags on copy
  text_tag_dicts = [promo_dict, test_qa_dict, other_dict, brand_dict, construct_dict,
                    sport_dict, franchise_dict, athlete_dict, division_dict, merchclass_dict,
                    league_dict]
  for text_tag_dict in text_tag_dicts:
    thread_copy = add_text_tags(thread_copy, 'template', text_tag_dict)

  thread_copy_df = thread_copy.toPandas()

  # Add header tags
  thread_copy_df = add_header_tags(thread_copy_df, header_dict)

  # Add tags from metadata fields: (tag, metadata column, patterns), applied in this order
  metadata_rules = [
    ('jordan', 'brand', ['jordan']),
    ('jordan', 'targetted_interest_labels', ['jordan']),
    ('mens', 'targetted_interest_labels', [r'\bmen\b']),
    ('womens', 'targetted_interest_labels', ['women']),
    ('kids', 'targetted_interest_labels', ['boys', 'girls']),
    ('running', 'targetted_interest_labels', ['running']),
    ('fitness', 'targetted_interest_labels', ['training & gym']),
    ('basketball', 'targetted_interest_labels', ['basketball']),
    ('globalfootball', 'targetted_interest_labels', ['soccer']),
    ('nikesb', 'targetted_interest_labels', ['skateboard']),
    ('tennis', 'targetted_interest_labels', ['tennis']),
    ('golf', 'targetted_interest_labels', ['golf']),
    ('yoga', 'targetted_interest_labels', ['yoga']),
    ('nikedance', 'targetted_interest_labels', ['dance']),
    ('trail', 'targetted_interest_labels', ['acg']),
    ('lifestyle', 'targetted_interest_labels', ['lifestyle']),
    ('footwear', 'brand_offerings_l2', ['footwear']),
    ('apparel', 'brand_offerings_l2', ['apparel']),
    ('snkrs', 'brand_offerings_l2', ['snkrs']),
    ('nrc', 'brand_offerings_l2', [r'\bnrc\b']),
    ('ntc', 'brand_offerings_l2', [r'\bntc\b']),
    ('running', 'brand_offerings_l2', ['audio guided run']),
    ('promo', 'marketing_type', ['promo']),
    ('jordan', 'marketing_category', ['jordan']),
    ('mens', 'marketing_category', [r"\bmen's\b"]),
    ('womens', 'marketing_category', ["women's"]),
    ('kids', 'marketing_category', ["kids"]),
    ('running', 'marketing_category', ['running']),
    ('fitness', 'marketing_category', ['training', 'fitness']),
    ('basketball', 'marketing_category', ['basketball']),
    ('globalfootball', 'marketing_category', ['global football']),
    ('nikesb', 'marketing_category', ['nike sb']),
    ('tennis', 'marketing_category', ['tennis']),
    ('golf', 'marketing_category', ['golf']),
    ('americanfootball', 'marketing_category', ['american football']),
    ('trail', 'marketing_category', ['acg']),
    ('nikebyyou', 'marketing_category', ['nike by you']),
    ('lifestyle', 'marketing_category', ['lifestyle', 'nsw']),
    ('holisticfitness', 'marketing_category', ['holistic fitness']),
  ]
  for tag_name, metadata_col, patterns in metadata_rules:
    thread_copy_df = add_tags_from_metadata(thread_copy_df, tag_name, metadata_col, patterns)

  # Infer tags from word hierarchies and relationships
  thread_copy_df = map_tags(thread_copy_df, category_map)

  # Logic for brand
  # Tag any comms without a brand as nike
  other_brand_cols = ['jordan_tag', 'converse_tag', 'hurley_tag']
  thread_copy_df['nike_tag'] = (thread_copy_df[other_brand_cols] == 0).all(axis=1).astype('int')

  # Logic for construct
  # If comm has both mens and womens tag -> tag as dual gender and remove mens and womens tag
  both_genders = (thread_copy_df['mens_tag'] == 1) & (thread_copy_df['womens_tag'] == 1)
  thread_copy_df['dualgender_tag'] = ((thread_copy_df['dualgender_tag'] == 1) | both_genders).astype('int')
  for gender_col in ('mens_tag', 'womens_tag'):
    thread_copy_df[gender_col] = ((thread_copy_df[gender_col] == 1) & (thread_copy_df['dualgender_tag'] == 0)).astype('int')

  # Remove mens, womens, and dual gender tag if tagged as kids
  for adult_col in ('womens_tag', 'mens_tag', 'dualgender_tag'):
    thread_copy_df[adult_col] = ((thread_copy_df[adult_col] == 1) & (thread_copy_df['kids_tag'] != 1)).astype('int')

  # Overwrite construct with already tagged columns: when construct metadata is
  # present it replaces the copy-derived construct tags entirely.
  has_metadata = thread_copy_df['construct_metadata'].map(lambda value: value is not None).astype(bool)
  for construct in ('mens', 'womens', 'dualgender', 'kids'):
    construct_col = construct + '_tag'
    from_copy = (thread_copy_df[construct_col] == 1) & ~has_metadata
    from_metadata = thread_copy_df['construct_metadata'] == construct
    thread_copy_df[construct_col] = (from_copy | from_metadata).astype('int')
  thread_copy_df = thread_copy_df.drop(columns=['construct_metadata'])

  # Add Fields of Play tags and tag categories
  thread_copy_df = add_tag_categories(add_fields_of_play_tags(thread_copy_df), tag_cat_dict)

  # Manual overwrite

  # Exclude tags
  exclusion_rules = [
    (['globalfootball_tag'], ['americanfootball_tag']),
    (['kids_play_all_day_tag', 'mens_lifestyle_tag', 'womens_lifestyle_tag'], ['birthday_tag']),
  ]
  for first_cols, second_cols in exclusion_rules:
    thread_copy_df = exclude_tags(thread_copy_df, first_cols, second_cols)

  # Add tagged column for at least one tag for coverage calculation
  tag_cols = [c for c in thread_copy_df.columns if "_tag" in c]
  thread_copy_df['tagged'] = thread_copy_df[tag_cols].sum(axis=1).clip(upper=1)

  thread_copy = spark.createDataFrame(thread_copy_df)
  thread_copy = thread_copy.fillna(0, subset=tag_cols + ['tagged'])

  tbl_name = "thread_nikeapp_automated_tags"

# NOTE: the write logic below is a bare module-level string, so it is not executed.
"""
  if env == 'prod':
    (
      thread_copy
      .write.format('delta')
      .mode('overwrite')
      .option('overwriteSchema', 'True')
      .option('url', f's3://ngap--glbl-marsci--prod--us-east-1/Owned-Media/{tbl_name}')
      .saveAsTable(f'glbl_marsci_sandbox.{tbl_name}')
    )
    print(f"glbl_marsci_sandbox.{tbl_name} created")
  elif env == 'dev':
    (
      thread_copy
      .write.format('delta')
      .mode('overwrite')
      .option('overwriteSchema', 'True')
      .option('url', f's3://ngap2-user-data/gck/glbl_marsci_sandbox/owned/mchu10/{tbl_name}')
      .saveAsTable(f'mchu10.{tbl_name}')
    )
    print(f"mchu10.{tbl_name} created")
  else:
    raise Exception("Specify env = 'dev' or 'prod'")
"""

In [None]:
# Build the tagging table for each environment in turn, dev first.
for target_env in ('dev', 'prod'):
  create_thread_auto_tag_tbl(env=target_env)

In [None]:
# Load the automated tag table from the glbl_marsci_sandbox schema, keep a
# pandas copy for the analysis cells, and show the Spark dataframe.
auto_tag = spark.sql("SELECT * FROM glbl_marsci_sandbox.thread_nikeapp_automated_tags")
auto_tag_df = auto_tag.toPandas()
display(auto_tag)

In [None]:
def transform_bool_to_list(df, tag_cat_dict):
  """
  Transforms automated tagging boolean columns into comma-separated string
  under tag categories.

  For every category a "<category>_tag_list" column is added that holds the
  names of the category's tags whose "<tag>_tag" column equals 1, joined with
  "," in the order given by tag_cat_dict. Rows with no tag set (and any other
  empty / whitespace-only string in the kept columns) end up as NaN.

    Parameters:
      df: pandas dataframe with the metadata columns listed in keep_cols and
        one "<tag>_tag" column per tag named in tag_cat_dict
      tag_cat_dict (dict): dictionary describing which tags belong in each tag category

    Returns:
      df: new pandas dataframe with the metadata columns plus one list column per category
  """

  keep_cols = ['thread_id', 'thread_card_name', 'card_title', 'card_subtitle', 'card_body', 'action_text',
               'action_destination_type', 'brand', 'targetted_interest_labels', 'brand_offerings_l1',
               'brand_offerings_l2', 'marketing_type', 'marketing_category', 'sport_dimension', 'construct',
               'field_of_play', 'primary_gender_construct', 'target_gender', 'primary_sport_segment',
               'fields_of_play', 'header', 'template', 'word_count']
  list_df = df[keep_cols].copy()

  for category, tags in tag_cat_dict.items():
    tags = list(tags)
    # Row-major boolean matrix: True where the tag column equals 1.
    flags = df[[tag + "_tag" for tag in tags]].eq(1).to_numpy()
    # Join the dictionary's own tag names, so no suffix stripping is needed
    # and an empty category simply yields empty strings.
    list_df[category + "_tag_list"] = [
      ",".join(tag for tag, is_set in zip(tags, row) if is_set)
      for row in flags
    ]

  # Empty or whitespace-only strings (including "no tag set") become NaN.
  list_df = list_df.replace(r'^\s*$', np.nan, regex=True)

  return list_df

In [None]:
# Pivot the 0/1 tag columns into per-category lists and move the result to Spark.
auto_tag_list = spark.createDataFrame(transform_bool_to_list(auto_tag_df, tag_cat_dict))

# The equivalent collapse for construct_tag_list is currently disabled:
# .withColumn(
#   'construct_tag_list',
#   f.when(f.col('construct_tag_list').contains(","), f.lit("multiple_construct")).otherwise(f.col('construct_tag_list'))
# )

# Several fields of play -> "<construct_tag_list>_multiple_fop".
multi_fop_label = f.concat(f.col('construct_tag_list'), f.lit("_multiple_fop"))
auto_tag_list = auto_tag_list.withColumn(
  'fieldsofplay_tag_list',
  f.when(f.col('fieldsofplay_tag_list').contains(","), multi_fop_label).otherwise(f.col('fieldsofplay_tag_list'))
)

# Any other category holding more than one tag collapses to "multiple_<category>".
for category in ('division', 'merchclass', 'sport', 'league', 'athlete', 'franchise'):
  list_col = category + "_tag_list"
  auto_tag_list = auto_tag_list.withColumn(
    list_col,
    f.when(f.col(list_col).contains(","), f.lit("multiple_" + category)).otherwise(f.col(list_col))
  )

In [None]:
# Table name used by the (currently disabled) write to the mchu10 schema.
tbl_name = "thread_nikeapp_automated_tags_list"

# NOTE: the write below is wrapped in a bare string literal, so it is not executed.
"""
(
  auto_tag_list
  .write.format('delta')
  .mode('overwrite')
  .option('overwriteSchema', 'True')
  .option('url', f's3://ngap2-user-data/gck/glbl_marsci_sandbox/owned/mchu10/{tbl_name}')
  .saveAsTable(f'mchu10.{tbl_name}')
)
"""

In [None]:
# Table name used by the (currently disabled) write to the glbl_marsci_sandbox schema.
tbl_name = "thread_nikeapp_automated_tags_list"

# NOTE: the write below is wrapped in a bare string literal, so it is not executed.
"""
(
  auto_tag_list
  .write.format('delta')
  .mode('overwrite')
  .option('overwriteSchema', 'True')
  .option('url', f's3://ngap--glbl-marsci--prod--us-east-1/Owned-Media/{tbl_name}')
  .saveAsTable(f'glbl_marsci_sandbox.{tbl_name}')
)
"""

In [None]:
# Reload the list-format tag table from the glbl_marsci_sandbox schema and show it.
auto_tag_list = spark.sql("SELECT * FROM glbl_marsci_sandbox.thread_nikeapp_automated_tags_list")
display(auto_tag_list)

In [None]:
# Show threads whose construct_tag_list contains a comma, i.e. more than one construct tag.
auto_tag_list.filter(f.col('construct_tag_list').contains(",")).display()

In [None]:
# Draw a random sample of roughly 52 threads that have a construct tag list.
sample_sdf = auto_tag_list.filter(f.col('construct_tag_list').isNotNull())

# count() triggers a Spark job, so run it once and reuse the result.
n_tagged = sample_sdf.count()

# Guard against an empty frame (division by zero) and clamp to 1.0, because a
# sampling fraction above 1 is not valid when sampling without replacement.
frac = min(1.0, 52 / n_tagged) if n_tagged else 0.0
print(n_tagged)
print(frac)
sample_sdf.sample(fraction=frac).display()

# Coverage

In [None]:
# Headline row counts: all comms, comms with copy, and comms not flagged as test/QA.
total_comms = len(auto_tag_df)
comms_with_copy = int(auto_tag_df['template'].notnull().sum())
comms_not_test_qa = int((auto_tag_df['test_qa_tag'] != 1).sum())

print("Total # of Comms: {}".format(total_comms))
print("Total # of Comms with non-null text copy: {}".format(comms_with_copy))
print("Total # of Comms (test/qa removed): {}".format(comms_not_test_qa))

In [None]:
# Non-tag (metadata / copy) columns that must be excluded before summing.
drop_cols = ['thread_id', 'thread_card_name', 'card_title', 'card_subtitle', 'card_body', 'action_text',
            'action_destination_type', 'brand', 'targetted_interest_labels', 'brand_offerings_l1',
            'brand_offerings_l2', 'marketing_type', 'marketing_category', 'sport_dimension', 'construct',
            'field_of_play', 'primary_gender_construct', 'target_gender', 'primary_sport_segment',
            'fields_of_play', 'header', 'template', 'word_count']

# One row per remaining column: how many comms carry it, and as a share of all comms.
column_totals = auto_tag_df.drop(columns=drop_cols).sum(axis=0)
tag_freq = column_totals.rename_axis('tag').reset_index(name='count')
tag_freq['tag'] = tag_freq['tag'].str.replace("_tag", "")
tag_freq['perc'] = tag_freq['count'] / auto_tag_df.shape[0] * 100
display(tag_freq)

In [None]:
# Remove tag columns that never fire (count of 0 in tag_freq).
never_used = tag_freq['count'] == 0
drop_cols = [tag + "_tag" for tag in tag_freq.loc[never_used, 'tag']]
print(drop_cols)
auto_tag_df = auto_tag_df.drop(columns=drop_cols)

In [None]:
# Bar chart of coverage (% of comms) for the construct tags, largest count first (top 15).
is_construct_tag = tag_freq['tag'].isin(tag_cat_dict['construct'])
plot_df = tag_freq[is_construct_tag].sort_values(by='count', ascending=False).iloc[:15]

axis_labels = {'tag': 'Tag', 'perc': 'Comms %'}
fig = px.bar(
  plot_df,
  x='tag',
  y='perc',
  text='perc',
  template='plotly_white',
  title='<b>Construct Tags Coverage</b>',
  labels=axis_labels,
  height=600,
  width=800,
)

fig.update_traces(texttemplate='<b>%%{text: .1f}</b>')
fig.update_layout(font_size=16)

fig.show()

In [None]:
# Bar chart of coverage (% of comms) for the tag categories, largest count first (top 15).
is_category = tag_freq['tag'].isin(list(tag_cat_dict))
plot_df = tag_freq[is_category].sort_values(by='count', ascending=False).iloc[:15]

axis_labels = {'tag': 'Tag Category', 'perc': 'Comms %'}
fig = px.bar(
  plot_df,
  x='tag',
  y='perc',
  text='perc',
  template='plotly_white',
  title='<b>Tag Category Coverage</b>',
  labels=axis_labels,
  height=600,
  width=800,
)

fig.update_traces(texttemplate='<b>%%{text: .0f}</b>')
fig.update_layout(font_size=16)

fig.show()

# QA

In [None]:
%sql
-- Plan executions whose fields_of_play collection has a non-zero cardinality.
select * from content_planning.plan_executions where cardinality(fields_of_play) != 0

In [None]:
# Join control-plane thread keys to CMS thread ids and to the content-planning
# executions, keeping the planning labels for each thread.
airtable_query = """
WITH published_airtable_thread_keys AS (
  SELECT
    thread_key,
    id
  FROM
    cms.cms_thread_external_attributes
  WHERE
    domain = 'control_plane'
  GROUP BY
    thread_key,
    id
),
cms_thread_keys AS (
  SELECT DISTINCT
    thread_key,
    thread_id
  FROM cms.cms_thread
)
SELECT
  b.thread_id,
  c.name as execution_name,
  c.season,
  c.primary_gender_construct,
  c.target_gender,
  c.primary_sport_segment,
  c.fields_of_play
FROM
  published_airtable_thread_keys a
  JOIN cms_thread_keys b ON a.thread_key = b.thread_key
  JOIN content_planning.plan_executions c ON a.id = c.id
"""
airtable_tags = spark.sql(airtable_query)

# Collapse to one row per thread by taking the max of each planning label.
label_cols = ['primary_gender_construct', 'target_gender', 'primary_sport_segment', 'fields_of_play']
airtable_tags = airtable_tags.groupBy('thread_id').agg(
  *[f.max(label_col).alias(label_col) for label_col in label_cols]
)

display(airtable_tags)

In [None]:
# Compare the automated construct tags with the construct labels carried in metadata.
tag_comp_df = auto_tag_list.toPandas()
display(tag_comp_df)

for label_col in ('construct', 'primary_gender_construct', 'target_gender'):
  print(f"Non-null {label_col}: {tag_comp_df[label_col].notnull().sum()}")

# Put both planning labels in lower case without spaces before comparing.
for label_col in ('primary_gender_construct', 'target_gender'):
  tag_comp_df[label_col] = tag_comp_df[label_col].str.lower().str.replace(" ", "")

planning_labels_agree = (tag_comp_df['primary_gender_construct'] == tag_comp_df['target_gender']).sum()
tag_matches_target_gender = (tag_comp_df['construct_tag_list'] == tag_comp_df['target_gender']).sum()
tag_matches_construct = (tag_comp_df['construct_tag_list'] == tag_comp_df['construct']).sum()

print("Number of threads where primary_gender_construct equals target_gender: {}".format(planning_labels_agree))
print("Number of threads where construct tag equals target_gender: {}".format(tag_matches_target_gender))
print("Number of threads where construct tag equals construct: {}".format(tag_matches_construct))

In [None]:
# Pandas copy of the list-format tags, with the label columns stripped of the
# characters that the tag names do not contain (apostrophes / spaces).
auto_tag_list_df = auto_tag_list.toPandas()
for label_col, unwanted in (('construct', "'"), ('sport_dimension', " ")):
  auto_tag_list_df[label_col] = auto_tag_list_df[label_col].str.replace(unwanted, "")

In [None]:
# Frequency of each automated construct tag list value.
auto_tag_list_df['construct_tag_list'].value_counts()

In [None]:
# Frequency of each value in the 'construct' metadata column.
auto_tag_list_df['construct'].value_counts()

In [None]:
# Frequency of each automated sport tag list value.
auto_tag_list_df['sport_tag_list'].value_counts()

In [None]:
# Frequency of each value in the 'sport_dimension' metadata column.
auto_tag_list_df['sport_dimension'].value_counts()

In [None]:
# Score the automated construct tag against the 'construct' label, using only
# rows where that label is present.
eval_df = auto_tag_list_df[auto_tag_list_df['construct'].notnull()].copy()
eval_df['correct'] = (eval_df['construct_tag_list'] == eval_df['construct']).astype(int)
# Untagged rows hold NaN; give them an explicit placeholder class (as the
# no-metadata evaluation cell does) so the metric functions only see strings.
# Done after 'correct' is computed, so the accuracy is unaffected.
eval_df['construct_tag_list'] = eval_df['construct_tag_list'].fillna("Null Value")
print("'construct' accuracy (vs. CMS labels): {}".format(eval_df['correct'].sum() / eval_df.shape[0]))

# Sorted so the metrics table has a stable row order between runs.
values = sorted(set(eval_df['construct']))
# One-vs-rest precision / recall for each label value.
precision = [
  precision_score(eval_df['construct'], eval_df['construct_tag_list'], labels=[value], average=None, zero_division=0)[0]
  for value in values
]
recall = [
  recall_score(eval_df['construct'], eval_df['construct_tag_list'], labels=[value], average=None, zero_division=0)[0]
  for value in values
]
metrics_df = pd.DataFrame({
  'actual': values,
  'precision': precision,
  'recall': recall
})
display(metrics_df)

In [None]:
# Score the automated sport tag against the 'sport_dimension' label, using only
# rows where that label is present and is not 'nikebyyou'.
eval_df = auto_tag_list_df[(auto_tag_list_df['sport_dimension'].notnull()) & (auto_tag_list_df['sport_dimension'] != 'nikebyyou')].copy()
eval_df['correct'] = (eval_df['sport_tag_list'] == eval_df['sport_dimension']).astype(int)
# Untagged rows hold NaN; give them an explicit placeholder class so the metric
# functions only see strings. Done after 'correct' is computed, so the accuracy
# is unaffected.
eval_df['sport_tag_list'] = eval_df['sport_tag_list'].fillna("Null Value")
print("'sport dimension' accuracy: {}".format(eval_df['correct'].sum() / eval_df.shape[0]))

# Sorted so the metrics table has a stable row order between runs.
values = sorted(set(eval_df['sport_dimension']))
# One-vs-rest precision / recall for each label value.
precision = [
  precision_score(eval_df['sport_dimension'], eval_df['sport_tag_list'], labels=[value], average=None, zero_division=0)[0]
  for value in values
]
recall = [
  recall_score(eval_df['sport_dimension'], eval_df['sport_tag_list'], labels=[value], average=None, zero_division=0)[0]
  for value in values
]
metrics_df = pd.DataFrame({
  'actual': values,
  'precision': precision,
  'recall': recall
})
display(metrics_df)

In [None]:
# Auto Tag Table without metadata
def create_thread_auto_tag_tbl(env='dev'):
  """
  Builds the automated tagging table from copy and header text only.

  Same pipeline as the metadata version, but no tags are derived from the
  metadata columns and the construct tags are not overridden by construct
  metadata.

  NOTE: the write step ('dev' -> mchu10 schema, 'prod' -> glbl_marsci_sandbox
  schema) and the call to this function currently live in the bare string
  literal that follows, so as written `env` is not used and no table is written.
  """

  thread_copy = spark.sql("SELECT * FROM mchu10.thread_copy_nikeapp")

  # Lower-case every column that is matched against keywords later on.
  lowercase_cols = ['brand', 'targetted_interest_labels', 'brand_offerings_l2', 'marketing_category',
                    'construct', 'sport_dimension', 'field_of_play', 'target_gender',
                    'primary_gender_construct', 'header', 'template']
  for col_name in lowercase_cols:
    thread_copy = thread_copy.withColumn(col_name, f.lower(f.col(col_name)))

  # Strip apostrophes / spaces so the construct fields share one spelling, then count words in the copy.
  thread_copy = (
    thread_copy
    .withColumn('construct', f.regexp_replace(f.col('construct'), "'", ""))
    .withColumn('target_gender', f.regexp_replace(f.col('target_gender'), " ", ""))
    .withColumn('primary_gender_construct', f.regexp_replace(f.col('primary_gender_construct'), " ", ""))
    .withColumn('word_count', f.size(f.split(f.col('template'), " ")))
  )

  # First non-null construct field, trimmed; empty strings are treated as missing.
  # (The column is dropped again below without being used for tagging.)
  construct_metadata = f.trim(f.coalesce(f.col('construct'), f.col('target_gender'), f.col('primary_gender_construct')))
  thread_copy = thread_copy.withColumn(
    'construct_metadata',
    f.when(construct_metadata == "", None).otherwise(construct_metadata)
  )

  # Add tags on copy
  text_tag_dicts = [promo_dict, test_qa_dict, other_dict, brand_dict, construct_dict,
                    sport_dict, franchise_dict, athlete_dict, division_dict, merchclass_dict,
                    league_dict]
  for text_tag_dict in text_tag_dicts:
    thread_copy = add_text_tags(thread_copy, 'template', text_tag_dict)

  thread_copy_df = thread_copy.toPandas()

  # Add header tags
  thread_copy_df = add_header_tags(thread_copy_df, header_dict)

  # Infer tags from word hierarchies and relationships
  thread_copy_df = map_tags(thread_copy_df, category_map)

  # Logic for brand
  # Tag any comms without a brand as nike
  other_brand_cols = ['jordan_tag', 'converse_tag', 'hurley_tag']
  thread_copy_df['nike_tag'] = (thread_copy_df[other_brand_cols] == 0).all(axis=1).astype('int')

  # Logic for construct
  # If comm has both mens and womens tag -> tag as dual gender and remove mens and womens tag
  both_genders = (thread_copy_df['mens_tag'] == 1) & (thread_copy_df['womens_tag'] == 1)
  thread_copy_df['dualgender_tag'] = ((thread_copy_df['dualgender_tag'] == 1) | both_genders).astype('int')
  for gender_col in ('mens_tag', 'womens_tag'):
    thread_copy_df[gender_col] = ((thread_copy_df[gender_col] == 1) & (thread_copy_df['dualgender_tag'] == 0)).astype('int')

  # Remove mens, womens, and dual gender tag if tagged as kids
  for adult_col in ('womens_tag', 'mens_tag', 'dualgender_tag'):
    thread_copy_df[adult_col] = ((thread_copy_df[adult_col] == 1) & (thread_copy_df['kids_tag'] != 1)).astype('int')
  thread_copy_df = thread_copy_df.drop(columns=['construct_metadata'])

  # Add Fields of Play tags and tag categories
  thread_copy_df = add_tag_categories(add_fields_of_play_tags(thread_copy_df), tag_cat_dict)

  # Manual overwrite

  # Exclude tags
  exclusion_rules = [
    (['globalfootball_tag'], ['americanfootball_tag']),
    (['kids_play_all_day_tag', 'mens_lifestyle_tag', 'womens_lifestyle_tag'], ['birthday_tag']),
  ]
  for first_cols, second_cols in exclusion_rules:
    thread_copy_df = exclude_tags(thread_copy_df, first_cols, second_cols)

  # Add tagged column for at least one tag for coverage calculation
  tag_cols = [c for c in thread_copy_df.columns if "_tag" in c]
  thread_copy_df['tagged'] = thread_copy_df[tag_cols].sum(axis=1).clip(upper=1)

  thread_copy = spark.createDataFrame(thread_copy_df)
  thread_copy = thread_copy.fillna(0, subset=tag_cols + ['tagged'])

  tbl_name = "thread_nikeapp_automated_tags_no_metadata"

# NOTE: the write logic and the function call below are a bare module-level string, so neither is executed.
"""
  if env == 'prod':
    (
      thread_copy
      .write.format('delta')
      .mode('overwrite')
      .option('overwriteSchema', 'True')
      .option('url', f's3://ngap--glbl-marsci--prod--us-east-1/Owned-Media/{tbl_name}')
      .saveAsTable(f'glbl_marsci_sandbox.{tbl_name}')
    )
    print(f"glbl_marsci_sandbox.{tbl_name} created")
  elif env == 'dev':
    (
      thread_copy
      .write.format('delta')
      .mode('overwrite')
      .option('overwriteSchema', 'True')
      .option('url', f's3://ngap2-user-data/gck/glbl_marsci_sandbox/owned/mchu10/{tbl_name}')
      .saveAsTable(f'mchu10.{tbl_name}')
    )
    print(f"mchu10.{tbl_name} created")
  else:
    raise Exception("Specify env = 'dev' or 'prod'")

create_thread_auto_tag_tbl(env='dev')
"""

In [None]:
# Load the no-metadata tag table and pivot its 0/1 tag columns into per-category lists.
auto_tag = spark.sql("SELECT * FROM mchu10.thread_nikeapp_automated_tags_no_metadata")
auto_tag_df = auto_tag.toPandas()

auto_tag_list = spark.createDataFrame(transform_bool_to_list(auto_tag_df, tag_cat_dict))

# The equivalent collapse for construct_tag_list is currently disabled:
# .withColumn(
#   'construct_tag_list',
#   f.when(f.col('construct_tag_list').contains(","), f.lit("multiple_construct")).otherwise(f.col('construct_tag_list'))
# )

# Several fields of play -> "<construct_tag_list>_multiple_fop".
multi_fop_label = f.concat(f.col('construct_tag_list'), f.lit("_multiple_fop"))
auto_tag_list = auto_tag_list.withColumn(
  'fieldsofplay_tag_list',
  f.when(f.col('fieldsofplay_tag_list').contains(","), multi_fop_label).otherwise(f.col('fieldsofplay_tag_list'))
)

# Any other category holding more than one tag collapses to "multiple_<category>".
for category in ('division', 'merchclass', 'sport', 'league', 'athlete', 'franchise'):
  list_col = category + "_tag_list"
  auto_tag_list = auto_tag_list.withColumn(
    list_col,
    f.when(f.col(list_col).contains(","), f.lit("multiple_" + category)).otherwise(f.col(list_col))
  )

tbl_name = "thread_nikeapp_automated_tags_list_no_metadata"

# NOTE: the write below is wrapped in a bare string literal, so it is not executed.
"""
(
  auto_tag_list
  .write.format('delta')
  .mode('overwrite')
  .option('overwriteSchema', 'True')
  .option('url', f's3://ngap2-user-data/gck/glbl_marsci_sandbox/owned/mchu10/{tbl_name}')
  .saveAsTable(f'mchu10.{tbl_name}')
)
"""

In [None]:
# Reload the no-metadata list table from the mchu10 schema into pandas.
auto_tag_list = spark.sql("SELECT * FROM mchu10.thread_nikeapp_automated_tags_list_no_metadata")
auto_tag_list_df = auto_tag_list.toPandas()
# Drop apostrophes from the 'construct' label before it is compared with the tag names.
auto_tag_list_df['construct'] = auto_tag_list_df['construct'].str.replace("'", "")
display(auto_tag_list_df)

In [None]:
# Score the copy-only construct tag against the 'construct' label, using only
# rows where that label is present.
has_label = auto_tag_list_df['construct'].notnull()
eval_df = auto_tag_list_df[has_label].copy()
eval_df['correct'] = (eval_df['construct_tag_list'] == eval_df['construct']).astype(int)
# Untagged rows become an explicit placeholder class for the metric functions.
eval_df['construct_tag_list'] = eval_df['construct_tag_list'].fillna("Null Value")
print("'construct' accuracy (vs. CMS labels): {}".format(eval_df['correct'].sum() / eval_df.shape[0]))

# One-vs-rest precision / recall for each distinct label value.
values = list(set(eval_df['construct']))
precision = [
  precision_score(eval_df['construct'], eval_df['construct_tag_list'], labels=[value], average=None, zero_division=0)[0]
  for value in values
]
recall = [
  recall_score(eval_df['construct'], eval_df['construct_tag_list'], labels=[value], average=None, zero_division=0)[0]
  for value in values
]
metrics_df = pd.DataFrame({'actual': values, 'precision': precision, 'recall': recall})
display(metrics_df)

In [None]:
# Reload the no-metadata tag table (0/1 columns) for the sport evaluation.
auto_tag = spark.sql("SELECT * FROM mchu10.thread_nikeapp_automated_tags_no_metadata")
auto_tag_df = auto_tag.toPandas()
# Remove spaces from the sport_dimension label before it is used to look up tag columns.
auto_tag_df['sport_dimension'] = auto_tag_df['sport_dimension'].str.replace(" ", "")
# Treat empty labels as missing. np.nan is used rather than None because
# Series.replace(value=None) is version-dependent in pandas (older releases
# forward-fill instead of inserting a null).
auto_tag_df['sport_dimension'] = auto_tag_df['sport_dimension'].replace("", np.nan)
display(auto_tag_df)

In [None]:
# Score each sport tag column against the one-hot encoded 'sport_dimension'
# label, using only rows where that label is present and is not 'nikebyyou'.
usable_label = auto_tag_df['sport_dimension'].notnull() & (auto_tag_df['sport_dimension'] != 'nikebyyou')
eval_df = auto_tag_df[usable_label].copy()

# Capture the distinct labels before get_dummies replaces the column.
values = list(set(eval_df['sport_dimension']))

eval_df = pd.get_dummies(eval_df, columns=['sport_dimension'])

# Binary precision / recall: "sport_dimension_<value>" is the truth, "<value>_tag" the prediction.
precision = [
  precision_score(eval_df[f'sport_dimension_{value}'], eval_df[f'{value}_tag'], zero_division=0)
  for value in values
]
recall = [
  recall_score(eval_df[f'sport_dimension_{value}'], eval_df[f'{value}_tag'], zero_division=0)
  for value in values
]
metrics_df = pd.DataFrame({'actual': values, 'precision': precision, 'recall': recall})
display(metrics_df)