# Safegraph Lookalike Model

This notebook implements the Cosine Medoid Lookalike Model to predict a user's interest in an offline brand.

Ideal users that have visited a specific offline brand are used to find a "golden seed" (the cosine medoid). We then compare every user to this golden seed by computing their cosine similarity, and accept the users whose similarity score is above a pre-defined percentile threshold.

To use this notebook, edit the cells in the `Results` and `Experiments` sections.

#### Outputs (see results/experiments section below):
- Lookalike Model classification plot: `plotly_results/{filename}.html`
- Center distribution experiment plot: `plotly_results/{center_fn}.html`

## Setup

In [None]:
import os
import pandas as pd
import numpy as np
from numpy import linalg as la

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import pairwise_distances

from math import floor
import pickle

import plotly.graph_objects as go

## Dataframe Processing

In [None]:
def preprocess(data):
    """ Normalise the text columns of a label DF

    Processing:
        - keywords from Watson are joined by "_"; these become spaces
          (this step is written back into the `keywords` column of the DF passed in)
        - the `domains`, `titles` and `keywords` columns are lowercased

    :return: A new DF with the text columns lowercased; all other columns pass through unchanged
    """
    text_columns = ['domains', 'titles', 'keywords']

    def _lower_if_text(column):
        # Only the free-text columns are lowercased
        if column.name in text_columns:
            return column.str.lower()
        return column

    data.loc[:, 'keywords'] = data['keywords'].str.replace("_", " ")
    return data.apply(_lower_if_text, axis=0)

def label_embeddings(emb, labels, group, num_embeddings=None):
    """ Add text labels to an embeddings DF

    Labels are matched to embeddings by row position: row i of `emb` receives the i-th row of
    `labels`, whatever index `labels` carries (for example one with gaps left by `dropna()`).
    NOTE(review): this assumes the embeddings were generated in the row order of `labels` - confirm.

    :param emb: 2-D array of embeddings, one row per item
    :param labels: DF with `titles`, `keywords` and `brands` columns
    :param group: Group name written to every row
    :param num_embeddings: Number of leading rows to label (defaults to every row of `emb`).
        Rows past this count, or past the end of `labels`, keep NaN labels
    :return: DF with one `dim{i}` column per feature plus `titles`, `keywords`, `brands`, `group`
    """
    num_embed = emb.shape[0] if num_embeddings is None else num_embeddings
    num_feats = emb.shape[1]
    df = pd.DataFrame(emb, columns=["dim" + str(i) for i in range(num_feats)])

    # Column assignment aligns on the index, so give the labels the same 0..n-1 positions as
    # the embeddings first. `.iloc` takes exactly `num_embed` rows (a `.loc` slice is
    # label-based and includes its end point).
    positional = labels.reset_index(drop=True).iloc[:num_embed]
    for col in ("titles", "keywords", "brands"):
        df[col] = positional[col]

    df["group"] = group
    return df

def centroid_to_dataframe(centroid, labels, idx):
    """ Create a dummy 1-row DF for the centroid for easy concatenation later on

    :param centroid: The center embedding, as a 1 x d array (a flat length-d vector is also accepted)
    :param labels: DF with `titles`, `keywords` and `brands` columns, looked up with `.loc[idx]`
    :param idx: Index label in `labels` of the row the centroid was taken from
    :return: DF with one `dim{i}` column per feature, the labels of row `idx`, and group "centroid"
    """
    # Take the feature count from the centroid itself rather than from a notebook-level
    # global, so the function works regardless of which cells have been run.
    centroid = np.atleast_2d(centroid)
    width = centroid.shape[1]

    df = pd.DataFrame(centroid, columns=["dim" + str(i) for i in range(width)])
    for col in ("titles", "keywords", "brands"):
        df[col] = labels.loc[idx, col]
    df["group"] = "centroid"
    return df

def add_hover_text(df, char_lim=500):
    """ Build the plotly hover label of every row and store it in a new `hover` column

    Keywords and titles are cropped to `char_lim` characters and wrapped at 75 characters,
    with line breaks written as `<br>`. The DF is modified in place; nothing is returned.
    """
    def _as_html_lines(column):
        # Crop first, then wrap, then turn the wrap's newlines into HTML breaks
        cropped = df[column].astype(str).str[:char_lim]
        return cropped.str.wrap(75).apply(lambda text: text.replace('\n', '<br>'))

    similarity = "Cosine Similarity: " + df['cos_sim'].astype(str)
    keywords = "<br><br>Keywords: " + _as_html_lines('keywords')
    titles = "<br><br>Titles: " + _as_html_lines('titles')
    df['hover'] = similarity + keywords + titles

## Cosine Lookalike Model

In [None]:
def cos_sim(a, b):
    """ Cosine similarity of `a` and `b`

    The first element of the normalised product is returned, so `a @ b` must be indexable
    (e.g. `a` is a 1 x d row and `b` a length-d vector).
    """
    dot = a @ b
    scale = la.norm(a) * la.norm(b)
    return (dot / scale)[0]

def euc_dist(a, b):
    """ Norm of the difference `a - b` (the Euclidean distance for vectors) """
    difference = a - b
    return la.norm(difference)

def compute_center_kmedoids(data):
    """ Use KMedoids with a single cosine-metric cluster to compute the center

    :return: The fitted cluster centers and the index of the (only) medoid in `data`
    """
    clusterer = KMedoids(n_clusters=1, metric="cosine")
    clusterer.fit(data)
    medoid_index = clusterer.medoid_indices_[0]
    return clusterer.cluster_centers_, medoid_index

def compute_center(data):
    """ Directly compute the medoid of the set

    The medoid is the row whose summed cosine distance to every row is smallest.

    :return: The medoid as a 1 x d array, and its row index in `data`
    """
    distances = pairwise_distances(data, metric='cosine')
    idx = distances.sum(axis=0).argmin()
    medoid = data[idx]
    return np.expand_dims(medoid, axis=0), idx
    
    
def add_prediction(data, centroid, num_feats, percentile):
    """ Score every row against the centroid and classify it, in place

    Adds a `cos_sim` column (cosine similarity between the centroid and the row's
    `dim0..dim{num_feats-1}` values), sorts the DF by it in descending order, then adds a
    `pred` column: "positive" for rows at or above the given percentile of `cos_sim`,
    "negative" otherwise.
    """
    feature_cols = ['dim{}'.format(str(i)) for i in range(num_feats)]

    def _similarity_to_center(row):
        vector = np.array([row[col] for col in feature_cols])
        return cos_sim(centroid, vector)

    data["cos_sim"] = data.apply(_similarity_to_center, axis=1)
    data.sort_values(by="cos_sim", ascending=False, inplace=True)

    threshold = np.percentile(data['cos_sim'].values, percentile)

    def _classify(row):
        if row["cos_sim"] >= threshold:
            return "positive"
        return "negative"

    data["pred"] = data.apply(_classify, axis=1)

## Evaluation

In [None]:
def pred_interpretation(data, main_group):
    """ Print accuracy, precision, and recall analysis

    A row counts as predicted-positive when `pred` is "positive", and as actually positive
    when `group` equals `main_group`. A rate whose denominator is zero (no rows, no predicted
    positives, or no rows from `main_group`) is reported as 0% instead of raising.

    :param data: DF with `pred` and `group` columns
    :param main_group: The group name treated as the positive class
    """
    is_positive = data['pred'] == "positive"
    in_main = data['group'] == main_group

    # Plain ints keep the arithmetic (and the printed counts) identical to len()-based counting
    TP = int((is_positive & in_main).sum())
    TN = int((~is_positive & ~in_main).sum())
    FP = int((is_positive & ~in_main).sum())
    FN = int((~is_positive & in_main).sum())

    def _percent(numerator, denominator):
        # An undefined rate is reported as 0 rather than crashing the evaluation cell
        return numerator / denominator * 100 if denominator else 0.0

    accuracy = _percent(TP + TN, TP + TN + FP + FN)
    precision = _percent(TP, TP + FP)
    recall = _percent(TP, TP + FN)
    print('FN={}, TN={}, TP={}, FP={}'.format(FN,TN,TP,FP))
    print("We have an accuracy={:.0f}%, precision={:.0f}% and recall={:.0f}% ".format(accuracy, precision, recall))
    
    
def dim_reduce(data):
    """ Reduce labelled input to 2 dimensions

    Every column other than the label columns is fed to a 2-component TSNE (random_state=0).

    :return: DF with `component 1` / `component 2`, followed by the input's former index
        (as an `index` column) and its label columns
    """
    label_cols = ['titles', 'keywords', 'brands', 'group', 'cos_sim', 'pred']

    # TODO: take only embeddings as input and move label-dropping outside this function
    features = data.drop(columns=label_cols).values

    projector = TSNE(n_components=2, random_state=0)
    projected = pd.DataFrame(data=projector.fit_transform(features),
                             columns=['component 1', 'component 2'])

    # Resetting the index lines the labels up row-for-row with the projected points
    labels = data[label_cols].reset_index()
    return pd.concat([projected, labels], axis=1)

def split_preds(df, main_group, comp_group):
    """ Split predictions into true/false positive/negative

    `main_group` rows are the actual positives and `comp_group` rows the actual negatives.

    :return: The 5-tuple (tn, fp, tp, fn, center), where center holds the "centroid" group rows
    """
    predicted_pos = df['pred'] == 'positive'
    predicted_neg = df['pred'] == 'negative'
    from_main = df['group'] == main_group
    from_comp = df['group'] == comp_group

    tn = df[predicted_neg & from_comp]
    fp = df[predicted_pos & from_comp]
    tp = df[predicted_pos & from_main]
    fn = df[predicted_neg & from_main]
    center = df[df['group'] == 'centroid']
    return tn, fp, tp, fn, center

## Aggregate Model Generation

In [None]:
def gen_model(main_data, comp_data, main_group, comp_group, percentile, test_size=0.3, random_state=None):
    """ Compute a center and lookalike predictions

    The main-group data is split into train and test sets; the center is the medoid of the
    training embeddings, and predictions are made on the comparison data, the held-out test
    rows and the center itself.

    :param main_data: Labelled embeddings DF of the brand's users
    :param comp_data: Labelled embeddings DF of the comparison users
    :param main_group: Name of the main group (not used by this function)
    :param comp_group: Name of the comparison group (not used by this function)
    :param percentile: Percentile of cosine similarity at or above which a row is "positive"
    :param test_size: Share of `main_data` held out from the center computation
    :param random_state: Seed passed to the train/test split; None keeps the split random
    :return: Returns the following 3-tuple
        - A DF of all embeddings (including the center) reduced to 2 dimensions
        - The computed center as a DF
        - The fully labelled embeddings DF with cos-sim and predictions
    """
    train, test = train_test_split(main_data, test_size=test_size, random_state=random_state)

    # Compute Center
    X_train = train.drop(axis=1, columns=["group","titles","keywords","brands"]).values
    # Take the feature count from the data in hand instead of a notebook-level global
    num_feats = X_train.shape[1]
    center, idx = compute_center(X_train)
    # reset_index() makes the positional medoid index usable as a label lookup
    center_data = centroid_to_dataframe(center, train.reset_index(), idx)

    # Create lookalike model
    predDf = pd.concat([comp_data, test, center_data])
    add_prediction(predDf, center, num_feats, percentile)

    return dim_reduce(predDf), center_data, predDf        # Reduce and return

## Visualization

In [None]:
def plot_figure(filename, main_group, comp_group, TN, FP, TP, FN, center_pred):
    """ Visualize lookalike model classification

    Each of the four prediction outcomes is drawn as its own colored marker series, and the
    center as a large orange cross. `filename` is accepted but not used here.

    :returns: A plotly figure to be saved and rendered
    """
    # (points, marker color, legend name) for every outcome
    series = [
        (TN, 'red', comp_group + ' negative'),
        (FP, 'blue', comp_group + ' positive'),
        (TP, 'green', main_group + ' positive'),
        (FN, 'pink', main_group + ' negative'),
    ]

    # Plot provided embeddings
    traces = [
        go.Scatter(
            x=points['component 1'],
            y=points['component 2'],
            mode='markers',
            marker=dict(size=5, color=color),
            name=label,
            hovertext=points['hover'])
        for points, color, label in series
    ]

    # Plot center
    center_marker = dict(color='orange', size=20, symbol='x-dot')
    traces.append(go.Scatter(
        mode='markers',
        x=center_pred['component 1'],
        y=center_pred['component 2'],
        name="Center",
        hovertext=center_pred['hover'],
        marker=center_marker))

    x_axis = dict(title='component 1', ticklen=5, zeroline=False, gridwidth=2)
    y_axis = dict(title='component 2', ticklen=5, gridwidth=2)
    figure_layout = go.Layout(
        title="2 components TSNE",
        hovermode='closest',
        xaxis=x_axis,
        yaxis=y_axis,
        width=1000,
        height=1000)
    return go.Figure(data=traces, layout=figure_layout)

# Results

### Load and Label Data

This is usually the only cell you will need to edit. Simply change the variables within this cell to fit your job, and then run the notebook.

In [None]:
# Names of the two groups and of the output plot file
main_group = 'gamestop'
comp_group = 'full_merge'
filename = 'all_brands_gamestop_strict_SBERT-post'

# Main-group labels: drop incomplete rows, keep the first 2000, tag the brand, clean the text
main_label = pd.read_csv("data/safegraph/merge_brands/safegraph_gamestop_raw.csv").dropna().head(2000)
main_label["brands"] = "Gamestop"
main_label = preprocess(main_label)

# Comparison-group labels: same treatment, without a brand
comp_label = pd.read_csv("data/safegraph/merge_brands/safegraph_full_merge_ndomain7.csv").dropna()
comp_label["brands"] = "N/A"
comp_label = preprocess(comp_label)

# Pre-computed embeddings; both groups must have the same number of features
main_emb = np.load('npy_pickles/gamestop_raw_2000_SBERT.npy')
comp_emb = np.load('npy_pickles/full_merge_SBERT.npy')
assert main_emb.shape[1] == comp_emb.shape[1]
num_feats = main_emb.shape[1]

In [None]:
# Attach the text labels and the group name to each set of embeddings
main_data = label_embeddings(emb=main_emb, labels=main_label, group=main_group)
comp_data = label_embeddings(emb=comp_emb, labels=comp_label, group=comp_group)

### Generate Lookalike Model for the brand

In [None]:
%%time
# filename = 'all_brands_gamestop_strict_euc'
redDf, centerDf, predDf = gen_model(main_data, comp_data, main_group, comp_group, 60)
pred_interpretation(predDf, main_group)

### Evaluate Lookalike Model

In [None]:
# Hover labels must exist before the split, since every split part is plotted with them
add_hover_text(df=redDf)
split = split_preds(df=redDf, main_group=main_group, comp_group=comp_group)

In [None]:
# `split` unpacks into the TN, FP, TP, FN and center arguments, in that order
fig = plot_figure(filename, main_group, comp_group, *split)
html_path = "plotly_results/{0}.html".format(filename)
fig.write_html(html_path)
fig.show()

# Experiments

This section details further investigations taken to analyze the proposed lookalike model

## Center Distribution

We generate 1000 different train/test splits and plot the centers computed from the training sets. We observe from the resulting graph that there are many repeated centers, showing good precision.

**Problem:** Not all of the most common centers are necessarily closely related to the brand, so we should find a way to retain only the centers that we deem relevant. Since generating the "Golden Seed" should be a one-time task (or at most a task repeated a few times), we can simply have a human examine the centers generated by this experiment and select the best-fitting ones.

### Generate and label centers

You may need to change the filename in the cell below; the data itself is already loaded above. It may be better to move the filename variable to its own cell so that it is easier to keep track of.

In [None]:
%%time

center_fn = "centroids_SBERT-post"

# Start with all data points, and merge the new centers together with the original data
dfs = [main_data]

N = 1000
for i in range(N):
    train, test = train_test_split(main_data, test_size=0.3)
    
    # Compute Center
    X_train = train.drop(axis=1, columns=["group","titles","keywords","brands"]).values
    center, idx = compute_center(X_train)
    center_data = centroid_to_dataframe(center, train.reset_index(), idx)
    dfs.append(center_data)

# Merge all data points into a single DF and add dummy labels
mergedDf = pd.concat(dfs)
mergedDf['cos_sim'] = 0
mergedDf['pred'] = 'n/a'

### Post-Processing

In [None]:
%%time

reducedDf = dim_reduce(mergedDf)
add_hover_text(reducedDf)

main = reducedDf[reducedDf['group'] != "centroid"]
centers = reducedDf[reducedDf['group'] == "centroid"]

### Visualization

The resulting plot will be saved as an html file

In [None]:
# Original data points, drawn in blue
main_trace = go.Scatter(
    x=main['component 1'],
    y=main['component 2'],
    mode='markers',
    marker=dict(size=5, color='blue'),
    name=main_group,
    hovertext=main['hover'])

# Generated centers, drawn in orange over the data points
center_trace = go.Scatter(
    mode='markers',
    x=centers['component 1'],
    y=centers['component 2'],
    name="Center",
    hovertext=centers['hover'],
    marker=dict(color='orange', size=5))

xy = [main_trace, center_trace]

layout = go.Layout(
    title="{0} Generated Centroids".format(N),
    hovermode='closest',
    xaxis=dict(title='component 1', ticklen=5, zeroline=False, gridwidth=2),
    yaxis=dict(title='component 2', ticklen=5, gridwidth=2),
    width=1000,
    height=1000)

fig = go.Figure(data=xy, layout=layout)
fig.write_html("plotly_results/{0}.html".format(center_fn))
fig.show()