In [1]:
import json
import warnings

import pandas as pd
import numpy as np


from scipy.stats import pearsonr
from collections import Counter
from math import log, ceil, floor
from scipy.stats import entropy
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

In [2]:
# load dataframe
infile = '../data/full-udrp-parsed-proceedings.jsonl.gz'

df = pd.read_json(
    infile, 
    lines = True, 
    convert_dates=['submitted', 'commenced', 'panel_appointed', 'published']
)

# drop lines with inconsistent values
dfFoc = df[(df.submitted >= "1999-12-31") & (df.published >= df.submitted)].copy()

# get statistics about the legal representatives
firmsStats = dfFoc.groupby('represented_by').agg({'source': 'nunique', 'number': 'nunique', 'complainants': lambda x: len(set().union(*x.values))})

### Different groups of legal representatives

In [3]:
# G1: the Loyals
g1_df = firmsStats[firmsStats.source == 1].sort_values('number', ascending=False)
for g1_firm in g1_df.head(3).index:
    fdf = dfFoc[dfFoc.represented_by == g1_firm]
    print(g1_firm, fdf.number.nunique(), fdf.source.unique()[0], fdf.submitted.min(), fdf.submitted.max())
print(g1_df.shape[0], 'firms', f'({g1_df.shape[0] / firmsStats.shape[0]:.2%})')
print(g1_df.number.sum(), 'cases', f'({g1_df.number.sum() / firmsStats.number.sum():.2%})')
print(g1_df.number.median(), 'median cases')

CitizenHawk, Inc. 923 FORUM 2007-09-27 00:00:00 2022-03-22 00:00:00
State Farm Mutual Automobile Insurance Company 640 FORUM 2000-09-22 00:00:00 2024-05-08 00:00:00
The GigaLaw Firm 511 WIPO 2005-07-14 00:00:00 2024-06-05 00:00:00
9652 firms (93.19%)
34547 cases (43.52%)
1.0 median cases


In [4]:
# G2: the one-timers
g2_out_arr = []
failed_purchase = 0

for firm, cases in firmsStats[firmsStats.source > 1].number.to_dict().items():
    history = []
    choices = dfFoc[dfFoc.represented_by == firm].sort_values('submitted').source.values
    decisions = dfFoc[dfFoc.represented_by == firm].sort_values('submitted').status.values
    cur_val = choices[0]
    repeats = 0
    switches = []
    firm_failed_purchase = False
    for idx, choice in enumerate(choices):
        if choice == cur_val:
            repeats += 1
        else:
            # dump the previous result
            history += [(cur_val, repeats, f"{repeats / cases:.2%}")]
            switches += [cur_val]
            repeats = 1
            cur_val = choice

            firm_failed_purchase |= (decisions[idx] == 'Denied')
            # at the end
    history += [(cur_val, repeats, f"{repeats / cases:.2%}")]
    switches += [cur_val]
    if len(history) <= len(set(choices)) + 1 and any(vv[1] == 1 for vv in history[:-1]) and not all(vv[1] == 1 for vv in history):
        g2_out_arr += [(firm, history, cases, '->'.join(switches))]
        failed_purchase += firm_failed_purchase
g2_df = pd.DataFrame(g2_out_arr, columns = ['firm', 'history', 'number', 'switches'])
print(g2_df.shape[0], 'firms', f'({g2_df.shape[0] / firmsStats.shape[0]:.2%})')
print(g2_df.number.sum(), 'cases', f'({g2_df.number.sum() / firmsStats.number.sum():.2%})')
print(g2_df.number.median(), 'median cases')
print(failed_purchase, 'with denied one-time purchases')
g2_df.sort_values('number', ascending=False).head(3)

196 firms (1.89%)
4806 cases (6.05%)
9.0 median cases
55 with denied one-time purchases


Unnamed: 0,firm,history,number,switches
166,Studio Barbero,"[(WIPO, 152, 39.18%), (FORUM, 1, 0.26%), (WIPO...",388,WIPO->FORUM->WIPO
36,"Christie, Parker & Hale LLP","[(FORUM, 247, 76.95%), (WIPO, 1, 0.31%), (FORU...",321,FORUM->WIPO->FORUM
117,Novagraaf,"[(WIPO, 201, 77.91%), (CAC, 1, 0.39%), (WIPO, ...",258,WIPO->CAC->WIPO


In [5]:
## G3: Brand switchers (we also create the array for the forum shoppers)

g1_g2 = set().union(g1_df.index, g2_df.firm)

fl_ctr = set()
forum_shoppers = set()

fail_moves_arr = []
g3_out_arr = []
for firm, cases in firmsStats[firmsStats.source > 1].number.to_dict().items():
        ss = set()
        if firm in g1_g2:
             continue
        history = []
        choices = dfFoc[dfFoc.represented_by == firm].sort_values('submitted').source.values
        decisions = dfFoc[dfFoc.represented_by == firm].sort_values('submitted').status.values
        cur_val = choices[0]
        repeats = 0
        switches = []
        firm_first_fail = False
        for idx, choice in enumerate(choices):
            if choice == cur_val:
                repeats += 1
            else:
                if idx > 0:
                    ss.update([decisions[idx - 1 ]])
                # dump the previous result
                history += [(cur_val, repeats, f"{repeats / cases:.2%}")]
                switches += [cur_val]
                repeats = 1
                cur_val = choice

                firm_first_fail |= idx > 0 and decisions[idx - 1] == 'Denied'
                if firm_first_fail:
                    fail_moves_arr += [(firm, idx )]

                    forum_shoppers.add(firm)
                    firm_first_fail = False

                    if all(dec != 'Denied' for dec in decisions[ :idx - 1]):
                        fl_ctr.add(firm)
        history += [(cur_val, repeats, f"{repeats / cases:.2%}")]
        switches += [cur_val]
        if len(history) == len(set(choices)):
            g3_out_arr += [(firm, history, cases, '->'.join(switches))]

g3_df = pd.DataFrame(g3_out_arr, columns = ['firm', 'history', 'number', 'switches'])
print(g3_df.shape[0], 'firms', f'({g3_df.shape[0] / firmsStats.shape[0]:.2%})')
print(g3_df.number.sum(), 'cases', f'({g3_df.number.sum() / firmsStats.number.sum():.2%})')
print(g3_df.number.median(), 'median cases')
g3_df.sort_values('number', ascending=False).head(3)

218 firms (2.10%)
1801 cases (2.27%)
4.0 median cases


Unnamed: 0,firm,history,number,switches
19,Bodman PLC,"[(WIPO, 87, 94.57%), (FORUM, 5, 5.43%)]",92,WIPO->FORUM
45,"Crowell & Moring, LLP","[(WIPO, 77, 84.62%), (FORUM, 14, 15.38%)]",91,WIPO->FORUM
93,Istanbul Patent & Trademark Consultancy Ltd.,"[(FORUM, 2, 2.30%), (WIPO, 85, 97.70%)]",87,FORUM->WIPO


In [6]:
g1_g3 = set().union(g1_df.index, g2_df.firm, g3_df.firm)


g4_out_arr = []
for firm, cases in firmsStats[firmsStats.source > 1].number.to_dict().items():
        if firm in g1_g3:
             continue
        history = []
        choices = dfFoc[dfFoc.represented_by == firm].sort_values('submitted').source.values

        cur_val = choices[0]
        repeats = 0
        switches = []
        for choice in choices:
            if choice == cur_val:
                repeats += 1
            else:
                # dump the previous result
                history += [(cur_val, repeats, f"{repeats / cases:.2%}")]
                switches += [cur_val]
                repeats = 1
                cur_val = choice
                # at the end
        history += [(cur_val, repeats, f"{repeats / cases:.2%}")]

        switches += [cur_val]
        g4_out_arr += [(firm, history, cases, '->'.join(switches))]
g4_df = pd.DataFrame(g4_out_arr, columns = ['firm', 'history', 'number', 'switches'])
print(g4_df.shape[0], 'firms', f'({g4_df.shape[0] / firmsStats.shape[0]:.2%})')
print(g4_df.number.sum(), 'cases', f'({g4_df.number.sum() / firmsStats.number.sum():.2%})')
print(g4_df.number.median(), 'median cases')
g4_df.sort_values('number', ascending=False).head(3)

291 firms (2.81%)
38230 cases (48.16%)
48.0 median cases


Unnamed: 0,firm,history,number,switches
187,NAMESHIELD S.A.S.,"[(WIPO, 30, 1.14%), (CAC, 4, 0.15%), (WIPO, 1,...",2631,WIPO->CAC->WIPO->CAC->WIPO->CAC->WIPO->CAC->WI...
35,CSC Digital Brand Services Group AB,"[(WIPO, 60, 3.18%), (FORUM, 1, 0.05%), (WIPO, ...",1889,WIPO->FORUM->WIPO->FORUM->WIPO->FORUM->WIPO->F...
68,Dreyfus & associes,"[(WIPO, 134, 10.77%), (CAC, 1, 0.08%), (WIPO, ...",1244,WIPO->CAC->WIPO->CAC->WIPO->FORUM->WIPO->CAC->...


### Train the logistic regression models to study forum shopping

In [7]:
reprDf = pd.concat([dfFoc[dfFoc.represented_by.isin(g4_df.firm)], dfFoc.assign(represented_by = "ALL")])
reprDf['quarter'] = reprDf.submitted.dt.to_period('Q')

## estimate historical win rates and processing speeds for legal representatives
perf_arr = [(
    source, date, firm, 
    valChunk[valChunk.status.isin({'Transferred', 'Cancelled'})].number.nunique() / valChunk.number.nunique(), 
    (valChunk.published - valChunk.submitted).dt.days.mean(), 
    (
        prevQuarter[prevQuarter.status.isin({'Transferred', 'Cancelled'})].number.nunique() / prevQuarter.number.nunique() 
        if not prevQuarter.empty else None
    ), 
    (prevQuarter.published - prevQuarter.submitted).dt.days.mean())
    for (source, date, quarter, firm) in tqdm(
        reprDf[['source', 'submitted', 'quarter', 'represented_by']].drop_duplicates().values
    ) if not (
        valChunk := reprDf[(reprDf.published <= date) & (reprDf.source == source) & (reprDf.represented_by == firm)]).empty and 
        (prevQuarter := valChunk[valChunk.quarter == quarter - 1]).shape[1]
]

perfDf = pd.DataFrame(perf_arr, columns=['provider', 'submitted', 'firm', 'past_success', 'past_delay', 'qtr_success', 'qtr_delay'])

  0%|          | 0/50863 [00:00<?, ?it/s]

In [8]:
# function to get the features for a firm. Missing values are filled from recent quarters, or overall statistics
def get_feature_vectors(firm: str, df: pd.DataFrame, refDf: pd.DataFrame = None):
    xDf = pd.merge(
        # a previous version of this had a ffill to fix the NaNs
            df[df.firm == firm].pivot(index = 'submitted', columns = 'provider', values = 'past_success').rename(columns = lambda x: f'past_success_{x}'),
            df[df.firm == firm].pivot(index = 'submitted', columns = 'provider', values = 'past_delay').rename(columns = lambda x: f'past_delay_{x}'),
            left_index=True,
            right_index=True
        ).merge(
            pd.merge(
                df[df.firm == firm].pivot(index = 'submitted', columns = 'provider', values = 'qtr_success').rename(columns = lambda x: f'qtr_success_{x}'),
                df[df.firm == firm].pivot(index = 'submitted', columns = 'provider', values = 'qtr_delay').rename(columns = lambda x: f'qtr_delay_{x}'),
                left_index=True,
                right_index=True
            ),
            right_index=True,
            left_index=True
    )
    if refDf is not None:
        xDf = xDf.merge(refDf, left_index=True, right_on = 'submitted')

        return xDf.fillna({
            col: xDf['_'.join(col.split('_')[:-1])]
            for col in xDf.columns if len(col.split('_')) > 2
        }).drop(columns = ['past_success', 'past_delay', 'qtr_success', 'qtr_delay'])
    return xDf if not xDf.empty else get_feature_vectors('ALL', df, refDf)

In [9]:
with open('../resources/countries_to_regions.json', 'r') as fin:
    countriesToRegion = json.load(fin)

allStats = get_feature_vectors("ALL", perfDf)

sx_vals = []
lc_vals = []
scores = []
game_scores = []
preds_stats_arr = []

conf_intval = 0.05
corrs = []

dfFoc['n_domains'] = dfFoc.domains.apply(len)
dfFoc['n_panelists'] = dfFoc.panelists.apply(len)
for firm, chunk in dfFoc[(dfFoc.represented_by.isin(g4_df.firm)) & (dfFoc.submitted >= "2020-01-01")].groupby('represented_by'):
    dd = chunk[['submitted', 'n_domains', 'n_panelists', 'complainants_countries', 'respondents_countries', 'source']].copy()

    if dd.shape[0] <= 10: 
        continue

    dd['C_IN_N_AMER'] = dd.complainants_countries.apply(lambda x: any(el in countriesToRegion['North America'] for el in x))
    dd['R_IN_N_AMER'] = dd.respondents_countries.apply(lambda x: any(el in countriesToRegion['North America'] for el in x))
    dd['C_IN_EUROPE'] = dd.complainants_countries.apply(lambda x: any(el in countriesToRegion['Europe'] for el in x))
    dd['R_IN_EUROPE'] = dd.complainants_countries.apply(lambda x: any(el in countriesToRegion['Europe'] for el in x))
    dd['C_ELSEWHERE'] = dd.complainants_countries.apply(lambda x: any(el in countriesToRegion['Elsewhere'] for el in x))
    dd['R_ELSEWHERE'] = dd.complainants_countries.apply(lambda x: any(el in countriesToRegion['Elsewhere'] for el in x))

    dd['one_panelist'] = dd.n_panelists == 1

    dd.drop(columns = ['complainants_countries', 'respondents_countries', 'n_panelists'], inplace = True)

    provStats = get_feature_vectors(firm, perfDf)
    mcols = allStats.columns

    mlDf = provStats.merge(allStats, left_index=True, right_index=True, how = 'outer', suffixes = [None, '_all']).fillna(method = 'ffill')
    
    mlDf = mlDf.fillna({
        col: mlDf[f'{col}_all'] for col in set(provStats.columns).intersection(mcols)
    })[mcols].fillna({
        col: 90 if 'delay' in col else 0 for col in mcols
    }).merge(dd, left_index = True, right_on = 'submitted')

    feature_cols = [
        'C_IN_N_AMER', 'C_IN_EUROPE', 'C_ELSEWHERE', 'R_IN_N_AMER', 
        'R_IN_EUROPE', 'R_ELSEWHERE', 'one_panelist', 'n_domains'
    ]
    mlDfRed = mlDf.loc[:, sum([[f'qtr_success_{prov}', f'qtr_delay_{prov}'] for prov in mlDf.source.unique()], []) + feature_cols + ['source']]


    # if the representative never used FORUM, remove the AMERICA's features
    if 'FORUM' not in mlDfRed.source.unique():
        mlDfRed.loc[mlDfRed.C_IN_N_AMER, 'C_ELSEWHERE'] = True
        mlDfRed.loc[mlDfRed.R_IN_N_AMER, 'R_ELSEWHERE'] = True
        mlDfRed.drop(columns = ['C_IN_N_AMER', 'R_IN_N_AMER'], inplace = True)

    for source in mlDfRed.source.unique():
        mlDfRed.loc[:, f'qtr_delay_{source}'] = np.log(mlDfRed[f'qtr_delay_{source}'])

    train_df = mlDfRed.head(floor(0.8 * mlDfRed.shape[0]))
    test_df = mlDfRed.tail(mlDfRed.shape[0] - train_df.shape[0])

    if len(set(train_df.source)) >= 2:
        lr = LogisticRegression(max_iter=1000000)
        lr = lr.fit(train_df.drop(columns = 'source'), train_df.source)
        preds = lr.predict(test_df.drop(columns = ['source']))
        fs = f1_score(test_df.source, preds, average = 'weighted')
        acc = accuracy_score(test_df.source, preds)

        scores += [fs]

        featVals = {}

        probs = lr.predict_proba(mlDfRed.drop(columns = 'source'))
        if any(tt.pvalue < conf_intval and tt.statistic > 0 for idx, provider in enumerate(lr.classes_) if 
               (tt := pearsonr(mlDf[f'past_success_{provider}'], [rr[idx] for rr in probs]))
            ):
            forum_shoppers.add(firm)
            corrs.append(tt.statistic)

print(f'Median F1-score: {np.median(scores):.2%}')

Median F1-score: 89.04%


In [10]:
print(f'{dfFoc[dfFoc.represented_by.isin(forum_shoppers)].number.nunique() / dfFoc.number.nunique():.2%} of disputes likely affected by forum shopping')

33.01% of disputes likely affected by forum shopping
