In [3]:
import os
import glob
import requests 
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup

### Data Preparation

Scrape CPI search results for GAFAM 

Total number of search pages per platform
* Google = 415 pages
* Amazon = 188 pages
* Facebook = 206 pages
* Apple = 229 pages
* Microsoft = 107 pages

In [4]:
def parse_cpi(platform, pages):
    links = list()
    data = []
    counter = 0

    for i in range(1, pages+1): 
        r = requests.get("https://www.competitionpolicyinternational.com/page/" + str(i) + "/?s=" + str(platform))
        soup = BeautifulSoup(r.content)
        temp = soup.select("h3.entry-title.td-module-title")
        links.extend(temp)

    print("Total no. of links: " + str(len(links)))

    for link in links: 
        r2 = requests.get(link.a["href"])
        link_soup = BeautifulSoup(r2.content)
        
        try: 
            data.append({"date": link_soup.select("time.entry-date.updated.td-module-date")[0].text,
                         "title": link_soup.select("h1.entry-title")[0].text,
                         "text": link_soup.select("div.td-post-content")[0].text,
                         "company": platform})
        except IndexError:
            pass

        counter += 1
        if (counter) % 100 == 0:
            print("No. links scraped: " + str(counter))

    raw = pd.DataFrame.from_dict(data)
    raw.to_csv('data/raw/' + platform + '_raw.csv')

In [57]:
# Microsoft
parse_cpi("Microsoft", 107)

In [76]:
# Apple 
parse_cpi("Apple", 229)

Total no. of links: 2287
No. links scraped: 100
No. links scraped: 200
No. links scraped: 300
No. links scraped: 400
No. links scraped: 500
No. links scraped: 600
No. links scraped: 700
No. links scraped: 800
No. links scraped: 900
No. links scraped: 1000
No. links scraped: 1100
No. links scraped: 1200
No. links scraped: 1300
No. links scraped: 1400
No. links scraped: 1500
No. links scraped: 1600
No. links scraped: 1700
No. links scraped: 1800
No. links scraped: 1900
No. links scraped: 2000
No. links scraped: 2100
No. links scraped: 2200


In [61]:
# Facebook
parse_cpi("Facebook", 206)

In [60]:
# Amazon
parse_cpi("Amazon", 188)

In [74]:
# Google
parse_cpi("Google", 415)

Total no. of links: 1145
No. links scraped: 100
No. links scraped: 200
No. links scraped: 300
No. links scraped: 400
No. links scraped: 500
No. links scraped: 600
No. links scraped: 700
No. links scraped: 800
No. links scraped: 900
No. links scraped: 1000
No. links scraped: 1100


Merge raw data for all platforms

In [7]:
files = os.path.join('data/raw/', "*.csv")
files = glob.glob(files)

df = pd.concat(map(pd.read_csv, files), ignore_index=True)
df = df.drop(df.columns[0], axis = 1)

In [8]:
len(df)

11367

In [9]:
# remove white spaces
df['text'] = [x.replace("\n", "") for x in df['text']]
print(df.head())

             date                                              title  \
0  March 25, 2021  Facebook’s Zuckerberg Proposes Changes To Sect...   
1  March 24, 2021  Antitrust in a Digital World: Does It Work? – ...   
2  March 22, 2021  US House Antitrust Chairman Has New Big Tech B...   
3  March 22, 2021  Biden Nominates Antitrust Expert Lina Khan For...   
4  March 19, 2021        Facebook May Face New Antitrust Probe In UK   

                                                text company  
0  This week, Facebook CEO Mark Zuckerberg propos...  Google  
1  Below, we have provided the full transcript of...  Google  
2  Democratic Representative David Cicilline, cha...  Google  
3  US President Joe Biden intends to nominate Lin...  Google  
4  Britain’s competition regulator is set to begi...  Google  


NER to extract countries from title

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

In [10]:
import spacy
from spacy import displacy
import pycountry
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
nlp_sm = spacy.load("en_core_web_sm") # smaller pipeline
nlp_lg = spacy.load("en_core_web_lg")
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [13]:
legend = pd.read_csv("data/others/country_nationality_list.csv")
legend = legend.set_index('nationality').to_dict()['country']

In [93]:
# extract info from string         
def extract(ner_function, pipeline, texts):
    list = []
    for text in texts:
        raw = str(ner_function(pipeline, text))
        list.append(str(raw))            
    return (list)

In [94]:
# ner get countries and nationalities
def get_countries(pipeline, texts):
    doc = pipeline(texts)
    for entity in doc.ents:
        if entity.label_ == 'GPE': # country, city, state
            if entity.text in legend.values(): # keep if in country list
                return lm.lemmatize(entity.text)
            else: 
                return None            
        elif entity.label_ == 'NORP': # nationality 
            if entity.text in legend.keys(): # match to country
                return legend[entity.text]
            else:
                 return None              
        else: 
            return None

In [95]:
# check title & text for values
def check_both(list1, list2):
    list = []
    for i, j in zip(list1, list2):
        if j != 'None':
            list.append(j)
        else:
            list.append(i)
    return list

In [40]:
titles = df['title']
texts = df['text']

c_title = extract(get_countries, nlp_lg, titles)
c_text = extract(get_countries, nlp_lg, texts)

countries = check_both(c_title, c_text)

# titles that mention cities instead of countries not captured

In [48]:
countries.count('None')

7003

NER to extract fines from text

In [96]:
# ner get fines
def get_fines(pipeline, texts):
    doc = pipeline(texts)
    for entity in doc.ents:
        if entity.label_ == 'MONEY': 
            return entity.text
        else: 
            return None

In [97]:
f_title = extract(get_fines, nlp_lg, titles)
f_text = extract(get_fines, nlp_lg, texts)

fines = check_both(f_title, f_text)

In [98]:
fines.count('None')

11273

In [99]:
np.unique(fines)

array(['$1.1 billion', '$1.7 billion', '$1.8 billion', '$100 million',
       '$10B US', '$113M Settlement', '$14.7B Deal', '$16.48 million',
       '$17 million', '$19 billion', '$1B', '$2.1B', '$2.6B',
       '$209B Ad-Tech', '$25 billion', '$3.4B Deal', '$300 billion',
       '$300M Patent Verdict', '$324.5 million', '$37 billion',
       '$392M To Settle Privacy Suit With', '$4.5 billion',
       '$415 million', '$49 billion', '$5.1 billion',
       '$5B Antitrust Fine Is Flawed', '$5B Fine', '$600 million',
       '$69 Billion', '$7.4 billion', '$90M Privacy Settlement',
       '1.49-billion-euro', '1.75B', '1.9B', '15.7bn',
       '150 million euros', '2.3B', '24B', '30B', '39 million euros',
       '5.7B', '660,000', '7', '7.2B', 'A$1 billion',
       'Billions of Dollars', 'Multimillion-euro', 'None',
       'The estimated $3.84 billion', 'US$1.36 billion', 'US$16 billion',
       'US$26 billion', 'US$3 billion', 'US$375 million', 'US$4 billion',
       'US$4.7 billion', 'US$5 

### Topic modeling with BERT

In [21]:
text = df['text']

Word embeddings from distilbert

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(text, show_progress_bar=True)

Reduce dimension of embeddings & cluster similar documents

In [127]:
import umap

umap_embeddings = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)

In [128]:
import hdbscan

cluster = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom').fit(umap_embeddings)

Derive topics

In [129]:
docs_df = pd.DataFrame(text, columns = ['text'])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'text': ' '.join})

In [130]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
tf_idf, count = c_tf_idf(docs_per_topic.text.values, m=len(data))

In [131]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .text
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "text": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)



Unnamed: 0,Topic,Size
0,-1,427
2,1,238
4,3,101
3,2,61
6,5,48
7,6,45
5,4,40
8,7,21
1,0,19


In [141]:
top_n_words[1][:10]

[('activision', 0.006905657198766965),
 ('deal', 0.006687424129165262),
 ('acquisition', 0.006252071693264714),
 ('cloud', 0.005573176124872544),
 ('billion', 0.005512475628019427),
 ('game', 0.005421730655356037),
 ('sony', 0.005367325511335517),
 ('announced', 0.005360809389522881),
 ('blizzard', 0.0049641807917932814),
 ('gaming', 0.004566935387692416)]

In [143]:
top_n_words[3][:10]

[('ftc', 0.011437544970115957),
 ('india', 0.00926705088817797),
 ('future', 0.008121835373573348),
 ('whatsapp', 0.00698339887267274),
 ('retail', 0.006975661037723728),
 ('arbitration', 0.006559208660652846),
 ('judge', 0.006319017010858317),
 ('giphy', 0.006262419138645957),
 ('sellers', 0.006108051485879766),
 ('instagram', 0.006012050508269187)]

In [144]:
top_n_words[2][:10]

[('fine', 0.01569219398302305),
 ('russian', 0.01384387122347253),
 ('russia', 0.013556155332437116),
 ('fined', 0.013427234277430276),
 ('million', 0.012642551584673468),
 ('roubles', 0.01257520786215803),
 ('moscow', 0.011553007770666547),
 ('fines', 0.0113440511570926),
 ('cnil', 0.009983435399267129),
 ('cookies', 0.009569651411018636)]