In [1]:
import os
import glob
import requests 
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup

### Data Preparation

##### Scrape CPI search results for GAFAM 

Total number of search pages per platform
* Google = 415 pages
* Amazon = 188 pages
* Facebook = 206 pages
* Apple = 229 pages
* Microsoft = 107 pages

In [4]:
def parse_cpi(platform, pages):
    links = list()
    data = []
    counter = 0

    for i in range(1, pages+1): 
        r = requests.get("https://www.competitionpolicyinternational.com/page/" + str(i) + "/?s=" + str(platform))
        soup = BeautifulSoup(r.content)
        temp = soup.select("h3.entry-title.td-module-title")
        links.extend(temp)

    print("Total no. of links: " + str(len(links)))

    for link in links: 
        r2 = requests.get(link.a["href"])
        link_soup = BeautifulSoup(r2.content)
        
        try: 
            data.append({"date": link_soup.select("time.entry-date.updated.td-module-date")[0].text,
                         "title": link_soup.select("h1.entry-title")[0].text,
                         "text": link_soup.select("div.td-post-content")[0].text,
                         "company": platform})
        except IndexError:
            pass

        counter += 1
        if (counter) % 100 == 0:
            print("No. links scraped: " + str(counter))

    raw = pd.DataFrame.from_dict(data)
    raw.to_csv('data/raw/' + platform + '_raw.csv')

In [57]:
# Microsoft
parse_cpi("Microsoft", 107)

In [76]:
# Apple 
parse_cpi("Apple", 229)

Total no. of links: 2287
No. links scraped: 100
No. links scraped: 200
No. links scraped: 300
No. links scraped: 400
No. links scraped: 500
No. links scraped: 600
No. links scraped: 700
No. links scraped: 800
No. links scraped: 900
No. links scraped: 1000
No. links scraped: 1100
No. links scraped: 1200
No. links scraped: 1300
No. links scraped: 1400
No. links scraped: 1500
No. links scraped: 1600
No. links scraped: 1700
No. links scraped: 1800
No. links scraped: 1900
No. links scraped: 2000
No. links scraped: 2100
No. links scraped: 2200


In [61]:
# Facebook
parse_cpi("Facebook", 206)

In [60]:
# Amazon
parse_cpi("Amazon", 188)

In [74]:
# Google
parse_cpi("Google", 415)

Total no. of links: 1145
No. links scraped: 100
No. links scraped: 200
No. links scraped: 300
No. links scraped: 400
No. links scraped: 500
No. links scraped: 600
No. links scraped: 700
No. links scraped: 800
No. links scraped: 900
No. links scraped: 1000
No. links scraped: 1100


##### Merge raw data for all platforms

In [355]:
import re

In [359]:
files = os.path.join('data/raw/', "*.csv")
files = glob.glob(files)

df = pd.concat(map(pd.read_csv, files), ignore_index=True)
df = df.drop(df.columns[0], axis = 1)

In [357]:
len(df)

11367

In [367]:
# Remove unrelated paragraphs (e.g. Read More: & Related:)
pattern = "[\w\s]+:[\w\s]+[^a-zA-Z\d\s]+[\w\s]+\\n"
df['text'] = [re.sub(pattern, " ", x) for x in df['text']]

# Remove white spaces
df['text'] = [x.replace("\n", "") for x in df['text']]
df.head()


Unnamed: 0,date,title,text,company
0,"March 25, 2021",Facebook’s Zuckerberg Proposes Changes To Sect...,"This week, Facebook CEO Mark Zuckerberg pro...",Google
1,"March 24, 2021",Antitrust in a Digital World: Does It Work? – ...,"Below, we have provided the full transcript ...",Google
2,"March 22, 2021",US House Antitrust Chairman Has New Big Tech B...,"Democratic Representative David Cicilline, ...",Google
3,"March 22, 2021",Biden Nominates Antitrust Expert Lina Khan For...,US President Joe Biden intends to nominate ...,Google
4,"March 19, 2021",Facebook May Face New Antitrust Probe In UK,Britain’s competition regulator is set to b...,Google


##### NER to extract countries from title

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

In [368]:
import spacy
from spacy import displacy
import pycountry
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [369]:
nlp_sm = spacy.load("en_core_web_sm") # smaller pipeline
nlp_lg = spacy.load("en_core_web_lg")
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [370]:
legend = pd.read_csv("data/others/country_nationality_list.csv")
legend = legend.set_index('nationality').to_dict()['country']

In [371]:
# extract info from string         
def extract(ner_function, pipeline, texts):
    list = []
    for text in texts:
        raw = str(ner_function(pipeline, text))
        list.append(str(raw))            
    return (list)

In [372]:
# ner get countries and nationalities
def get_countries(pipeline, texts):
    doc = pipeline(texts)
    for entity in doc.ents:
        if entity.label_ == 'GPE': # country, city, state
            if entity.text in legend.values(): # keep if in country list
                return lm.lemmatize(entity.text)
            else: 
                return None            
        elif entity.label_ == 'NORP': # nationality 
            if entity.text in legend.keys(): # match to country
                return legend[entity.text]
            else:
                 return None              
        else: 
            return None

In [373]:
# check title & text for values
def check_both(list1, list2):
    list = []
    for i, j in zip(list1, list2):
        if j != 'None':
            list.append(j)
        else:
            list.append(i)
    return list

In [374]:
titles = df['title']
texts = df['text']

c_title = extract(get_countries, nlp_lg, titles)
c_text = extract(get_countries, nlp_lg, texts)

countries = check_both(c_title, c_text)

# titles that mention cities instead of countries not captured

In [376]:
countries.count('None')

7023

##### NER to extract fines from text

In [377]:
pattern = "[^a-zA-Z\d\s:]?\d*\.?\d*\-?\s?\w+illion$"

# ner get fines
def get_fines(pipeline, texts):
    doc = pipeline(texts)
    for entity in doc.ents:
        if entity.label_ == 'MONEY': 
            match = re.findall(pattern, entity.text)
            return match
        else: 
            return None

In [378]:
f_title = extract(get_fines, nlp_lg, titles)
f_text = extract(get_fines, nlp_lg, texts)

fines = check_both(f_title, f_text)

In [379]:
fines.count('None')

11281

In [380]:
np.unique(fines)

array(['None', "['$1 billion']", "['$1 million']", "['$1.36 billion']",
       "['$1.7 billion']", "['$1.8 billion']", "['$100 million']",
       "['$15 billion']", "['$16 billion']", "['$16.48 million']",
       "['$17 million']", "['$19 billion']", "['$25 billion']",
       "['$26 billion']", "['$3 billion']", "['$3.84 billion']",
       "['$300 billion']", "['$324.5 million']", "['$37 billion']",
       "['$375 million']", "['$4 billion']", "['$4.5 billion']",
       "['$415 million']", "['$49 billion']", "['$5 billion']",
       "['$50 million']", "['$600 million']", "['$69 Billion']",
       "['$7.4 billion']", "['$774 million']", "['£22 billion']",
       "['£4-trillion']", "['€1.49 billion']", "['€3 billion']", '[]'],
      dtype='<U18')

##### Clean new dataset

In [6]:
from datetime import datetime

In [None]:
country_fines = pd.DataFrame.from_dict({'country':countries, 'fine':fines})
df_clean = pd.concat([df, country_fines], axis=1)
df_clean.head()

In [50]:
# remove unavailable articles
unavail = "THIS ARTICLE IS NOT AVAILABLE FOR IP ADDRESS"
df_clean = df_clean[df_clean["text"].str.contains(unavail) == False]

# keep only articles that mention GAFAM
platforms = ["Google", "Amazon", "Facebook", "Apple", "Microsoft"]
df_clean = df_clean[df_clean["text"].str.contains('|'.join(platforms))]

# format date variable
df_clean['date'] = pd.to_datetime(df_clean['date'])
df_clean['year'] = [x.year for x in df_clean['date']]

df_clean = df_clean.drop(df_clean.columns[0], axis = 1)

In [53]:
df_clean.head()

Unnamed: 0,date,title,text,company,country,fine,year
0,2021-03-25,Facebook’s Zuckerberg Proposes Changes To Sect...,"This week, Facebook CEO Mark Zuckerberg pro...",Google,,,2021
1,2021-03-22,US House Antitrust Chairman Has New Big Tech B...,"Democratic Representative David Cicilline, ...",Google,,,2021
2,2021-03-22,Biden Nominates Antitrust Expert Lina Khan For...,US President Joe Biden intends to nominate ...,Google,US,,2021
3,2021-03-19,Facebook May Face New Antitrust Probe In UK,Britain’s competition regulator is set to b...,Google,UK,,2021
4,2021-03-16,Facebook Agrees To Pay News Corp For Content I...,News Corp has struck a three-year deal to pro...,Google,Australia,,2021


In [54]:
len(df_clean)

7608

In [55]:
df_clean.to_csv("data/df_clean.csv")

### Topic modeling with BERT

In [225]:
text = list(df_clean['text'])

Word embeddings from distilbert

In [226]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(text, show_progress_bar=True)

Batches: 100%|██████████| 238/238 [07:03<00:00,  1.78s/it]


Reduce dimension of embeddings & cluster similar documents

In [227]:
import umap

umap_embeddings = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [228]:
import hdbscan

cluster = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom').fit(umap_embeddings)

Derive topics

In [229]:
docs_df = pd.DataFrame(text, columns = ['text'])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'text': ' '.join})

In [231]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
tf_idf, count = c_tf_idf(docs_per_topic.text.values, m=len(text))

In [232]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .text
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "text": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)



Unnamed: 0,Topic,Size
0,-1,2813
82,81,793
106,105,371
77,76,168
1,0,110
73,72,96
99,98,92
108,107,83
105,104,79
28,27,74


In [233]:
top_n_words[81][:10]

[('deal', 0.009761933632152565),
 ('billion', 0.007693816306387451),
 ('acquisition', 0.005651891442568295),
 ('announced', 0.004868824607617178),
 ('cloud', 0.004733477069163744),
 ('buy', 0.004469146807368187),
 ('approval', 0.00426996103415094),
 ('walmart', 0.004098650763737676),
 ('microsoft', 0.003919413346215811),
 ('activision', 0.0038622241087137646)]

In [234]:
top_n_words[105][:10]

[('judge', 0.011200788119330654),
 ('iphone', 0.010738869558496393),
 ('cote', 0.010507861991163422),
 ('epic', 0.008570872640576915),
 ('ebooks', 0.008569484704532525),
 ('bromwich', 0.006881275695260066),
 ('store', 0.006876899357315192),
 ('ruling', 0.006581954820400608),
 ('developers', 0.006431557401341426),
 ('app', 0.006175678419772144)]

In [235]:
top_n_words[76][:10]

[('australian', 0.02084943203893733),
 ('accc', 0.019020900445906645),
 ('australia', 0.017932806241640478),
 ('code', 0.017043954788835934),
 ('inquiry', 0.008977780322464416),
 ('media', 0.00887345312456362),
 ('sims', 0.008389804393621966),
 ('publishers', 0.007958226487556248),
 ('au', 0.00793549621143133),
 ('mandatory', 0.006872204682367001)]