In [1]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
import spacy
from dask import delayed, compute
from dask.distributed import Client

In [2]:
corpus = pd.read_json('indian_foreign_policy_text.json')

In [3]:
corpus = corpus.loc[(corpus['office'] != 'President') &
                   corpus['year'].isin(list(range(1948, 2020)))]

In [4]:
stopwords = pd.read_csv('stopwords.csv')['stopwords'].to_list()

In [325]:
ccodes = pd.read_csv('ccodes.csv')

In [326]:
ccodes = ccodes.filter(regex = '\.en')

In [320]:
ccodes['country.name.en.regex'] = ccodes['country.name.en.regex'].str.replace('\\', '\\\\')

  """Entry point for launching an IPython kernel.


In [321]:
ccodes['country.name.en.regex'] = '(' + ccodes['country.name.en.regex'] + '){i<=5,d<=5,e<=5}'

In [327]:
ccode_dict = dict(zip(ccodes['country.name.en.regex'], ccodes['country.name.en']))

In [323]:
ccode_dict 

{'(afghan){i<=5,d<=5,e<=5}': 'Afghanistan',
 '(^[å|a]land){i<=5,d<=5,e<=5}': 'Åland Islands',
 '(albania){i<=5,d<=5,e<=5}': 'Albania',
 '(algeria){i<=5,d<=5,e<=5}': 'Algeria',
 '(^(?=.*americ).*samoa){i<=5,d<=5,e<=5}': 'American Samoa',
 '(andorra){i<=5,d<=5,e<=5}': 'Andorra',
 '(angola){i<=5,d<=5,e<=5}': 'Angola',
 '(anguill?a){i<=5,d<=5,e<=5}': 'Anguilla',
 '(antarctica){i<=5,d<=5,e<=5}': 'Antarctica',
 '(antigua){i<=5,d<=5,e<=5}': 'Antigua & Barbuda',
 '(argentin){i<=5,d<=5,e<=5}': 'Argentina',
 '(armenia){i<=5,d<=5,e<=5}': 'Armenia',
 '(^(?!.*bonaire).*\\\\baruba){i<=5,d<=5,e<=5}': 'Aruba',
 '(australia){i<=5,d<=5,e<=5}': 'Australia',
 '(^(?!.*hungary).*austria|\\\\baustri.*\\\\bemp){i<=5,d<=5,e<=5}': 'Austria',
 '(austria-hungary){i<=5,d<=5,e<=5}': 'Austria-Hungary',
 '(azerbaijan){i<=5,d<=5,e<=5}': 'Azerbaijan',
 '(baden){i<=5,d<=5,e<=5}': 'Baden',
 '(bahamas){i<=5,d<=5,e<=5}': 'Bahamas',
 '(bahrain){i<=5,d<=5,e<=5}': 'Bahrain',
 '(bangladesh|^(?=.*east).*paki?stan){i<=5,d<=5,e<=

In [10]:
# spacy.require_gpu()
nlp = spacy.load('en_core_web_md')

In [11]:
nlp.max_length = max(corpus['text'].str.len())

In [12]:
nlp.add_pipe('merge_entities')

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [13]:
def include_token(x, stopwords):
    latin_character = x.is_ascii
    not_punctuation = not x.is_punct 
    not_digit = not x.is_digit
    not_stopword = x.lemma_ not in stopwords
    length_over_one = (len(x.lemma_) > 1)
    include = latin_character and not_punctuation and not_digit and not_stopword and length_over_one
    return include

In [15]:
def custom_tokenizer(doc, stopwords):
    doc = nlp(doc, disable = ['parser'])
    tokens = [x.lemma_ for x in doc if include_token(x, stopwords)]
    return tokens

In [261]:
docs = nlp.pipe(corpus['text'], batch_size = 400, disable = ['parser'], n_process = 8)

In [17]:
# client = Client(n_workers = 4)

In [19]:
tokenized_docs = [custom_tokenizer(doc, stopwords) for doc in docs]

In [252]:
spacy.explain('GPE')

'Countries, cities, states'

In [262]:
vocab = []

In [263]:
for doc in docs:
    for token in doc:
        if token.ent_type_ == 'GPE':
            vocab.append(token.lemma_)

In [276]:
vocab = list(set(vocab))

In [277]:
vocab

['Badan',
 'Comoros',
 'Gaza strip',
 'the U. S. Administration',
 'Indo Africa',
 'Alexandria',
 'India Thailand',
 'Phnom Penh',
 'Veracruz',
 'Costa  \n\nRica',
 'Masjid',
 'Mauritian Republic',
 'the Democratic Republic of Afghanistan',
 'the Republic of Oriental del Uruguay',
 'South Asia',
 'post-kyoto',
 'Prime mini14th',
 'Shahjehanabad',
 'Wangari Maathai',
 'the Republice of Venezuela',
 'Methacillin',
 'Peopleâ\x80\x99s',
 'north versus',
 'Sidiropoulos',
 'Arab Republic',
 'Vishakapatnam',
 'Â\x90',
 'Iccland',
 'Satyagrah',
 'Dalit',
 'the Republic of Botswana',
 'the Centre of russian Glory',
 'Handpumps',
 'Wexford',
 'Radioisopes',
 'the Centre for Rural and Industrial Development Studies',
 'Lichtenstein',
 'Rangpur',
 'the United States America',
 'Ladakh',
 'Fredericton',
 'Bicholim',
 'the People\x92s Republic of Bangladesh',
 'Chavan',
 'Badshah Khan',
 'Locomotives',
 'Shantiniketan',
 'Trivandrum',
 'Tehran(Iran',
 'Kirti',
 'Joint Indo-British',
 'South \n\n    

In [333]:
import pycountry

In [336]:
[{c.name:token for token in vocab if c.name in token} for c in pycountry.countries]

[{'Aruba': 'Aruba'},
 {'Afghanistan': 'Indo-Afghanistan'},
 {'Angola': 'the Peoples Republic of Angola'},
 {'Anguilla': 'Anguilla'},
 {},
 {'Albania': 'Albania'},
 {},
 {'United Arab Emirates': 'Indo-United Arab Emirates Joint Commission on Economic, scientific and Technical Cooperation'},
 {'Argentina': 'initiative\x97Argentina'},
 {'Armenia': 'the Republic of Armenia'},
 {},
 {'Antarctica': 'Antarctica'},
 {},
 {},
 {'Australia': 'the Republic of Australia'},
 {'Austria': 'Austria'},
 {'Azerbaijan': 'Azerbaijan Republic'},
 {'Burundi': 'Burundi'},
 {'Belgium': 'Belgium'},
 {'Benin': 'Benin'},
 {},
 {'Burkina Faso': 'Burkina Faso'},
 {'Bangladesh': 'Bangladesh Meghalaya'},
 {'Bulgaria': 'Indo-Bulgaria Joint Press Statement'},
 {'Bahrain': 'the Kingdom of Bahrain for'},
 {'Bahamas': 'Bahamas'},
 {'Bosnia and Herzegovina': 'Bosnia and Herzegovina on'},
 {},
 {'Belarus': 'Belarus'},
 {'Belize': 'Belizean'},
 {'Bermuda': 'Bermuda'},
 {},
 {'Brazil': 'the Republic of Indiaand the Governmen

In [330]:
pd.Series(vocab).replace('\\s{2,}', ' ', regex = True).replace(ccode_dict, regex = True).to_list()

['Badan',
 'Comoros',
 'Gaza strip',
 'the U. S. Administration',
 'Indo Africa',
 'Alexandria',
 'India Thailand',
 'Phnom Penh',
 'Veracruz',
 'Costa Rica',
 'Masjid',
 'Mauritian Republic',
 'the Democratic Republic of Afghanistan',
 'the Republic of Oriental del Uruguay',
 'South Asia',
 'post-kyoto',
 'Prime mini14th',
 'Shahjehanabad',
 'Wangari Maathai',
 'the Republice of Venezuela',
 'Methacillin',
 'Peopleâ\x80\x99s',
 'north versus',
 'Sidiropoulos',
 'Arab Republic',
 'Vishakapatnam',
 'Â\x90',
 'Iccland',
 'Satyagrah',
 'Dalit',
 'the Republic of Botswana',
 'the Centre of Russian Glory',
 'Handpumps',
 'Wexford',
 'Radioisopes',
 'the Centre for Rural and Industrial Development Studies',
 'Lichtenstein',
 'Rangpur',
 'the United States America',
 'Ladakh',
 'Fredericton',
 'Bicholim',
 'the People\x92s Republic of Bangladesh',
 'Chavan',
 'Badshah Khan',
 'Locomotives',
 'Shantiniketan',
 'Trivandrum',
 'Tehran(Iran',
 'Kirti',
 'Joint Indo-British',
 'South Africa',
 'Cr

In [None]:
# tokenized_docs = compute(*[delayed(custom_tokenizer)(doc, country_dict, stopwords) for doc in docs])

In [None]:
# client.close()

In [None]:
# 

Replace country names using regex and combine multiword entities identified by spaCy.

Remove digits, long underscores, custom stopword list, words in only one document, words that appear fewer than five times, words with one character.

In [None]:
# dtm = CountVectorizer()