In [141]:
import numpy as np
import pandas as pd
#import pippip.main(["install", "openpyxl"])

## Import Data

<b> Russian Ministry of Foreign Affairs Articles from May 2004 - Dec 2020 (Harvard Dataverse)

In [142]:
ru_mofa = pd.read_excel('data/Russia_MOFA_May2004-Dec2020_articles-only.xlsx')

In [143]:
ru_mofa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22212 entries, 0 to 22211
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   headline          22212 non-null  object        
 1   publication_date  22212 non-null  datetime64[ns]
 2   article_content   22212 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 520.7+ KB


<b> cut rows of articles before 2012

In [144]:
ru_mofa.drop( ru_mofa[ ru_mofa['publication_date'] < pd.Timestamp(2012,1,1) ].index, inplace=True)

In [145]:
ru_mofa.reset_index(drop=True)

Unnamed: 0,headline,publication_date,article_content
0,"Telephone conversation between Sergey Lavrov, ...",2012-01-10,"PRESS-RELEASE On January 10, a telephone conve..."
1,"Telephone Conversation Between S. Lavrov, Mini...",2012-01-11,"PRESS-RELEASE On January 11, S. Lavrov, Minist..."
2,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,"PRESS-RELEASE William Burns, United States Fir..."
3,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,PRESS-RELEASE Traditional meeting of Sergey La...
4,"Meeting of S. Lavrov, Minister of Foreign Affa...",2012-01-17,"PRESS-RELEASE On January 17, a working meeting..."
...,...,...,...
13564,Foreign Ministry statement on extending the Ru...,2020-12-29,"For a long period of time, the story about a ‚..."
13565,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-29,"Mr Minister,_x000D_\n_x000D_\nMy dear Mevlut,_..."
13566,Foreign Minister Sergey Lavrov‚Äôs interview w...,2020-12-30,Question: The pandemic has changed people‚Äôs ...
13567,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-30,"Mr Minister,_x000D_\n_x000D_\nColleagues,_x000..."


In [146]:
ru_mofa.publication_date.sort_values()

8643    2012-01-10
8644    2012-01-11
8645    2012-01-16
8646    2012-01-16
8647    2012-01-17
           ...    
22206   2020-12-29
22208   2020-12-29
22210   2020-12-30
22209   2020-12-30
22211   2020-12-30
Name: publication_date, Length: 13569, dtype: datetime64[ns]

In [147]:
#pd.set_option('display.max_colwidth', None)

## Data Cleaning

<b> Extract proper name from col Headline with NLTK

In [148]:

import nltk
from nameparser.parser import HumanName
nltk.download('maxent_ne_chunker')
nltk.download('words')


def get_human_names(text):
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary = False)
    person_list = []
    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1: #avoid grabbing lone surnames
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
            name = ''
        person = []
    return (person_list)

ru_mofa["speaker"] = ru_mofa["headline"].apply(get_human_names)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/adrianacuppuleri/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adrianacuppuleri/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [149]:
ru_mofa["speaker"].value_counts()

[]                                               7673
[Sergey Lavrov]                                   743
[Alexander Lukashevich]                           231
[John Kerry]                                      128
[Mikhail Bogdanov]                                 83
                                                 ... 
[Sergey Lavrov, Sudan Hasabo Abderrahman]           1
[Sergey Lavrov, Louise Mushikiwabo]                 1
[Sergey Lavrov, Arabia Adel, Ahmed Al-Jubeir]       1
[Mr. Mikhail]                                       1
[Armenian Foreign, Ara Ayvazyan]                    1
Name: speaker, Length: 2518, dtype: int64

<b> Replace empty space of the speaker column

In [150]:
def replace_empty_speaker():
    for index, row in ru_mofa[(ru_mofa['speaker'].str.len() == 0) & \
        (ru_mofa["headline"].str.contains('LAVROV|Lavrov|RUSSIAN MINISTER OF FOREIGN AFFAIRS|Russian Minister of Foreign Affairs|FOREIGN MINISTRY|Foreign Ministry|FOREIGN MINISTER SERGEY LAVROV|Foreign Minister Sergey Lavrov|SERGEY LAVROV|Sergey Lavrov|SERGEY LAVROV MEETS|Sergey Lavrov Meets|RUSSIAN MFA|Russian MFA|S. LAVROV|S. Lavrov|MINISTRY|Ministry'))].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Sergey Lavrov]"
    for index, row in ru_mofa[(ru_mofa['speaker'].str.len() == 0) & (ru_mofa["headline"].str.contains('Alexander Yakovenko|Spokesman|SPOKESMAN|ALEXANDER YAKOVENKO'))].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Alexander Yakovenko]"
    for index, row in ru_mofa[(ru_mofa['speaker'].str.len() == 0) & (ru_mofa["headline"].str.contains('Alexander Lukashevich|ALEXANDER LUKASHEVICH|OSCE|osce'))].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Alexander Lukashevich]"
    for index, row in ru_mofa[(ru_mofa['speaker'].str.len() == 0) & (ru_mofa["headline"].str.contains("DEPUTY MINISTER OF FOREIGN AFFAIRS|Deputy Minister of Foreign Affairs|DEPUTY FOREIGN AFFAIRS|Deputy Foreign Affairs|RUSSIAN DEPUTY MINISTER|Russian Deputy Minister|DEPUTY FOREIGN MINISTER|Deputy Foreign Minister"))].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Deputy Minister of Foreign Affairs]"

In [151]:
replace_empty_speaker()

In [152]:
ru_mofa.speaker.value_counts()

[Sergey Lavrov]                                  3666
[]                                               2660
[Deputy Minister of Foreign Affairs]             1221
[Sergey Lavrov]                                   743
[Alexander Lukashevich]                           231
                                                 ... 
[Sergey Lavrov, Sudan Hasabo Abderrahman]           1
[Sergey Lavrov, Louise Mushikiwabo]                 1
[Sergey Lavrov, Arabia Adel, Ahmed Al-Jubeir]       1
[Mr. Mikhail]                                       1
[Armenian Foreign, Ara Ayvazyan]                    1
Name: speaker, Length: 2522, dtype: int64

In [153]:
def clean_speaker():
    for index, row in ru_mofa[ru_mofa["headline"].str.contains('LAVROV|Lavrov|RUSSIAN MINISTER OF FOREIGN AFFAIRS|Russian Minister of Foreign Affairs|FOREIGN MINISTRY|Foreign Ministry|FOREIGN MINISTER SERGEY LAVROV|Foreign Minister Sergey Lavrov|SERGEY LAVROV|Sergey Lavrov|SERGEY LAVROV MEETS|Sergey Lavrov Meets|RUSSIAN MFA|Russian MFA|S. LAVROV|S. Lavrov|MINISTRY|Ministry')].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Sergey Lavrov]"
    for index, row in ru_mofa[ru_mofa["headline"].str.contains('Alexander Yakovenko|Spokesman|SPOKESMAN|ALEXANDER YAKOVENKO')].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Alexander Yakovenko]"
    for index, row in ru_mofa[ru_mofa["headline"].str.contains('A.K. Lukashevich|A.K. LUKASHEVICH|Alexander Lukashevich|ALEXANDER LUKASHEVICH|OSCE|osce')].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Alexander Lukashevich]"
    for index, row in ru_mofa[ru_mofa["headline"].str.contains("DEPUTY MINISTER OF FOREIGN AFFAIRS|Deputy Minister of Foreign Affairs|DEPUTY FOREIGN AFFAIRS|Deputy Foreign Affairs|RUSSIAN DEPUTY MINISTER|Russian Deputy Minister|DEPUTY FOREIGN MINISTER|Deputy Foreign Minister")].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Deputy Minister of Foreign Affairs]"
    for index, row in ru_mofa[ru_mofa['headline'].str.contains('Ambassador Alexey Borodavkin|Alexey Borodavkin|the Russian Federation to the United Nations')].iterrows():
        ru_mofa.loc[index, 'speaker'] = "[Alexey Borodavkin]"



In [154]:
clean_speaker()

In [155]:
ru_mofa.speaker.value_counts()

[Sergey Lavrov]                                                    6865
[Deputy Minister of Foreign Affairs]                               2789
[]                                                                 2640
[Alexander Lukashevich]                                             685
[Alexander Yakovenko]                                                23
                                                                   ... 
[Bert Koenders]                                                       1
[Thoonglun Sisulit]                                                   1
[Large Sergey]                                                        1
[Georgy Muradov]                                                      1
[Permanent Representative, Euratom Ambassador Vladimir Chizhov]       1
Name: speaker, Length: 418, dtype: int64

In [156]:
#rest of empty cell --> Russia MFA
for index, row in ru_mofa[(ru_mofa['speaker'].str.len() == 0)].iterrows():
    ru_mofa.loc[index, 'speaker'] = "[Russia MFA]"

In [157]:
#trasnform speaker into a string to modify it
ru_mofa['speaker'] = ru_mofa['speaker'].astype(str)

In [158]:
#everything that coould not be assigned as specific speaker
for index, row in ru_mofa[ru_mofa['speaker'].str.contains('Nuclear Weapons|Joint Statement|Human Rights|Arab Republic|Rossiya Segodnya|Arms Control|Paul Whelan|Augusto Santos|John Kerry|Alexey Navalny|Threats Ilya|Climate Change|Border Commission|Hugo Martinez|Rossiya Segodnya International|Edi Rama|Human Rights Nils|Kofi Annan|States Parties|Saman Weerasinghe|Lundeg Purevsuren|Sudan Barnaba Marial Benjamin|Thoonglun Sisulit|Bert Koenders|Outer Space Objects|Gush Etzion|Polish Ambassador|Khalid Bin Mohammed|Red Army|Strategic Arms|Mohammad Javad|Holy See|Svalbard Archipelago|Davit Dondua|St Petersburg International Economic Forum|East Jerusalem|Radovan Karadzic|Moldova Andrei|Border Crossing|Worship Susana|Daily Telegraphâ€œ|Luxembourg Minister|Global Trends|Nuclear Safety|Draft Resolution|Hor Namhong|Gabonese Minister|Aliaskhab Kebekov|Wang Yi|Normandy Meeting|Zambian Foreign|Fumio Kishida|Sheikh Abdullah|Zayed Al|Jens Stoltenberg|Nuclear Material|David Choquehuanca|Tedros Adhanom|Nova Makedonija|Rossiyskaya Gazeta|Global Initiative|Arbitrary Deprivation|Culture Centre|Jeff Shell|German Foreign|Syrian Ambassador|Burundi External Relations|Sir Simon Lawrance Gass|Political Director|Avigdor Liberman|Iyad Ameen Madani|Julie Bishop|St Petersburg|Political Dialogue|Nils MuiÅ¾nieks|Expatriates Walid Muallem|Joint Press Statement|Carlos Raul|Civil Societies|Atomic Energy|Munich Betrayal|Terrorism Act|Assembly Summit|Volgograd Region|Dilma Rousseff|Sergey Sevastyanov|James Monastery Mother Agnes Mariam|Akhlas Akhlaq|Defence Ministers Council|Crime Alexander Zmeyevsky|Dmitry Safonov|Other International Organisations|Old Jerusalem|Benin Aurelien|Pham Binh|Niger Ibrahim Yacoubou|Elmar Mammadyarov|Sigmar Gabriel|Radio Vesti|Ratko Mladic|Armenian Foreig|Erlan Abdyldaev|Lebanese Tourism|Idriss Jazairy|Dilgam Askerov|Ethnic Affairs Igor|Davor Ivo Stier|Geoffrey Onyeama|Alex Younger|Estonian Foreign|Toxin Weapons|Visa Formalities|Ertharin Cousin|Biometric Personal Data|Peter Szijjarto|Zambian Minister|Expert Council|Joint Russian|Main Human Freedoms|Mutual Cancelation|Elections Held|Three Pussy Riot|Vatican City|Eesti Ekspress|Mutual Travel|Denis Ronaldo Moncada|Marshal Ivan Konev|Road Initiative|Vladimir Makei|Joint Working Group|Light Weapons|Wind Jet|Saad Haririâ€™s|Strategic Partnership Agreement|Andreas Fryganas|Costa Rica|Anton Mazur|Honorary Archbishop|Isselkou|Astana Process|Craig Reedie|Mutual Reductions|Gas Exporting|Worship Jorge|Mount Agung|Mohamed Siala|Joint Centre|Milorad Dodik|Osman Saleh|Vologda Region|Arab League|Laotian Foreign|Kabul Process|Normandy Four|Donald Trump|Ukraine Gerardo|Vehicle Registration|Mongolian Minister|John Nicholson|Venezuela Delcy|Salahuddin Rabbani|Works Agency|Aslan Abashidze|Humanitarian|Caesar|Trade|Ri Su Yong|Burundi|Nicole Roussell|Karin Kneissl|Bolshoi Theatre|Magnitsky List|Anna News|Georgia|Nikkei|Thorbjorn|Young Diplomats|Saint Kitts|Edmond Mulet|auritius|Lithuanian Foreign|Sebastian Kurz|Alexander Pushkin|Arab Emirates|Your Memory|Shanghai Cooperation Organisation|John Tefft|Vygaudas Usackas|Her Son|Mutual Abolition|Peru R. Roncayolo|Ossetia D.|Antonio Guterres|Kirghizia E.|Luiz Alberto|Honduras Mireya AgÃ¼ero|Ban Ki-moon|Dmytro Yarosh|Law K.|Abu Dhabi|Charap|Sudan|Ambassador Extraordinary|Assembly Â|Lebanon T.|Caucasus F. Lefor|Strategic Confidence|Military Security|Air Berlin|Great Britain|Mr. Alexey Yu|Sri Lanka|Law K.|Ukraine Azamat|Euratom Ambassador Vladimir|Ahmed Al|Masis Mailyan|Burkina Faso|Black Sea Economic|Assembly Third Committee|Mutual Exchanges|Young Guard|Political Questions|Edith Bouvier|African|Illicit Arms|Nepali Minister|Human Development|Dialogue Forum|Permanent Representative|Andrei Kelin|Äôs|Richard Lugar Centre|David Hale|John Bolton|Da Nang|Special Presidential|Brunei Darussalam|Election Results|Regular Migration|Extremist Organisations|Simon Coveney|Nuclear|Arab|Liberator|Ambassador Cheng Jingye|Work Visit|Kirghiz|Deputy Chairman|Michel Kilo|Lantos Swett|Panama City|Sexual|Joint|Treaty|Jeremy Hunt|Qasem|Qatari|Mutual|Syrian|Venezuelan|Afghan|Great|Defence|Title|Public|Nikolai Bayev|Alexander Kuranov|Mohsen Fakhrizadeh')].iterrows():
    ru_mofa.loc[index, 'speaker'] = "[Russia MFA]"

In [159]:
#pd.options.display.max_rows = 500

In [160]:
ru_mofa['speaker'].value_counts()

[Sergey Lavrov]                         6865
[Russia MFA]                            3058
[Deputy Minister of Foreign Affairs]    2789
[Alexander Lukashevich]                  685
[Alexander Yakovenko]                     23
                                        ... 
['Grigory Berdennikov']                    1
['Dmitry Bely']                            1
['Andrei Nekrasov']                        1
['Alexander Gaponenko']                    1
['Mr. Vladimir Putin']                     1
Name: speaker, Length: 96, dtype: int64

In [161]:
#clean col speaker (only 2 elements)
ru_mofa["speaker"] = ru_mofa["speaker"].str.replace('Convention|Spokesperson|Alexander Kuranov|Permanent Delegate|Global Times|Ian Hill|Chemical Weapons|Ambassador|Spokeperson|Mr. |mr. |Deputy Head|Permanent Delagate|Special Representative|Chemical Weapons Convention|Rashid Alimov|Chikahito Harada', '')

  ru_mofa["speaker"] = ru_mofa["speaker"].str.replace('Convention|Spokesperson|Alexander Kuranov|Permanent Delegate|Global Times|Ian Hill|Chemical Weapons|Ambassador|Spokeperson|Mr. |mr. |Deputy Head|Permanent Delagate|Special Representative|Chemical Weapons Convention|Rashid Alimov|Chikahito Harada', '')


In [162]:
ru_mofa['speaker'].value_counts()

[Sergey Lavrov]                         6865
[Russia MFA]                            3058
[Deputy Minister of Foreign Affairs]    2789
[Alexander Lukashevich]                  685
[Alexander Yakovenko]                     23
                                        ... 
['Andrey Krutskikh']                       1
['Vasily Muravitsky']                      1
['Igor Barinov']                           1
['Tamara Nersesyan']                       1
['Konstantin Yaroshenko']                  1
Name: speaker, Length: 93, dtype: int64

In [163]:
ru_mofa.reset_index(drop=True)

Unnamed: 0,headline,publication_date,article_content,speaker
0,"Telephone conversation between Sergey Lavrov, ...",2012-01-10,"PRESS-RELEASE On January 10, a telephone conve...",[Sergey Lavrov]
1,"Telephone Conversation Between S. Lavrov, Mini...",2012-01-11,"PRESS-RELEASE On January 11, S. Lavrov, Minist...",[Sergey Lavrov]
2,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,"PRESS-RELEASE William Burns, United States Fir...",[Sergey Lavrov]
3,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,PRESS-RELEASE Traditional meeting of Sergey La...,[Sergey Lavrov]
4,"Meeting of S. Lavrov, Minister of Foreign Affa...",2012-01-17,"PRESS-RELEASE On January 17, a working meeting...",[Sergey Lavrov]
...,...,...,...,...
13564,Foreign Ministry statement on extending the Ru...,2020-12-29,"For a long period of time, the story about a ‚...",[Sergey Lavrov]
13565,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-29,"Mr Minister,_x000D_\n_x000D_\nMy dear Mevlut,_...",[Sergey Lavrov]
13566,Foreign Minister Sergey Lavrov‚Äôs interview w...,2020-12-30,Question: The pandemic has changed people‚Äôs ...,[Sergey Lavrov]
13567,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-30,"Mr Minister,_x000D_\n_x000D_\nColleagues,_x000...",[Sergey Lavrov]


In [164]:
# Each article in col article_content starts with a disclaimer: "Unofficial translation from Russian";\ 
# some of them has the datetime at the end
# no NAN
# headline is what in next dataset is "description"
# remove _x000D_\n_x000D_\

## Data Preprocessing

### Identify Noise with Regex

In [165]:
import re 
RE_SUSPICIOUS = re.compile(r'[Äô¬†&#<>{}\[\]\\]')

In [166]:
def impurity(text, min_len=10): 
    """returns the share of suspicious characters in a text""" 
    if text == None or len(text) < min_len: 
        return 0 
    else: 
        return len(RE_SUSPICIOUS.findall(text))/len(text)

In [167]:
# add new column to data frame
ru_mofa['impurity'] = ru_mofa['article_content'].apply(impurity, min_len=10)

In [168]:
ru_mofa['impurity'].sort_values().value_counts()

0.000000    12519
0.000061       58
0.000122       36
0.000031       12
0.000183        9
            ...  
0.000502        1
0.000504        1
0.000531        1
0.000531        1
0.082069        1
Name: impurity, Length: 891, dtype: int64

In [169]:
# get the top 3 records 
ru_mofa[['article_content', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

Unnamed: 0,article_content,impurity
21908,"Mr. President, _x000D_\n_x000D_\nMr. Secretary...",0.082069
21976,"Vladimir Putin: Good afternoon, colleagues,_x0...",0.079931
22059,"Vladimir Putin: Good afternoon, colleagues._x0...",0.077296


### Remove Noise with Regex

In [170]:
import html 
def clean(text):
     # convert html escapes like &amp; to characters. 
     text = html.unescape(text) 
     # tags like <tab> 
     text = re.sub(r'<[^<>]*>', ' ', text) 
     # markdown URLs like [Some text](https://....) 
     text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text) 
     # text or code in brackets like [0] 
     text = re.sub(r'\[[^\[\]]*\]', ' ', text) 
     # standalone sequences of specials, matches ¬† but not #cool 
     text = re.sub(r'(?:^|\s)[¬†<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text) 
     # standalone sequences of hyphens like --- or == 
     text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text) 
     # everything non-alpahnumeric with a space
     text = re.sub(r'\W+',' ', text)
     # Two or more dots with one
     text = re.sub(r'\.{2,}', ' ', text)
     # all the non ASCII characters      
     text = re.sub(r'[^\x00-\x7F]+',' ', text)
     # PRESS RELEASE
     text = re.sub('PRESS RELEASE', ' ', text)    
     # _x000D_
     text = re.sub('_x000D_', ' ', text)
     # Unofficial translation from Russian
     text = re.sub('Unofficial translation from Russian', ' ', text)
     # on Month, Day
     text = re.sub(r"[a-zA-Z]+ [a-zA-Z]+ \d+", ' ', text)
      # sequences of white spaces
     text = re.sub(r'\s+', ' ', text)
     # good morning, etc
     #text = re.sub('Good morning|Good afternoon|Good evening|ladies and gentlemen|thank you', ' ', text)
     # Sergey Lavrov
     text = re.sub('Sergey Lavrov|S. Lavrov|S Lavrov| Lavrov', ' Sergey_Lavrov', text)
     return text.strip()

In [171]:
ru_mofa['text_clean'] = ru_mofa['article_content'].map(clean)

In [172]:
ru_mofa['text_clean']

8643     a telephone conversation between Minister of F...
8644     Sergey_Lavrov Minister of Foreign Affairs of t...
8645     William Burns United States First Deputy Secre...
8646     Traditional meeting of  Sergey_Lavrov Minister...
8647     a working meeting between  Sergey_Lavrov Minis...
                               ...                        
22207    For a long period of time the story about a Ru...
22208    Mr Minister My dear Mevlut Colleagues and frie...
22209    Question The pandemic has changed people s liv...
22210    Mr Minister Colleagues Welcome to Moscow We be...
22211    Ladies and gentlemen Foreign Minister of the L...
Name: text_clean, Length: 13569, dtype: object

In [173]:
ru_mofa['impurity'] = ru_mofa['text_clean'].apply(impurity, min_len=10) 

In [174]:
ru_mofa[['text_clean', 'impurity']].sort_values(by='impurity', ascending=False).head(3)

Unnamed: 0,text_clean,impurity
8643,a telephone conversation between Minister of F...,0.0
17681,Foreign Minister Sergey_Lavrov had a traditio...,0.0
17683,Table of contents New methods of exerting pres...,0.0


In [175]:

#rename text_clean and remove col impurity
ru_mofa.rename(columns={'text_clean': 'text'}, inplace=True) 
ru_mofa.drop(columns=['impurity'], inplace=True)

### Processing with SpaCy



In [176]:
import spacy 

#### Tokenize, POS, Lemma

In [177]:
import spacy 
from spacy.tokenizer import Tokenizer 
from spacy.lang.es.stop_words import STOP_WORDS
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

In [178]:
#creates a tokenizer object with individual rules in a “minimally invasive” way: 
# drop the respective patterns from spaCy’s default rules but retain the major part of the logic

def custom_tokenizer(nlp): # use default patterns except the ones matched by re.search 
    prefixes = [pattern for pattern in nlp.Defaults.prefixes 
                if pattern not in ['-', '_', '#']] 
    suffixes = [pattern for pattern in nlp.Defaults.suffixes 
                if pattern not in ['_']] 
    infixes = [pattern for pattern in nlp.Defaults.infixes 
                if not re.search(pattern, 'xx-xx')]

    return Tokenizer(vocab = nlp.vocab,
                    rules = nlp.Defaults.tokenizer_exceptions, 
                    prefix_search = compile_prefix_regex(prefixes).search, 
                    suffix_search = compile_suffix_regex(suffixes).search, 
                    infix_finditer = compile_infix_regex(infixes).finditer, 
                    token_match = nlp.Defaults.token_match)


In [179]:
nlp = spacy.load('en_core_web_sm', exclude=["ner"])
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")

In [180]:
nlp.tokenizer = custom_tokenizer(nlp)

In [181]:
#Stopwords
from spacy.lang.en import stop_words
nlp.Defaults.stop_words |= { 'affairs', 'ambassador', 'colleagues', 'dear', 'deputy', 'foreign', 'gentleman', 'lady', 'minister', 'ministry', 'like', 'mr', 'mrs', 'miss', 'president', 'question', 's', 'sergey_lavrov', 'today', 'tomorrow', 'welcome', 'yesterday'}
stop_words = stop_words.STOP_WORDS
stop_words

{' s',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'affairs',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'ambassador',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'colleagues',
 'could',
 'dear',
 'deputy',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five

In [182]:
tokens = []
lemma = []
pos = []

for doc in nlp.pipe(ru_mofa['text'].astype('unicode').values, batch_size=50):
    tokens.append([n.text.lower() for n in doc])
    lemma.append([n.lemma_.lower() for n in doc
                if n.lemma_.lower() not in stop_words])
    pos.append([n.pos_ for n in doc])

ru_mofa['tokens'] = tokens
ru_mofa['lemma'] = lemma
ru_mofa['pos'] = pos

In [183]:
ru_mofa['lemma']

8643     [telephone, conversation, russian, federation,...
8644     [russian, federation, r, sikorski, republic, p...
8645     [william, burns, united, states, secretary, st...
8646     [traditional, meeting,  , russian, federation,...
8647     [work, meeting,  , russian, federation, e, nal...
                               ...                        
22207    [long, period, time, story, russian, trace, ha...
22208    [mevlut, friend, happy, th, meeting, joint, st...
22209    [pandemic, change, people, life, year, instead...
22210    [moscow, believe, meeting, time, exaggeration,...
22211    [libyan, government, national, accord, mohamed...
Name: lemma, Length: 13569, dtype: object

In [184]:
ru_mofa.reset_index(drop=True)

Unnamed: 0,headline,publication_date,article_content,speaker,text,tokens,lemma,pos
0,"Telephone conversation between Sergey Lavrov, ...",2012-01-10,"PRESS-RELEASE On January 10, a telephone conve...",[Sergey Lavrov],a telephone conversation between Minister of F...,"[a, telephone, conversation, between, minister...","[telephone, conversation, russian, federation,...","[DET, NOUN, NOUN, ADP, PROPN, ADP, PROPN, PROP..."
1,"Telephone Conversation Between S. Lavrov, Mini...",2012-01-11,"PRESS-RELEASE On January 11, S. Lavrov, Minist...",[Sergey Lavrov],Sergey_Lavrov Minister of Foreign Affairs of t...,"[sergey_lavrov, minister, of, foreign, affairs...","[russian, federation, r, sikorski, republic, p...","[PROPN, PROPN, ADP, PROPN, PROPN, ADP, DET, PR..."
2,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,"PRESS-RELEASE William Burns, United States Fir...",[Sergey Lavrov],William Burns United States First Deputy Secre...,"[william, burns, united, states, first, deputy...","[william, burns, united, states, secretary, st...","[PROPN, PROPN, PROPN, PROPN, PROPN, PROPN, PRO..."
3,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,PRESS-RELEASE Traditional meeting of Sergey La...,[Sergey Lavrov],Traditional meeting of Sergey_Lavrov Minister...,"[traditional, meeting, of, , sergey_lavrov, m...","[traditional, meeting, , russian, federation,...","[ADJ, NOUN, ADP, SPACE, PROPN, PROPN, ADP, PRO..."
4,"Meeting of S. Lavrov, Minister of Foreign Affa...",2012-01-17,"PRESS-RELEASE On January 17, a working meeting...",[Sergey Lavrov],a working meeting between Sergey_Lavrov Minis...,"[a, working, meeting, between, , sergey_lavro...","[work, meeting, , russian, federation, e, nal...","[DET, VERB, NOUN, ADP, SPACE, PROPN, PROPN, AD..."
...,...,...,...,...,...,...,...,...
13564,Foreign Ministry statement on extending the Ru...,2020-12-29,"For a long period of time, the story about a ‚...",[Sergey Lavrov],For a long period of time the story about a Ru...,"[for, a, long, period, of, time, the, story, a...","[long, period, time, story, russian, trace, ha...","[ADP, DET, ADJ, NOUN, ADP, NOUN, DET, NOUN, AD..."
13565,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-29,"Mr Minister,_x000D_\n_x000D_\nMy dear Mevlut,_...",[Sergey Lavrov],Mr Minister My dear Mevlut Colleagues and frie...,"[mr, minister, my, dear, mevlut, colleagues, a...","[mevlut, friend, happy, th, meeting, joint, st...","[PROPN, PROPN, PRON, ADJ, PROPN, PROPN, CCONJ,..."
13566,Foreign Minister Sergey Lavrov‚Äôs interview w...,2020-12-30,Question: The pandemic has changed people‚Äôs ...,[Sergey Lavrov],Question The pandemic has changed people s liv...,"[question, the, pandemic, has, changed, people...","[pandemic, change, people, life, year, instead...","[NOUN, DET, NOUN, AUX, VERB, NOUN, PART, NOUN,..."
13567,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-30,"Mr Minister,_x000D_\n_x000D_\nColleagues,_x000...",[Sergey Lavrov],Mr Minister Colleagues Welcome to Moscow We be...,"[mr, minister, colleagues, welcome, to, moscow...","[moscow, believe, meeting, time, exaggeration,...","[PROPN, PROPN, PROPN, VERB, ADP, PROPN, PRON, ..."


In [201]:
ru_mofa.to_excel('data/ru_mofa.xlsx', engine='xlsxwriter', index=False)

## Topic Model with GENSIM

In [185]:
import numpy as np
import pandas as pd
import spacy 
nlp = spacy.load('en_core_web_sm', exclude=["ner"])
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")

In [202]:
ru_mofa = pd.read_excel('data/ru_mofa.xlsx')

In [203]:
ru_mofa 

Unnamed: 0,headline,publication_date,article_content,speaker,text,tokens,lemma,pos
0,"Telephone conversation between Sergey Lavrov, ...",2012-01-10,"PRESS-RELEASE On January 10, a telephone conve...",[Sergey Lavrov],a telephone conversation between Minister of F...,"['a', 'telephone', 'conversation', 'between', ...","['telephone', 'conversation', 'russian', 'fede...","['DET', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'ADP',..."
1,"Telephone Conversation Between S. Lavrov, Mini...",2012-01-11,"PRESS-RELEASE On January 11, S. Lavrov, Minist...",[Sergey Lavrov],Sergey_Lavrov Minister of Foreign Affairs of t...,"['sergey_lavrov', 'minister', 'of', 'foreign',...","['russian', 'federation', 'r', 'sikorski', 're...","['PROPN', 'PROPN', 'ADP', 'PROPN', 'PROPN', 'A..."
2,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,"PRESS-RELEASE William Burns, United States Fir...",[Sergey Lavrov],William Burns United States First Deputy Secre...,"['william', 'burns', 'united', 'states', 'firs...","['william', 'burns', 'united', 'states', 'secr...","['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', ..."
3,"Meeting of Sergey Lavrov, Minister of Foreign ...",2012-01-16,PRESS-RELEASE Traditional meeting of Sergey La...,[Sergey Lavrov],Traditional meeting of Sergey_Lavrov Minister...,"['traditional', 'meeting', 'of', ' ', 'sergey_...","['traditional', 'meeting', ' ', 'russian', 'fe...","['ADJ', 'NOUN', 'ADP', 'SPACE', 'PROPN', 'PROP..."
4,"Meeting of S. Lavrov, Minister of Foreign Affa...",2012-01-17,"PRESS-RELEASE On January 17, a working meeting...",[Sergey Lavrov],a working meeting between Sergey_Lavrov Minis...,"['a', 'working', 'meeting', 'between', ' ', 's...","['work', 'meeting', ' ', 'russian', 'federatio...","['DET', 'VERB', 'NOUN', 'ADP', 'SPACE', 'PROPN..."
...,...,...,...,...,...,...,...,...
13564,Foreign Ministry statement on extending the Ru...,2020-12-29,"For a long period of time, the story about a ‚...",[Sergey Lavrov],For a long period of time the story about a Ru...,"['for', 'a', 'long', 'period', 'of', 'time', '...","['long', 'period', 'time', 'story', 'russian',...","['ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', '..."
13565,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-29,"Mr Minister,_x000D_\n_x000D_\nMy dear Mevlut,_...",[Sergey Lavrov],Mr Minister My dear Mevlut Colleagues and frie...,"['mr', 'minister', 'my', 'dear', 'mevlut', 'co...","['mevlut', 'friend', 'happy', 'th', 'meeting',...","['PROPN', 'PROPN', 'PRON', 'ADJ', 'PROPN', 'PR..."
13566,Foreign Minister Sergey Lavrov‚Äôs interview w...,2020-12-30,Question: The pandemic has changed people‚Äôs ...,[Sergey Lavrov],Question The pandemic has changed people s liv...,"['question', 'the', 'pandemic', 'has', 'change...","['pandemic', 'change', 'people', 'life', 'year...","['NOUN', 'DET', 'NOUN', 'AUX', 'VERB', 'NOUN',..."
13567,Foreign Minister Sergey Lavrov‚Äôs opening rem...,2020-12-30,"Mr Minister,_x000D_\n_x000D_\nColleagues,_x000...",[Sergey Lavrov],Mr Minister Colleagues Welcome to Moscow We be...,"['mr', 'minister', 'colleagues', 'welcome', 't...","['moscow', 'believe', 'meeting', 'time', 'exag...","['PROPN', 'PROPN', 'PROPN', 'VERB', 'ADP', 'PR..."


In [None]:
ru_mofa

### MODEL 1 ###
num_topics=10


In [188]:
list_of_docs = ru_mofa['lemma'].tolist()

### Compute bigrams

In [189]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(list_of_docs, min_count=20)
for idx in range(len(list_of_docs)):
    for token in bigram[list_of_docs[idx]]:
        if '_' in token:
           c # Token is a bigram, add to document.
            list_of_docs[idx].append(token)

### Create a Dictionary

In [190]:
from gensim.corpora import Dictionary

In [191]:
dictionary = Dictionary(list_of_docs)

In [192]:
dictionary.num_docs

13569

In [193]:
# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [194]:
#Get number of stored tokens
print(len(dictionary))

10947


### Create a BOW corpus

In [195]:
 #compute the frequency of each word
corpus = [dictionary.doc2bow(text) for text in list_of_docs]

In [196]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 10947
Number of documents: 13569


### TD-IDF transformation

multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing the resulting documents to unit length.

In [197]:
from gensim.models import TfidfModel 
tfidf = TfidfModel(corpus) 
vectors = tfidf[corpus]

### LDA

In [198]:
from gensim.models import LdaModel 

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20 #controls how often we train the model on the entire corpus.
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

lda = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every, 
    random_state=42)

In [199]:
lda.show_topics()

[(0,
  '0.010*" " + 0.008*"united_states" + 0.008*"eu" + 0.008*"united" + 0.007*"time" + 0.007*"talk" + 0.007*"states" + 0.007*"european" + 0.006*"think" + 0.006*"know"'),
 (1,
  '0.016*"east" + 0.015*"middle" + 0.013*"settlement" + 0.012*"political" + 0.011*"syria" + 0.010*"middle_east" + 0.010*"special" + 0.009*"situation" + 0.009*"syrian" + 0.009*"un"'),
 (2,
  '0.030*"ukraine" + 0.025*"ukrainian" + 0.016*"osce" + 0.016*"kiev" + 0.013*"minsk" + 0.008*"force" + 0.007*"donbass" + 0.007*"contact" + 0.007*"agreement" + 0.006*"military"'),
 (3,
  '0.013*"security" + 0.010*"un" + 0.009*"treaty" + 0.009*"nuclear" + 0.009*"state" + 0.007*"states" + 0.006*"use" + 0.006*"world" + 0.006*"united" + 0.006*"global"'),
 (4,
  '0.022*"syria" + 0.018*"syrian" + 0.016*"terrorist" + 0.014*"un" + 0.010*"chemical" + 0.009*"group" + 0.008*"al" + 0.008*"security" + 0.008*"use" + 0.007*"council"'),
 (5,
  '0.011*"right" + 0.011*"human" + 0.010*"world" + 0.008*"people" + 0.008*"council" + 0.007*"war" + 0.00

In [200]:
# coherence level
top_topics = lda.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -0.9000.
[([(0.021058803, 'cooperation'),
   (0.011656194, 'economic'),
   (0.009000386, 'relation'),
   (0.008772561, 'meeting'),
   (0.0078087715, 'trade'),
   (0.007058805, 'region'),
   (0.006996523, 'development'),
   (0.0066829696, 'year'),
   (0.0056591756, 'hold'),
   (0.0055419123, 'bilateral'),
   (0.0054449625, 'joint'),
   (0.005332655, 'agreement'),
   (0.0051956405, 'area'),
   (0.0051501277, 'state'),
   (0.0051497207, 'regional'),
   (0.005142295, 'visit'),
   (0.005112244, 'work'),
   (0.0049873167, 'project'),
   (0.0044529443, 'dialogue'),
   (0.004090795, 'tie')],
  -0.7853374279804888),
 ([(0.009818873, ' '),
   (0.008244667, 'united_states'),
   (0.00808636, 'eu'),
   (0.007540068, 'united'),
   (0.00721698, 'time'),
   (0.007057178, 'talk'),
   (0.006676797, 'states'),
   (0.00661927, 'european'),
   (0.0064969226, 'think'),
   (0.0062653604, 'know'),
   (0.006010205, 'want'),
   (0.005979148, 't'),
   (0.0058769467, 'relation'),
   (0.00

In [None]:
from gensim.test.utils import datapath

#saving model to disk.
temp_file = datapath("lda")

In [None]:
#loading model from disk

#from gensim import  models

#lda = models.ldamodel.LdaModel.load(temp_file)