### 1- Load UN Speeches to Dataframe

In [54]:
import os
import numpy as np
import pandas as pd

sessions = np.arange(25, 76)
data=[]

for session in sessions:
    directory = "./Data/TXT/Session "+str(session)+" - "+str(1945+session)
    # directory = f"./TXT/Session {session} - {1945+session}"
    for filename in os.listdir(directory):
        # f = open(os.path.join(directory, filename))
        with open(os.path.join(directory, filename)) as f:
            if filename[0]==".": #ignore hidden files
                continue
            splt = filename.split("_")
            data.append([session, 1945+session, splt[0], f.read()])

df_speech = pd.DataFrame(data, columns=['Session','Year','ISO-alpha3 Code','Speech'])

df_speech.tail()

Unnamed: 0,Session,Year,ISO-alpha3 Code,Speech
8476,75,2020,HRV,"Mr President, Excellencies\nAll protocol obser..."
8477,75,2020,GAB,"Mr. President, Majesties,\nLadies and Gentleme..."
8478,75,2020,MCO,"Mr. President of the General Assembly,\nMr. Se..."
8479,75,2020,AND,"Mr. President,\nMr. Secretary General,\nYour E..."
8480,75,2020,BHR,"In the name of Allah, the most gracious, the m..."


### 2- Download the 'UNSD — Methodology.csv' to get detailed country information

In [55]:
codes_path = "./Data/UNSD_Methodology.csv"
df_codes = pd.read_csv(codes_path, sep=";")
df_codes.head()

Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,
2,1,World,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY,,,
3,1,World,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR,,,
4,1,World,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN,x,,


### 3- Merge Datasets

In [56]:
df_un_merged = pd.merge(
    df_speech[["ISO-alpha3 Code", "Session", "Year", "Speech"]],
    df_codes[["Country or Area", "Region Name","ISO-alpha3 Code"]],
    on = "ISO-alpha3 Code",
    how="left")

# Set the index
df_un_merged = df_un_merged.set_index(["Year", "ISO-alpha3 Code"]).sort_index()
df_un_merged.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Session,Speech,Country or Area,Region Name
Year,ISO-alpha3 Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970,ALB,25,33: May I first convey to our President the co...,Albania,Europe
1970,ARG,25,177.\t : It is a fortunate coincidence that pr...,Argentina,Americas
1970,AUS,25,100.\t It is a pleasure for me to extend to y...,Australia,Oceania
1970,AUT,25,155.\t May I begin by expressing to Ambassado...,Austria,Europe
1970,BEL,25,"176. No doubt each of us, before coming up to ...",Belgium,Europe


;### 4- Filtering Countries Within the Africa Region

In [57]:
df_africa = df_un_merged[df_un_merged["Region Name"] == "Africa"]
df_africa.tail(20)


Unnamed: 0_level_0,Unnamed: 1_level_0,Session,Speech,Country or Area,Region Name
Year,ISO-alpha3 Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020,MWI,75,Your Excellency Mr. Volkan Bozkir — President ...,Malawi,Africa
2020,NAM,75,"Your Excellency, Volkan Bozkir, President of t...",Namibia,Africa
2020,NER,75,"Mr. President,\nExcellencies Heads of State an...",Niger,Africa
2020,NGA,75,"Mr. President,\nHeads of State and Government,...",Nigeria,Africa
2020,RWA,75,"•\tExcellency, Volkan Bozkir, President of the...",Rwanda,Africa
2020,SDN,75,"In the name of God, the most merciful,\nYour M...",Sudan,Africa
2020,SEN,75,"Mr. President of the General Assembly,\nLadies...",Senegal,Africa
2020,SLE,75,"Mr. President,\nMr. Secretary General,\nYour E...",Sierra Leone,Africa
2020,SOM,75,"Honourable President of the Assembly, Excellen...",Somalia,Africa
2020,STP,75,Honorable President of the 75th Session of the...,Sao Tome and Principe,Africa


### 5- Extracting the education-related words from Unesco thesaurus (education group)

In [58]:
# Import necessary libraries
import requests
import pandas as pd

# Fetch the JSON file directly from URL
educationGlossaryURL = "https://id.loc.gov/authorities/subjects/sh85040989.json"
URLResponse = requests.get(educationGlossaryURL)
educationGlossaryData = URLResponse.json()

#Collect all the @value entries from the JSON, as education-related words are encoded within.
education_related_terms = []
for entry in educationGlossaryData:
    for key, values in entry.items():
        if isinstance(values, list):
            for value in values:
                if isinstance(value, dict) and "@value" in value:
                    education_related_terms.append(value["@value"])


In [59]:
# Build a dataframe of unique terms
df_education_related_terms = pd.DataFrame(education_related_terms, columns=["Education-related Key Word"])
print(df_education_related_terms.head(20))
print(df_education_related_terms.tail(20))
print(df_education_related_terms.size)


                      Education-related Key Word
0                     Human resource development
1                          Pictures in education
2                          Pictures in education
3   150  $aEducation$xAustralian states$xFinance
4          Education--Australian states--Finance
5          Education--Australian states--Finance
6                            Inclusive education
7                            Inclusive education
8           150  $aEducation$xEconometric models
9                  Education--Econometric models
10                 Education--Econometric models
11             150  $aEducation$xReference books
12                    Education--Reference books
13                    Education--Reference books
14                               Home and school
15                               Home and school
16                        Compensatory education
17                        Compensatory education
18                       Libraries and education
19                  

In [60]:
# Remove the duplicates
df_education_related_terms_clean = df_education_related_terms.drop_duplicates()
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)


                           Education-related Key Word
0                          Human resource development
1                               Pictures in education
3        150  $aEducation$xAustralian states$xFinance
4               Education--Australian states--Finance
6                                 Inclusive education
8                150  $aEducation$xEconometric models
9                       Education--Econometric models
11                  150  $aEducation$xReference books
12                         Education--Reference books
14                                    Home and school
16                             Compensatory education
18                            Libraries and education
20                                   Youth--Education
21   150  $aEducation$xStudy and teaching (Preschool)
22          Education--Study and teaching (Preschool)
24          150  $aEducation$xPeriodicals$xPublishing
25                 Education--Periodicals--Publishing
27                         1

In [61]:
#Check whether any null field exists
print(f"Null entry exists? {df_education_related_terms_clean.isnull().values.any()}")

Null entry exists? False


In [62]:
#Remove the entries start with digits
df_education_related_terms_clean = df_education_related_terms_clean[
    ~df_education_related_terms_clean["Education-related Key Word"].str[0].str.isdigit()]
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)

                   Education-related Key Word
0                  Human resource development
1                       Pictures in education
4       Education--Australian states--Finance
6                         Inclusive education
9               Education--Econometric models
12                 Education--Reference books
14                            Home and school
16                     Compensatory education
18                    Libraries and education
20                           Youth--Education
22  Education--Study and teaching (Preschool)
25         Education--Periodicals--Publishing
28                        Education--Taxation
31  Education--US states--Aims and objectives
33                        Technical education
35                                  Education
36                            Moral education
38                        Affective education
40          Professional learning communities
42                      Education of children
387


In [63]:
# Replace the double dash ('--') with white spaces
df_education_related_terms_clean["Education-related Key Word"] = df_education_related_terms_clean[
    "Education-related Key Word"].str.replace("--", " ")
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)

                  Education-related Key Word
0                 Human resource development
1                      Pictures in education
4        Education Australian states Finance
6                        Inclusive education
9               Education Econometric models
12                 Education Reference books
14                           Home and school
16                    Compensatory education
18                   Libraries and education
20                           Youth Education
22  Education Study and teaching (Preschool)
25          Education Periodicals Publishing
28                        Education Taxation
31   Education US states Aims and objectives
33                       Technical education
35                                 Education
36                           Moral education
38                       Affective education
40         Professional learning communities
42                     Education of children
387


In [64]:
# Normalize all the whitespaces (i.e., only one space between words) & lowercase all entries
df_education_related_terms_clean["Education-related Key Word"] = df_education_related_terms_clean[
    "Education-related Key Word"].str.replace(r"\s", " ", regex=True).str.strip().str.lower()
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)

                  Education-related Key Word
0                 human resource development
1                      pictures in education
4        education australian states finance
6                        inclusive education
9               education econometric models
12                 education reference books
14                           home and school
16                    compensatory education
18                   libraries and education
20                           youth education
22  education study and teaching (preschool)
25          education periodicals publishing
28                        education taxation
31   education us states aims and objectives
33                       technical education
35                                 education
36                           moral education
38                       affective education
40         professional learning communities
42                     education of children
387


In [65]:
# Convert all chacters to unicode? é

### 6- Computing the frequency of education-related word usage per african country

In [66]:
text = df_africa.loc[(2010, "EGY"), "Speech"]
print(text)

The 
beginning of the sixty-fifth session of the General 
Assembly coincides with the preparations of the 
Non-Aligned Movement (NAM), chaired by my 
country, to celebrate 50 years of achievements. During 
those years, the Movement has greatly helped to 
reinforce international and multilateral action under 
changing regional and international circumstances. The 
Movement’s work over those years has demonstrated 
the ability of developing countries to contribute 
effectively to the maintenance of international peace 
and security, make progress in development, and 
promote human rights, basic freedoms and the 
furtherance of good governance at the international 
level. 
 
 
37 10-55103 
 
 This week’s NAM ministerial meeting represents 
the point of departure for a major celebration, to be 
hosted by Indonesia in April or May 2011. The meeting 
will chart a clear and integrated future course, drawing 
on past achievements as an impetus for future 
endeavours. It will define the element

Import the nltk libraries

In [67]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bugrasipahioglu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bugrasipahioglu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bugrasipahioglu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/bugrasipahioglu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Preprocess

In [68]:
def preprocess(words):
    custom_stopwords = set(stopwords.words("english")) | {"United Nations"}
    no_sw = []
    for w in words:
        if (w not in custom_stopwords):
            w.lower()
            if w.isalpha():
                no_sw.append(w)
    return no_sw

Aggregate the sum of education-related words into the df_africa dataset

In [69]:
def count_education_related_terms(text):
    tokens = [w.lower() for w in word_tokenize(text) if w.isalpha()]
    freq = FreqDist(tokens)
    # sum frequencies of education terms
    return sum(freq[t] for t in df_education_related_terms_clean if t in freq)

df_africa = df_africa.copy()
df_africa["edu_count"] = df_africa["Speech"].apply(count_education_related_terms)
df_africa.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Session,Speech,Country or Area,Region Name,edu_count
Year,ISO-alpha3 Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970,CMR,25,: A year ago I came here as the Acting Preside...,Cameroon,Africa,0
1970,COG,25,122.\t I cannot begin my intervention without...,Congo,Africa,0
1970,DZA,25,1. The delegation of Algeria is very pleased ...,Algeria,Africa,0
1970,GHA,25,121.\t I should like to begin by congratulatin...,Ghana,Africa,0
1970,GIN,25,35.\t The delegation of the Republic of Guine...,Guinea,Africa,0


In [71]:
df_africa.tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Session,Speech,Country or Area,Region Name,edu_count
Year,ISO-alpha3 Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020,MWI,75,Your Excellency Mr. Volkan Bozkir — President ...,Malawi,Africa,0
2020,NAM,75,"Your Excellency, Volkan Bozkir, President of t...",Namibia,Africa,0
2020,NER,75,"Mr. President,\nExcellencies Heads of State an...",Niger,Africa,0
2020,NGA,75,"Mr. President,\nHeads of State and Government,...",Nigeria,Africa,0
2020,RWA,75,"•\tExcellency, Volkan Bozkir, President of the...",Rwanda,Africa,0
2020,SDN,75,"In the name of God, the most merciful,\nYour M...",Sudan,Africa,0
2020,SEN,75,"Mr. President of the General Assembly,\nLadies...",Senegal,Africa,0
2020,SLE,75,"Mr. President,\nMr. Secretary General,\nYour E...",Sierra Leone,Africa,0
2020,SOM,75,"Honourable President of the Assembly, Excellen...",Somalia,Africa,0
2020,STP,75,Honorable President of the 75th Session of the...,Sao Tome and Principe,Africa,0


Let us now see some examples of word analysis with NLTK:

Which were the most frequent words used in the Austrian Speech in 1970?

In [None]:
from nltk.probability import FreqDist
from nltk import word_tokenize

# load text of Austria in 1970
text = df_un_merged.loc[1970,'AUT']["Speech"]

# tokenize words
words = word_tokenize(text)

# compute word frequency
freq = FreqDist(words)

# show 30 most frequent words
freq.most_common(30)

In [None]:
# plot the histogram with the top most used words
freq.plot(20)

Notice that the most frequent words are not that informative about the Austrian speech (the, of, to...). These words are often called *stop-words*. These words are generally filtered out before processing text (natural language). These are actually some of the most common words in any language (articles, prepositions, pronouns, conjunctions, etc) but do not add much information to the text. Let's now use NLTK to filter those words

In [None]:
from nltk.corpus import stopwords

def preprocess(words):
    sw = stopwords.words("english")
    no_sw = []
    for w in words:
        if (w not in sw):
            no_sw.append(w)
    return no_sw

text = df_un_merged.loc[2002,"AFG"]["Speech"]

words = word_tokenize(text)
words = preprocess(words)
freq = FreqDist(words)

freq.plot(20)

**Q2: Can you change the method preprocess to put all words in lower case, remove punctuation and remove non-informative words (e.g., United Nations)?**

Tip: the method isalpha() might be useful

In [None]:
def preprocess(words):
    custom_stopwords = set(stopwords.words("english")) | {"United Nations"}
    no_sw = []
    for w in words:
        if (w not in custom_stopwords):
            w.lower()
            if w.isalpha():
                no_sw.append(w)
    return no_sw
text = df_un_merged.loc[2002,"AFG"]["Speech"]

words = word_tokenize(text)
words = preprocess(words)
freq = FreqDist(words)

freq.plot(20)

A regular expression is a sequence of characters that specifies a pattern. Usually, such patterns are used by to find, match, replace sub-strings within a document. Regular expressions have a particular syntax and are often useful to clean and pre-process textual data. Here one example where the regular expression 'afg.\*' is used to match any word that starts with afg and is followed by any character.

In [None]:
# Regular expression example
s = set({})
import re
for w in words:
    if re.match('afg.*n$', w):
        s.add(w)
print(s)

Another useful usage of NLTK is performing sentiment analysis.

Sentiment analysis can be seen as the process of automatically classifying text into positive or negative sentiment categories. With NLTK, you can employ these algorithms without effort. This was also called opinion mining.

In the political field, sentiment analysis is used to keep track of political view, to detect consistency and inconsistency between statements and actions at the government level or to derive the opinion or attitude of a speaker.

NLTK implements VADER (Valence Aware Dictionary and sEntiment Reasoner), which is a lexicon and rule-based sentiment analysis. VADER uses a list of lexical features (e.g., words) which are generally labeled according to their semantic orientation as either positive or negative. VADER not only tells about the Positivity and Negativity score but also tells us about how positive or negative a sentiment is.

NLTK implements VADER through the module SentimentIntensityAnalyzer. Below an example of application (with natural limitations as VADER is specifically attuned to sentiments expressed in **social media**):

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt

sia = SentimentIntensityAnalyzer()

vecUSA = [sia.polarity_scores(df_un_merged.loc(axis=0)[i,"USA"]["Speech"])['pos'] for i in np.arange(1971, 2021)]
vecRUS = [sia.polarity_scores(df_un_merged.loc(axis=0)[i,"RUS"]["Speech"])['pos'] for i in np.arange(1971, 2021)]
vecCHN = [sia.polarity_scores(df_un_merged.loc(axis=0)[i,"CHN"]["Speech"])['pos'] for i in np.arange(1971, 2021)]

fig, ax = plt.subplots()
ax.plot(np.arange(1971, 2021), vecUSA, label='USA')
ax.plot(np.arange(1971, 2021), vecRUS, label='RUS')
ax.plot(np.arange(1971, 2021), vecCHN, label='CHN')
ax.set_xlabel('Year')
ax.set_ylabel('Positive Sentiment Score')

ax.legend()
plt.show()

## 2- Extracting the education-related words from Unesco thesaurus (education group)


In [None]:
# Import necessary libraries
import requests
import pandas as pd

In [None]:
# Fetch the JSON file directly from URL
educationGlossaryURL = "https://id.loc.gov/authorities/subjects/sh85040989.json"
URLResponse = requests.get(educationGlossaryURL)
educationGlossaryData = URLResponse.json()

#Collect all the @value entries from the JSON, as education-related words are encoded within.
education_related_terms = []
for entry in educationGlossaryData:
    for key,values in entry.items():
        if isinstance(values, list):
            for value in values:
                if isinstance(value, dict) and "@value" in value:
                    education_related_terms.append(value["@value"])
# Build a dataframe of unique terms
df_education_related_terms = pd.DataFrame(education_related_terms, columns = ["Education-related Key Word"])
print(df_education_related_terms.head(20))
print(df_education_related_terms.tail(20))
print(df_education_related_terms.size)

In [None]:
# Remove the duplicates
df_education_related_terms_clean = df_education_related_terms.drop_duplicates()
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)

In [None]:
#Check whether any null field exists
print(f"Null entry exists? {df_education_related_terms_clean.isnull().values.any()}")

In [None]:
#Remove the entries start with digits
df_education_related_terms_clean = df_education_related_terms_clean[~df_education_related_terms_clean["Education-related Key Word"].str[0].str.isdigit()]
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)


In [None]:
# Replace the double dash ('--') with white spaces
df_education_related_terms_clean["Education-related Key Word"] = df_education_related_terms_clean["Education-related Key Word"].str.replace("--", " ")
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)

In [None]:
# Normalize all the whitespaces (i.e., only one space between words) & lowercase all entries
df_education_related_terms_clean["Education-related Key Word"] = df_education_related_terms_clean["Education-related Key Word"].str.replace(r"\s", " ", regex=True).str.strip().str.lower()
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)


In [None]:
# Convert all chacters to unicode? Keep like this for now until you see the UN corpus.