# FDS Assignment 1: UN Debates, World Happiness and International Trade

We can start by loading the data:

In [1]:
import os
import numpy as np
import pandas as pd

# UN General Assembly sessions 25..75 correspond to the years 1970..2020
# (year = 1945 + session number).
sessions = np.arange(25, 76)
data = []

for session in sessions:
    year = 1945 + session
    directory = "./TXT/Session " + str(session) + " - " + str(year)
    # Sort the listing: os.listdir() returns entries in arbitrary,
    # filesystem-dependent order, which would make the row order irreproducible.
    for filename in sorted(os.listdir(directory)):
        # Skip hidden files *before* opening them, so no handle is created
        # for a file that will never be read.
        if filename[0] == ".":
            continue
        # The part of the file name before the first "_" is stored as the country code.
        splt = filename.split("_")
        # The context manager guarantees the file is closed after reading.
        with open(os.path.join(directory, filename), encoding='utf8') as f:
            data.append([session, year, splt[0], f.read()])

df_speech = pd.DataFrame(data, columns=['Session', 'Year', 'ISO-alpha3 Code', 'Speech'])
df_speech.tail()

Unnamed: 0,Session,Year,ISO-alpha3 Code,Speech
8476,75,2020,WSM,"Mr. President,\nDistinguished delegates,\nLadi..."
8477,75,2020,YEM,In the name of God the Merciful and the Compas...
8478,75,2020,ZAF,"President of the General Assembly, Secretary-G..."
8479,75,2020,ZMB,"Your excellency Mr. Volkan Bozkir, President o..."
8480,75,2020,ZWE,"Your Excellency, Ambassador Volkan Bozkir, Pre..."


In [3]:
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources needed below (no-op when already up to date):
# 'punkt' backs word_tokenize, 'stopwords' backs stopwords.words("english").
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): 'vader_lexicon' is not used by any cell shown here —
# presumably needed for a later sentiment-analysis step; confirm or remove.
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agniv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agniv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\agniv\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
def preprocess(speech):
    """Normalise one speech into a lower-case, space-separated string.

    Steps, in order:
      1. lower-case the text;
      2. tokenize with NLTK and drop English stopwords;
      3. strip every character in ``string.punctuation``;
      4. replace newlines and tabs with spaces;
      5. replace every run of digits (with surrounding whitespace) by one space.

    Parameters
    ----------
    speech : str
        Raw text of a single speech.

    Returns
    -------
    str
        The cleaned text.
    """
    # tolower
    speech = speech.lower()

    # stopwords: build the lookup set once per call instead of re-reading the
    # stopword list (and scanning it linearly) for every single token.
    english_stopwords = set(stopwords.words("english"))
    tokens_without_sw = [
        word for word in word_tokenize(speech) if word not in english_stopwords
    ]
    processed_speech = " ".join(tokens_without_sw)

    # punctuation, indents etc
    processed_speech = processed_speech.translate(str.maketrans('', '', string.punctuation))
    processed_speech = processed_speech.replace("\n", " ")
    processed_speech = processed_speech.replace("\t", " ")
    # Raw string: '\s' and '\d' are invalid escapes in a normal string literal
    # and trigger a warning on recent Python versions.
    # NOTE: digits embedded in a token are removed too (e.g. "co2" -> "co ").
    processed_speech = re.sub(r'\s*\d+\s*', ' ', processed_speech)

    return processed_speech

In [5]:
def yearly_processing(speeches):
    """Clean every speech and merge them into one string.

    Each speech is passed through ``preprocess`` and the results are
    concatenated, every cleaned speech being preceded by a single space
    (so a non-empty result starts with a space).

    Parameters
    ----------
    speeches : iterable of str
        Raw speech texts.

    Returns
    -------
    str
        The concatenation of all cleaned speeches; ``''`` for empty input.
    """
    return ''.join(' ' + preprocess(raw_text) for raw_text in speeches)

In [6]:
def iteration(all_years, years=range(1970, 2021)):
    """Build one merged, preprocessed document per year.

    Parameters
    ----------
    all_years : pandas.DataFrame
        Frame whose index lets ``all_years.loc[year]`` select the rows of a
        given year, and which has a ``'Speech'`` column.
    years : iterable of int, optional
        Years to process, in output order. Defaults to 1970..2020 inclusive,
        the range previously hard-coded in the loop.

    Returns
    -------
    list of str
        One document per entry of ``years``, in the same order.
    """
    corpus = []
    for year in years:
        # Progress indicator: preprocessing a full year of speeches is slow.
        print(year)
        speeches = yearly_processing(all_years.loc[year]['Speech'])
        corpus.append(speeches)

    return corpus

In [7]:
# Index by (Year, ISO-alpha3 Code) so that `.loc[year]` inside iteration()
# selects all speeches given in that year.
corpus = iteration(df_speech.set_index(["Year", "ISO-alpha3 Code"]))

1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [8]:
# Sanity check: one document per year, 1970-2020 inclusive.
print(len(corpus))

51


In [13]:
# Write one yearly document per line. writelines() does not add line
# separators itself, so without the explicit "\n" all documents would be
# merged into a single line and could not be told apart when read back.
# (preprocess() replaces newlines with spaces before the documents are built.)
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(document + "\n" for document in corpus)

## TF-IDF

*Tf* means term-frequency while *tf-idf* means term-frequency times inverse document-frequency. This is a common term-weighting scheme in information retrieval that has also found good use in document classification. The goal of using tf-idf instead of the raw frequency of a token in a given document is to scale down the impact of tokens that occur very frequently across the corpus and that are hence empirically less informative than tokens that occur in only a small fraction of the documents. In this notebook each document is the concatenation of all preprocessed speeches from one year, so the corpus contains 51 documents (1970–2020). If needed, more information can be found in the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html).

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = list(get_names())
dense = X.todense()
denselist = dense.tolist()

# Transpose so that rows are terms and columns are documents
# (one column per entry of `corpus`, labelled by its integer position).
tf_idf = pd.DataFrame(denselist, columns=feature_names).T
print(tf_idf)

          0    1    2    3    4    5    6    7    8    9   ...   41   42   43  \
aa       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
aaa      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
aac      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
aachen   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
aaf      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
шг       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
ьо       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
қарекет  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
қылмақ   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
ﬂagrant  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

               44   45     

In [15]:
# Persist the term-by-document TF-IDF table (rows: terms, columns: integer
# document positions) to the working directory.
tf_idf.to_csv('tfidf.csv')

## Word bank

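Candidate keyword lists to look up in the TF-IDF table: `predictors` (terms that appear to describe climate and environmental problems) and `responses` (terms that appear to describe mitigation measures):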

In [None]:
# Lower-case, punctuation-free keywords, matching the form of the text after preprocess().
# NOTE(review): preprocess() replaces every run of digits with a space, so the
# token 'co2' can never occur in the processed corpus — confirm whether digit
# stripping or this entry should change.
predictors = [
    'climate', 'change', 'glaciers', 'temperature', 'melting', 'greenhouse', 'fossil', 'fuels', 
    'emissions', 'co2', 'disaster', 'hurricane', 'floods', 'pollution', 'tsunami', 'drought', 'carbon', 
    'dioxide', 'carbondioxide', 'ozone', 'ozonelayer', 'global', 'warming', 'atmosphere', 'environment', 
    'environmental', 'oil', 'barrel', 'crude', 'coal', 'sea', 'level', 'ecology', 'climatologists', 'climatology', 
    'manmade', 'planet', 'earth', 'catastrophe', 'urbanization', 'terrestrial', 'antarctica', 'ice', 'depletion', 
    'nonrenewable', 'natural', 'nature', 'tree', 'deforestation', 'amazon', 'rainforest', 'forest', 'fracking', 
    'drilling', 'methane', 'leak', 'waste', 'contamination'
]

# Lower-case, punctuation-free keywords, matching the form of the text after preprocess().
# The draft entries that were left as bare words have been turned into string
# literals; "Farm(s)" and "Dam(s)" are expanded into singular and plural forms.
responses = [
    'renewable', 'green', 'solar', 'clean', 'energy', 'sustainable', 'sustainability', 'carbonneutral',
    'carbon', 'reduction', 'turbine', 'geothermal', 'hydroelectric', 'hydro', 'electricity', 'nuclear',
    'powerplant', 'shale', 'power', 'biodegradable', 'biodiversity', 'panels', 'farm', 'farms',
    'dam', 'dams', 'alternative', 'ecosystem', 'ecological', 'ecofriendly', 'footprint', 'phaseout',
    'decommissioning'
]