## Bottom-up Analysis


In [9]:
import pandas as pd
import tqdm
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import PyPDF2
from nltk.corpus import stopwords

In [3]:

def get_text_from_pdf(countries_legend, country, pg_from, pg_to, dimension, save_txt = False):
    """
    Function get_text_from_pdf
    Returns the concatenated text of a page range of one country's pdf.

    countries_legend: DataFrame with 'country' and 'doc_name' columns; the pdf read is
                      'Energy National Plans/<doc_name>.pdf' for the first row matching `country`.
    country:          value looked up in countries_legend['country'].
    pg_from:          first page to read (1-based).
    pg_to:            1-based page at which reading stops; this page itself is NOT read.
    dimension:        used only to build the output folder 'output/<dimension>_txt/'.
    save_txt:         when True, the text is also written to 'output/<dimension>_txt/<country>.txt'.

    Raises IndexError if `country` is not present in countries_legend or the page range
    exceeds the document.
    """
    doc_name = countries_legend.loc[countries_legend['country'] == country, 'doc_name'].iloc[0]

    # Convert the 1-based page numbers to 0-based indices; the end index is exclusive.
    first_idx = pg_from - 1
    stop_idx = pg_to - 1

    page_texts = []
    # The reader pulls pages lazily from the open handle, so extract inside the `with`
    # block; the handle is then closed even if extraction fails.
    with open("Energy National Plans/" + doc_name + ".pdf", "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_idx in range(first_idx, stop_idx):
            page_texts.append(pdf_reader.pages[page_idx].extract_text())
    all_text = ''.join(page_texts)

    if save_txt:
        with open('output/' + dimension + '_txt/' + country + ".txt", 'w', encoding="utf-8") as txt:
            txt.write(all_text)
    return all_text


In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Function extract_text_from_pdf
    Returns the text of every page of the pdf at pdf_path, concatenated in page order
    """
    with open(pdf_path, "rb") as pdf_handle:
        reader = PyPDF2.PdfReader(pdf_handle)
        # Pages are read while the file is still open.
        chunks = [reader.pages[idx].extract_text() for idx in range(len(reader.pages))]

    return "".join(chunks)

def read_from_pdf(files, countries):
    """
    Function read_from_pdf
    Returns a list containing all NECP texts

    files:     pdf file names inside 'Energy National Plans/'.
    countries: labels used to name the saved copies; files[i] is written to
               'output/Full_txt/<countries[i]>.txt', so both lists must be in the same order.
    """
    texts = []
    for i, file_name in tqdm.tqdm(enumerate(files), total=len(files), desc='Reading pdf file'):
        t = extract_text_from_pdf('Energy National Plans/' + file_name)
        texts.append(t)

        # `with` guarantees the output file is closed even if the write fails.
        with open('output/Full_txt/' + countries[i] + ".txt", 'w', encoding="utf-8") as txt:
            txt.write(t)

    return texts

def read_by_section(dimension, countries_legend, countries):
    """
    Function read_by_section
    Returns, for each country, the text of the NECP section delimited in countries_legend.
    countries_legend.xlsx must be filled out before execution: its 'page_from' and 'page_to'
    columns tell the algorithm which pages to read (page_to is included).
    """
    def _legend_page(country, column):
        # First legend row for this country, cast to a plain int.
        matches = countries_legend.loc[countries_legend['country'] == country, column]
        return int(matches.iloc[0])

    section_texts = []
    for name in countries:
        first_page = _legend_page(name, 'page_from')
        # get_text_from_pdf stops before its end page, hence the + 1.
        stop_page = _legend_page(name, 'page_to') + 1
        section_texts.append(
            get_text_from_pdf(countries_legend, name, first_page, stop_page, dimension, save_txt = False)
        )

    return section_texts

def clean_text(text):
    """ 
    Function clean_text
    Returns a sentence without extra-spaces, punctuation signs, etc

    The input is converted with str(); the result is lowercase, contains only the
    letters a-z separated by single spaces, and drops one-letter words.
    """
    # Turn tabs AND newlines into spaces. Deleting newlines outright would glue the
    # last word of a line to the first word of the next ("energy\nsecurity").
    text = str(text).replace('\t', ' ').replace('\n', ' ')
    # Replace every character except A-Z / a-z and the dot with a space
    # (raw string: "\." is an invalid escape in a normal string literal).
    text = re.sub(r"[^a-zA-Z.]", " ", text)
    # Dots are removed without inserting a space, so "e.g." becomes "eg"
    text = text.replace(".", "")
    # Normalize spaces to 1
    text = re.sub(" +", " ", text)
    # Strip trailing and leading spaces
    text = text.strip()
    # Normalize all characters to lowercase
    text = text.lower()

    # Filter out words with less than 2 characters
    return ' '.join(word for word in text.split() if len(word) >= 2)

def norm_text(text):
    """
    Function norm_text
    Takes a list of sentences and returns a list of the same length in which every
    sentence has been word-tokenized, lemmatized word by word and re-joined with spaces
    """
    lemmatizer = WordNetLemmatizer()
    return [
        ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(sentence))
        for sentence in text
    ]


### 1- Read text and create clean corpus

#### 1.1- Select the method used to extract the information from the NECPs. Options:
 - 1: Read a single dimension from all plans (its name must match a sheet (tab) name in countries_legend.xlsx)
 - 2: Read all sections of the NECPs and store them in the folder **output/Full_txt/**
 - 3: If option 2 has been executed previously, just read the output stored in that folder
 

In [6]:
# Country labels, one per plan. They are matched against the 'country' column of the
# legend sheet and used as output file names.
# NOTE(review): spellings such as 'Bulgary' / 'Luxemburg' presumably mirror the keys
# stored in countries_legend.xlsx -- confirm before correcting them here.
countries = ['Austria','Belgium','Bulgary','Croatia','Cyprus','Czechia','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Ireland',
'Italy','Latvia','Lithuania','Luxemburg','Malta','Netherlands','Poland','Portugal','Romania','Slovakia','Slovenia','Spain','Sweden']

# Every pdf in the plans folder (only consumed by option 2).
# NOTE(review): os.listdir() returns names in arbitrary order, while read_from_pdf pairs
# files[i] with countries[i] -- verify the ordering (e.g. sort) before using option 2.
files = [file for file in os.listdir('Energy National Plans/') if file.lower().endswith('.pdf')]

# 1- Read by Dimension: the sheet named `dimension` supplies doc_name / page_from / page_to per country
dimension = 'Energy_security_1'
countries_legend = pd.read_excel("countries_legend.xlsx", sheet_name = dimension)
texts = read_by_section(dimension, countries_legend, countries)

# 2- Read from pdf (full documents; also writes output/Full_txt/<country>.txt)
# texts = read_from_pdf(files, countries)

# 3- Read from txt (re-use the files written by option 2)
# texts = []
# files = os.listdir('output/Full_txt/')
# for file in files:
#     with open('output/Full_txt/' + file, 'r', encoding="utf-8") as f:
#         texts.append(f.read())

Get the stop words of the English language. Next, split each text into sentences, then clean, normalize (lemmatize) and word-tokenize every sentence. The output must be a list in which each element is one sentence tokenized into words.

In [10]:
stop_words = set(stopwords.words('english'))

# all_corpus[i] is the list of cleaned + lemmatized sentences of texts[i].
# The pipeline runs only once per document (it used to be executed twice to build
# two lists with identical content).
all_corpus = []
for text in texts:
    corpus_clean = [clean_text(sen) for sen in sent_tokenize(text)]
    # Keep only sentences with more than 10 tokens
    corpus_clean = [sentence for sentence in corpus_clean if len(word_tokenize(sentence)) > 10]
    all_corpus.append(norm_text(corpus_clean))

# input_data: every sentence of every document as a list of word tokens (documents flattened)
input_data = [word_tokenize(sentence) for corpus_norm in all_corpus for sentence in corpus_norm]

# data: same content as input_data, kept as an independent copy under its original name
data = [list(words) for words in input_data]

In [13]:
import re, numpy as np, pandas as pd
from pprint import pprint
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords

In [15]:
# Build the bigram and trigram models
# NOTE: from here on `texts` is the list of tokenized sentences, no longer the raw documents.
texts = input_data
bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[texts], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
stop_words = set(stopwords.words('english'))

# spaCy pipelines already loaded, keyed by model name (loading is slow, so do it once).
_SPACY_MODELS = {}

def _get_spacy_model(name="en_core_web_sm"):
    """Return the spaCy pipeline `name`, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]

def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization

    texts:           iterable of documents (token lists or strings; each is passed through str()).
    stop_words:      words to drop, both before phrase detection and after lemmatization.
    allowed_postags: spaCy coarse POS tags to keep.
    Returns one list of lemmas per input document, in the same order.
    """
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    nlp = _get_spacy_model("en_core_web_sm")
    # nlp.pipe processes the documents as a stream and yields them in input order.
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(" ".join(sent) for sent in texts)
    ]

    # Lemmatization can re-introduce stop words, so filter once more
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

data_ready = process_words(texts)


### 2- Implement wordcloud

In [None]:
from wordcloud import WordCloud

# Select number of country (index into `countries` / `all_corpus`)
n_country = 25 

# Process the whole country ONCE; calling process_words inside the comprehension
# re-ran the full pipeline for every sentence.
country_tokens = process_words(all_corpus[n_country])
foo = [' '.join(tokens) for tokens in country_tokens]

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(foo))

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### 3- Compute LDA model

#### 3.1- Find the optimal number of topics (candidates: 2 to 5)

In [None]:
# Vocabulary (token <-> id) and bag-of-words representation of every sentence
id2word = corpora.Dictionary(data_ready)
corpus = [id2word.doc2bow(text) for text in data_ready]

# Candidate numbers of topics, shared by the training loop and the plot
x = range(2, 6, 1)

coherence_values = []
model_list = []
for num_topics in x:
    model = gensim.models.ldamodel.LdaModel(
        corpus = corpus,
        id2word = id2word,
        num_topics = num_topics,
        random_state = 100,
        update_every = 1,
        chunksize = 10,
        passes = 10,
        alpha = 'symmetric',
        iterations = 100,
        per_word_topics = True,
    )
    model_list.append(model)

    # c_v coherence of this model, measured on the tokenized sentences
    coherencemodel = CoherenceModel(model = model, texts = data_ready, dictionary = id2word, coherence = 'c_v')
    coh = coherencemodel.get_coherence()
    coherence_values.append(coh)
    print(coh)

plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.show()

#### 3.2- Select the number of topics and execute the LDA model

In [None]:
##### SELECT N_TOPICS #####
# Number of topics of the final model; later cells read this value.
N_topics = 3
##### SELECT N_TOPICS #####

# Build LDA model
# Dictionary and bag-of-words corpus are rebuilt here so this cell does not depend on the coherence scan above.
id2word = corpora.Dictionary(data_ready)
corpus = [id2word.doc2bow(text) for text in data_ready]

# Same hyper-parameters as in the coherence scan; random_state is fixed for reproducibility.
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                           id2word = id2word,
                                           num_topics = N_topics, 
                                           random_state = 100,
                                           update_every = 1,
                                           chunksize = 10,
                                           passes = 10,
                                           alpha = 'symmetric',
                                           iterations = 100,
                                           per_word_topics = True)

# Show the keywords (with weights) of every topic
pprint(lda_model.print_topics())

In [None]:
# One line per topic: its id followed by the weighted keyword string.
# The label was mojibake ("TÃ³pico"): UTF-8 text decoded as Latin-1.
for idx, topic in lda_model.print_topics():
    print(f"Tópico {idx}:", topic)

In [None]:
# For every sentence (row of `corpus`) keep its dominant topic, that topic's share,
# and the topic keywords. Rows are collected in a list and the frame is built once;
# growing a DataFrame with pd.concat inside the loop is quadratic.
keywords_by_topic = {}   # topic id -> "kw1, kw2, ..." (computed once per topic)
topic_rows = []
for row_list in lda_model[corpus]:
    # With per_word_topics=True each item is a tuple whose first element is the
    # list of (topic id, probability) pairs.
    row = row_list[0] if lda_model.per_word_topics else row_list
    if not row:
        continue
    # First pair with the highest probability (same pick as a stable descending sort)
    topic_num, prop_topic = max(row, key=lambda x: x[1])
    if topic_num not in keywords_by_topic:
        wp = lda_model.show_topic(topic_num)
        keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
    topic_rows.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

sent_topics_df = pd.DataFrame(topic_rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

# Attach the tokenized sentence as a last column (its column label is 0)
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

df_topic_sents_keywords = sent_topics_df

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_dominant_topic.head(5)

#### 3.3- Implement topic distribution analysis

In [None]:
# Label every sentence row with its country: all_corpus[i] holds the sentences of
# countries[i], and the rows of df_dominant_topic follow that same order.
aux = [countries[i] for i in range(len(countries)) for _ in all_corpus[i]]
df_dominant_topic['aux'] = aux
distributions = pd.DataFrame()

# distributions[str(topic)][k] = share of countries[k]'s sentences whose dominant topic is `topic`
topics = range(0, N_topics, 1)
for topic in topics:
    is_topic = df_dominant_topic['Dominant_Topic'] == topic
    nss = []
    for country in countries:
        is_country = df_dominant_topic['aux'] == country
        len1 = int(is_country.sum())
        len2 = int((is_country & is_topic).sum())
        # A country with no sentences has no defined share (avoids ZeroDivisionError)
        nss.append(len2/len1 if len1 else float('nan'))
    distributions[str(topic)] = nss


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Long-format data for the plot: one (discourse label, score) pair per country and topic.
# Built for any N_topics and any number of countries (previously hard-coded to
# 2-4 topics and 27 countries).
ordinal_names = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth']
discursos = []
scores = []
for k in range(N_topics):
    label = ordinal_names[k] if k < len(ordinal_names) else 'topic ' + str(k + 1)
    topic_scores = distributions[str(k)].to_list()
    discursos += [label] * len(topic_scores)
    scores += topic_scores

df = pd.DataFrame({'Discourses': discursos, 'scores': scores})

fig, ax = plt.subplots()
palette = sns.color_palette("Set2")
# Violin = distribution shape; the narrower box plot drawn on top shows median and quartiles
sns.violinplot(x="Discourses", y="scores", data=df, ax=ax, linewidth=.8, inner='point', palette='muted', )
sns.boxplot(x="Discourses", y="scores", data=df, ax=ax, width=0.3, linewidth=.8, palette='bright')

ax.set_ylabel("Probability")
ax.set_title("Topic probabilities among EU member states")
plt.show()

In [None]:
# From this point on: experiments / optional features that could be developed further

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For each topic keep the 2 sentences with the highest contribution. The pieces are
# concatenated in a single call instead of being appended to an empty DataFrame.
top_rows_per_topic = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(2)
    for _, grp in sent_topics_outdf_grpd
]

# Concatenate and reset index
sent_topics_sorteddf_mallet = pd.concat(top_rows_per_topic, axis=0).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# color_func reads the loop variable `i` when a cloud is generated, so every topic
# is drawn in its own colour.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

# Two panels per row, as many rows as needed. A fixed 2x2 grid raised IndexError
# whenever the model had fewer than 4 topics.
n_rows = max(1, -(-len(topics) // 2))
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 5 * n_rows), sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        # Spare panel when the number of topics is odd: leave it blank
        continue
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
from collections import Counter
topics = lda_model.show_topics(formatted=False)
# Corpus-wide frequency of every processed word
data_flat = [w for w_list in data_ready for w in w_list]
counter = Counter(data_flat)

# One row per (topic, keyword): weight inside the topic and count in the corpus
out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i , weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])        

# Plot Word Count and Weights of Topic Keywords
# Two panels per row, as many rows as there are topics (a fixed 2x2 grid left an
# empty panel for 3 topics and dropped topics beyond the fourth).
n_rows = max(1, -(-len(topics) // 2))
fig, axes = plt.subplots(n_rows, 2, figsize=(16, 5 * n_rows), sharey=True, dpi=160, squeeze=False)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    if i >= len(topics):
        # Spare panel when the number of topics is odd
        ax.set_visible(False)
        continue
    topic_df = df.loc[df.topic_id==i, :]
    # Wide translucent bars: corpus counts (left axis); narrow bars: topic weights (right axis)
    ax.bar(x='word', height="word_count", data=topic_df, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_df, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    # ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

In [None]:
# Interactive topic browser for the fitted LDA model.
# NOTE(review): newer pyLDAvis releases expose this module as `pyLDAvis.gensim_models`;
# confirm which name matches the installed version.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# Uses the model's own dictionary together with the bag-of-words corpus built above
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

In [None]:
#load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as mlines
import matplotlib.ticker as mtick
import matplotlib.gridspec as grid_spec
from matplotlib.patches import Rectangle

import pandas as pd
import statsmodels.api as sm

# import pyarrow.parquet as pq
# import pyarrow as pa


import os

import itertools
import collections

#---NLP packages--------------------
import nltk
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.util import ngrams

#----process string-------
import string
import re

#---network visualization-----------
import re
import networkx as nx


import warnings
warnings.filterwarnings("ignore")

In [None]:
# # alternative: use a single country only
# input_data = []
# for j in range(len(all_corpus[25])):
#     words = word_tokenize(all_corpus[25][j])
#     input_data.append(words)

In [None]:
# Same sentences as input_data, with English stop words removed token by token
stop_words = set(stopwords.words('english'))
data = [
    [token for token in sentence if token not in stop_words]
    for sentence in input_data
]

In [None]:
# Every unordered pair of distinct words that share a sentence, stored as
# [smaller, larger]. Both branches of the old if/else appended the same order, so
# (a, b) and (b, a) were counted as different pairs depending on set iteration order.
word_pairs = []
for words in data:
    words_ = sorted(set(words))
    # combinations() over a sorted list yields each pair once, already ordered
    word_pairs.extend([word_i, word_j] for word_i, word_j in itertools.combinations(words_, 2))

In [None]:
# Count how often each pair occurs and keep the 300 most frequent ones
# (ascending order; after reset_index the count column is labelled 0)
df_word_pairs = pd.DataFrame(data = word_pairs, columns=['word1', 'word2'])
word_pairs_count = (
    df_word_pairs
    .groupby(['word1', 'word2'])
    .size()
    .sort_values()
    .tail(300)
    .reset_index()
)
word_pairs_count.head(5)

In [None]:
# Consecutive word pairs (bigrams) of every stop-word-free sentence
terms_bigram = []
for words in data:
    terms_bigram.append(list(bigrams(words)))

# Peek at the first five bigrams of the first sentence
print('View bigrams for the first assay')
print(terms_bigram[0][:5])

# One flat list with the bigrams of all sentences
bigram_list = list(itertools.chain.from_iterable(terms_bigram))

# Frequency of every bigram
bigram_counts = collections.Counter(bigram_list)

bigram_counts.most_common(20)

In [None]:
# Consecutive word triples (3-grams) of every stop-word-free sentence
terms_3gram = []
for words in data:
    terms_3gram.append(list(ngrams(words, 3)))

# Peek at the first five 3-grams of the first sentence
print('View N-grams (N=3) for the first assay')
print(terms_3gram[0][:5])


# One flat list with the 3-grams of all sentences
gram3_list = list(itertools.chain.from_iterable(terms_3gram))

# Frequency of every 3-gram
gram3_counts = collections.Counter(gram3_list)

gram3_counts.most_common(20)

In [None]:
# Create network plot 
# Graph of the 30 most frequent bigrams: words are nodes, each bigram is an edge.
G = nx.Graph()

# Create connections between nodes
# v is ((word1, word2), count); the edge weight stored is count * 10
for v in bigram_counts.most_common(30):
    G.add_edge(v[0][0], v[0][1], weight=(v[1] * 10))
fig, ax = plt.subplots(figsize=(18, 10))

# Force-directed layout; no seed is passed, so positions can differ between runs
pos = nx.spring_layout(G, k=8)

# Node degree, used below to scale node sizes
d = dict(nx.degree(G))
edges = G.edges()
# Edge width = stored weight / 1000 (i.e. bigram count / 100)
weights = [G[u][v]['weight']/1000 for u,v in edges]
# Plot networks
nx.draw_networkx(G, pos,
                 font_size=16,
                 width=weights,
                 node_size = [v * 200 for v in d.values()], 
                 edge_color='grey',
                 #node_color='tomato',
                 with_labels = True,
                 ax=ax)

ax.set_title('Bigram Network', 
             fontdict={'fontsize': 26,
            'fontweight': 'bold',
            'color': 'salmon', 
            'verticalalignment': 'baseline',
            'horizontalalignment': 'center'}, 
             loc='center')    
plt.show()

In [None]:
# Create network plot 
# Graph of the most frequent sentence-level word pairs held in word_pairs_count.
G = nx.Graph()


# row[0] is the pair count: reset_index() left that column with the label 0
for _, row in word_pairs_count.iterrows():
    G.add_edge(row['word1'], row['word2'], weight=row[0])
    
pos_kkl = nx.kamada_kawai_layout(G)
f, ax = plt.subplots(figsize=(16, 16))


# Node degree, used below to scale node sizes
d = dict(nx.degree(G))
edges = G.edges()
# Edge width = pair count / 1000
weights = [G[u][v]['weight']/1000 for u,v in edges]

# No ax= is passed, so nx.draw uses the current axes (the one just created above)
nx.draw(G, pos_kkl, 
        with_labels=True, 
        node_size=[v * 100 for v in d.values()],
        nodelist=d.keys(),  
        width=weights, 
        edge_color='grey', #node_color=list(df_skills_stats['core_number']), cmap="coolwarm_r", 
        alpha=0.9,
       )
#node_labels = nx.draw_networkx_labels(G, pos_kkl, labels, font_size=10)
# Set title
ax.set_title('Word Co-occurrence Network', 
             fontdict={'fontsize': 26,
            'fontweight': 'bold',
            'color': 'salmon', 
            'verticalalignment': 'baseline',
            'horizontalalignment': 'center'}, 
             loc='center')
# Set edge color
# collections[0] is the first artist collection on the axes -- presumably the node markers; verify
plt.gca().collections[0].set_edgecolor("#000000")

In [None]:
# df_dominant_topic.head(5)

In [None]:
# Number of sentences per country whose dominant topic is 0, 1 and 2.
# Rows of df_dominant_topic are grouped by country in the order of `countries`,
# so each country occupies the slice [n1:n2].
n1 = 0
primer = []   # count for topic 0
segon  = []   # count for topic 1
tercer = []   # count for topic 2
# quart  = []

for i in range(len(countries)):
    n2 = n1 + len(all_corpus[i])
    df = df_dominant_topic.iloc[n1:n2]
    df1 = df['Dominant_Topic'].value_counts()
    # A topic missing from value_counts() never dominated in this country -> 0
    primer.append(df1[0] if 0 in df1 else 0)
    segon.append(df1[1] if 1 in df1 else 0)
    tercer.append(df1[2] if 2 in df1 else 0)
    # quart.append(df1[3])  if 3 in df['Dominant_Topic'].value_counts() else quart.append(0)
    n1 = n2

In [None]:
# Summary table: one row per country, one column per topic with its sentence count.
# (4-topic variant kept for reference)
# dataframe_resum = pd.DataFrame({'country': countries, 'Discurs1': primer, 'Discurs2': segon, 'Discurs3': tercer, 'Discurs4': quart})
dataframe_resum = pd.DataFrame({'country': countries, 'Discurs1': primer, 'Discurs2': segon, 'Discurs3': tercer})

dataframe_resum

In [None]:
# First 10000 (bigram, count) items of the counter, in insertion order. islice stops
# early when there are fewer items (the index loop raised IndexError) and avoids
# rebuilding the full item list on every iteration.
keys = list(itertools.islice(bigram_counts.items(), 10000))

In [None]:
# Split the (bigram, count) items into two parallel lists
a = [item[0] for item in keys]   # the bigram tuples
b = [item[1] for item in keys]   # their counts

In [None]:
# Table of bigrams and their counts, most frequent first (note: re-uses the name `df`)
df = pd.DataFrame({'bigram': a, 'num': b})
df = df.sort_values(by='num', ascending=False)
df.head(5)

In [None]:
# Print every bigram that contains the exact word 'europe', with its count.
# `in` is tested against the bigram tuple, so this is whole-word matching.
for pair, count in zip(df['bigram'], df['num']):
    if 'europe' in pair:
        print(pair, count)

In [None]:
# Plain-dict copy of the bigram counter, split into parallel key / value lists.
# Note: `a` and `keys` are re-used here with a different meaning than in the cells above.
a = dict(bigram_counts)

keys = list(a.keys())
values = list(a.values())

In [None]:
# Inspect the first 1000 bigrams (or fewer if the list is shorter; the fixed
# range(1000) raised IndexError in that case) and print those containing the exact word 'gas'.
for i in range(min(1000, len(keys))):
    if 'gas' in keys[i]:
        print(keys[i])
        print(values[i])
    # if 'gas' in str(keys[i]):
    #     print(keys[i])
    #     print(values[i])


In [None]:
# Plain-dict copy of the 3-gram counter, split into parallel key / value lists.
# Note: this overwrites `b`, `keys` and `values` from the bigram cells above.
b = dict(gram3_counts)

keys = list(b.keys())
values = list(b.values())

In [None]:
# Inspect the first 1000 3-grams (or fewer if the list is shorter; the fixed
# range(1000) raised IndexError in that case). The test is a substring match on the
# tuple's text, so words that merely contain 'gas' match too.
for i in range(min(1000, len(keys))):
    # if 'gas' in keys[i]:
    #     print(keys[i])
    #     print(values[i])
    if 'gas' in str(keys[i]):
        print(keys[i])
        print(values[i])


In [None]:
# Collect every bigram whose text contains 'gas' (substring match on the tuple's string form,
# so e.g. words that merely contain "gas" also match), then count the individual words
# appearing in those bigrams.
# NOTE(review): the loop variable `bigram` overwrites the gensim Phrases model of the same name,
# and `bigram_list` / `bigram_counts` are overwritten below with WORD-level data. Cells above that
# read these names will behave differently if re-run after this cell -- consider distinct names.
s = []
for bigram in list(a):
    if 'gas' in str(bigram):
        s.append(bigram)

# Flatten the selected bigrams into a single list of words
bigram_list = list(itertools.chain(*s))

# Count how often each word occurs among the 'gas' bigrams
bigram_counts = collections.Counter(bigram_list)

bigram_counts.most_common()