In [1]:
import string
import re
import os
import numpy as np
import pandas as pd

from io import StringIO
from html.parser import HTMLParser

In [2]:
# Directory (relative path) that holds the input CSV exports read below;
# the merged Excel/TSV outputs are written to the same place.
DATA_DIR = '../data'

In [3]:
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text content of what it is fed.

    Tags are discarded; every text chunk reported by the parser is appended
    to ``self.text`` and can be read back with :meth:`get_data`.
    """

    def __init__(self):
        # The base constructor already calls reset(), so no explicit reset
        # is needed here. Character references are decoded before they
        # reach handle_data().
        super().__init__(convert_charrefs=True)
        # Legacy flag kept for compatibility with the original attribute set.
        self.strict = False
        # In-memory buffer that accumulates the extracted text.
        self.text = StringIO()

    def handle_data(self, data):
        """Append a chunk of text found between tags to the buffer."""
        self.text.write(data)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()

In [4]:
def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: Value to strip. It is coerced with ``str()`` before parsing,
            so non-string inputs are parsed as their string representation.

    Returns:
        str: The concatenated text found outside of tags, with character
        references decoded.
    """
    stripper = MLStripper()
    stripper.feed(str(html))
    # feed() keeps trailing input buffered when it could still be incomplete
    # (e.g. text ending in "AT&T" or an unfinished "<tag"). close() forces
    # that remainder to be processed; without it such text is silently lost.
    stripper.close()
    return stripper.get_data()

def remove_tabulations(text):
    """Return ``str(text)`` with every CR, tab and LF replaced by one space.

    Each control character is swapped one-for-one, so the length of the
    string is unchanged and consecutive characters yield consecutive spaces.
    """
    to_space = str.maketrans({"\r": ' ', "\t": ' ', "\n": ' '})
    return str(text).translate(to_space)

def clean_text(text):
    """Reduce ``text`` to lower-case words separated by single spaces.

    Steps, in order: strip HTML markup, replace tab/newline characters with
    spaces, lower-case, drop square-bracketed spans, replace ASCII
    punctuation with spaces, and collapse runs of whitespace.

    Args:
        text: Value to clean; ``strip_tags`` coerces it with ``str()``.

    Returns:
        str: The cleaned text.
    """
    # Remove HTML tags
    text = strip_tags(text)
    # Remove tabulation
    text = remove_tabulations(text)
    # convert to lower case
    text = text.lower()
    # Remove square-bracketed spans such as "[1]" (shortest match each time).
    # Raw string: "\[" is not a valid escape in a normal string literal and
    # triggers an invalid-escape warning on recent Python versions.
    text = re.sub(r'\[.*?\]', ' ', text)
    # Replace every ASCII punctuation character with a space
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # normalize whitespace
    text = ' '.join(text.split())
    return text

def clean_text_basic(text):
    """Trim ``text`` and collapse every run of whitespace to a single space.

    Unlike the other cleaners in this notebook, ``text`` is not coerced with
    ``str()``; it must already provide ``strip()`` and ``split()``.
    """
    words = text.strip().split()
    return ' '.join(words)

def clean_text_for_indexing(text):
    """Prepare ``text`` for indexing while keeping case and punctuation.

    HTML markup is stripped, tab/newline characters are replaced with
    spaces, and whitespace runs are collapsed to single spaces.
    """
    without_markup = strip_tags(text)
    single_line = remove_tabulations(without_markup)
    return ' '.join(single_line.split())

In [5]:
# Paths of the three CSV exports inside DATA_DIR.
laws_path, events_path, legislations_path = (
    os.path.join(DATA_DIR, filename)
    for filename in (
        'Climate Laws targets-23_03_2021.csv',
        'events-23_03_2021.csv',
        'legislations-23_03_2021.csv',
    )
)

In [6]:
# Load each export into its own DataFrame. The 'python' parser engine is
# requested explicitly: per the original author's note it avoids a
# UnicodeDecodeError on these files, whose cell values are wrapped in
# double quotes.
laws, events, legislations = (
    pd.read_csv(csv_path, sep=',', engine='python')
    for csv_path in (laws_path, events_path, legislations_path)
)

In [7]:
# Clean the free-text 'Description' column of every table for indexing
# (HTML stripped, tabs/newlines removed, whitespace normalised).
# na_action='ignore' leaves missing descriptions as NaN; passing them to the
# cleaner would run them through str() and store the literal text "nan".
for frame in (laws, events, legislations):
    frame['Description'] = frame['Description'].map(
        clean_text_for_indexing, na_action='ignore'
    )

In [8]:
# Prefix every column with the name of its table so that column names shared
# between tables (e.g. 'Id', 'Description') stay distinct once merged.
# Note: this renames in place, so running the cell twice prefixes twice.
for table_name, frame in (
    ('Laws', laws),
    ('Events', events),
    ('Legislations', legislations),
):
    frame.columns = [table_name + column for column in frame.columns]

In [10]:
# Preview the first rows of the prefixed, cleaned laws table.
laws.head()

Unnamed: 0,LawsId,LawsTarget type,LawsDescription,LawsGhg target,LawsYear,LawsBase year period,LawsSingle year,LawsSource,LawsGeography,LawsGeography iso,LawsSector,LawsConnected law ids,LawsScopes,LawsVisibility status
0,8977,not_applicable,Seaports and airports functioning within one w...,False,2030.0,,False,plan,Dominica,DMA,Disaster Risk Management (Drm),9751,,published
1,8976,not_applicable,90% of the population able to identify the pil...,False,2030.0,,False,plan,Dominica,DMA,Other,9751,,published
2,8975,not_applicable,"100% of national budgeting, policies in place ...",False,2030.0,,False,plan,Dominica,DMA,Public Sector,9751,,published
3,8974,not_applicable,50% increase healthy coral reef coverage to su...,False,2030.0,,False,plan,Dominica,DMA,Adaptation,9751,,published
4,8973,not_applicable,100% of telecommunications restored within thr...,False,,,False,plan,Dominica,DMA,Disaster Risk Management (Drm),9751,,published


In [20]:
# Spot-check the title of the row labelled 0 in the legislations table.
legislations.loc[0, 'LegislationsTitle']

'Decree-Law No. 117/2010, regulating sustainability criteria for production and use of biofuel and bio liquids, amended by Decree-Law 6/2012 and Decree-Law 8/2021'

In [12]:
# Outer join: keep every laws row and every legislations row, pairing them
# where LawsId equals 'LegislationsLaw Id'.
# NOTE(review): the preview of the result shows no legislation columns filled
# for the first rows, while 'LawsConnected law ids' holds a different id
# (9751) - confirm that LawsId is the intended join key.
laws_legislations = laws.merge(
    legislations,
    how='outer',
    left_on=['LawsId'],
    right_on=['LegislationsLaw Id'],
)

In [13]:
# Preview the first rows of the merged table.
laws_legislations.head()

Unnamed: 0,LawsId,LawsTarget type,LawsDescription,LawsGhg target,LawsYear,LawsBase year period,LawsSingle year,LawsSource,LawsGeography,LawsGeography iso,...,LegislationsParent,LegislationsGeography,LegislationsGeography iso,LegislationsSector,LegislationsFrameworks,LegislationsResponses,LegislationsDocument types,LegislationsKeywords,LegislationsNatural hazards,LegislationsVisibility status
0,8977.0,not_applicable,Seaports and airports functioning within one w...,False,2030.0,,False,plan,Dominica,DMA,...,,,,,,,,,,
1,8976.0,not_applicable,90% of the population able to identify the pil...,False,2030.0,,False,plan,Dominica,DMA,...,,,,,,,,,,
2,8975.0,not_applicable,"100% of national budgeting, policies in place ...",False,2030.0,,False,plan,Dominica,DMA,...,,,,,,,,,,
3,8974.0,not_applicable,50% increase healthy coral reef coverage to su...,False,2030.0,,False,plan,Dominica,DMA,...,,,,,,,,,,
4,8973.0,not_applicable,100% of telecommunications restored within thr...,False,,,False,plan,Dominica,DMA,...,,,,,,,,,,


In [14]:
# Save the merged table as an Excel workbook in DATA_DIR, without the row index.
excel_path = os.path.join(DATA_DIR, "LawsLegislation.xlsx")
laws_legislations.to_excel(excel_path, index=False)

In [15]:
# List the merged table's columns (the 'Laws*' ones followed by the 'Legislations*' ones).
laws_legislations.columns

Index(['LawsId', 'LawsTarget type', 'LawsDescription', 'LawsGhg target',
       'LawsYear', 'LawsBase year period', 'LawsSingle year', 'LawsSource',
       'LawsGeography', 'LawsGeography iso', 'LawsSector',
       'LawsConnected law ids', 'LawsScopes', 'LawsVisibility status',
       'LegislationsId', 'LegislationsLaw Id', 'LegislationsTitle',
       'LegislationsLegislation type', 'LegislationsDescription',
       'LegislationsParent', 'LegislationsGeography',
       'LegislationsGeography iso', 'LegislationsSector',
       'LegislationsFrameworks', 'LegislationsResponses',
       'LegislationsDocument types', 'LegislationsKeywords',
       'LegislationsNatural hazards', 'LegislationsVisibility status'],
      dtype='object')

In [19]:
# Save the merged table as a tab-separated file in DATA_DIR, without the row index.
tsv_path = os.path.join(DATA_DIR, "LawsLegislation.tsv")
laws_legislations.to_csv(tsv_path, sep='\t', index=False)