In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import os
from ast import literal_eval
import itertools
from urllib.parse import urlparse

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# 1 Load Data

In [2]:
data_location = "data_round_1"

l_train_documents = pd.read_csv(os.path.join(data_location,"documents_en_train.csv"))
l_validation_documents = pd.read_csv(os.path.join(data_location,"documents_en_val.csv"))
l_test_documents = pd.read_csv(os.path.join(data_location,"documents_en_test.csv"))
        
l_train_sentences = pd.read_csv(os.path.join(data_location,"sentences_en_train.csv"), converters={'sector_ids': literal_eval})
l_validation_sentences = pd.read_csv(os.path.join(data_location,"sentences_en_val.csv"), converters={'sector_ids': literal_eval})
l_test_sentences = pd.read_csv(os.path.join(data_location,"sentences_en_test.csv"), converters={'sector_ids': literal_eval})

In [3]:
display(HTML(l_train_documents.head(2).to_html()))
display(HTML(l_train_sentences.head(2).to_html()))

Unnamed: 0,project_name,country_code,lang_code,doc_id,doc_text,doc_url
0,IMMAP/DFS Syria,SYR,en,48582,"This website uses cookies to improve your experience. We'll assume you're ok with this, but you can opt-out if you wish.Accept",https://www.syriahr.com/en/203844/
1,IMMAP/DFS Syria,SYR,en,41032,Please enable Cookies and reload the page.\n\nThis process is automatic. Your browser will redirect to your requested content shortly.\n\nPlease allow up to 5 seconds…,https://www.syriahr.com/en/187230/


Unnamed: 0,doc_id,sentence_id,sentence_text,is_relevant,sector_ids
0,51657,0,"New Salesian youth center in La Cecilia district serves more than 100 youth (MissionNewswire) Salesian missionaries have opened a youth center in the La Cecilia district of Armenia, Colombia.",0,[]
1,51657,1,Armenia is at the center of the Colombian coffee growing axis.,0,[]


In [4]:
train_documents = l_train_documents.copy().set_index("doc_id")
validation_documents = l_validation_documents.copy().set_index("doc_id")
test_documents = l_test_documents.copy().set_index("doc_id")

#Adding one feature here already because its easier on this data-format
l_train_sentences["sentence_position"] = l_train_sentences["sentence_id"].apply(lambda x: np.log(x+1))
l_validation_sentences["sentence_position"] = l_validation_sentences["sentence_id"].apply(lambda x: np.log(x+1))
l_test_sentences["sentence_position"] = l_test_sentences["sentence_id"].apply(lambda x: np.log(x+1))

train_sentences = l_train_sentences.copy().set_index(["doc_id","sentence_id"])
validation_sentences = l_validation_sentences.copy().set_index(["doc_id","sentence_id"])
test_sentences = l_test_sentences.copy().set_index(["doc_id","sentence_id"])

# 2 Change nominal features to Indices

In [5]:
train_documents["doc_url"].fillna("",inplace=True)
validation_documents["doc_url"].fillna("",inplace=True)
test_documents["doc_url"].fillna("",inplace=True)

In [6]:
project_name_mapping = dict((o,idx) for idx, o in enumerate(set(train_documents["project_name"])))
country_code_mapping = dict((o,idx) for idx, o in enumerate(set(train_documents["country_code"])))
url_set = set(train_documents["doc_url"].apply(lambda x: urlparse(x).netloc))
document_url_mapping = dict((o,idx) for idx, o in enumerate(url_set))

In [7]:
train_documents.replace(project_name_mapping, inplace=True)
validation_documents.replace(project_name_mapping, inplace=True)
test_documents.replace(project_name_mapping, inplace=True)

train_documents.replace(country_code_mapping, inplace=True)
validation_documents.replace(country_code_mapping, inplace=True)
test_documents.replace(country_code_mapping, inplace=True)

In [8]:
train_documents["url"] = train_documents["doc_url"].apply(lambda x: urlparse(x).netloc).replace(document_url_mapping)
validation_documents["url"] = validation_documents["doc_url"].apply(lambda x: urlparse(x).netloc).replace(document_url_mapping)
test_documents["url"] = test_documents["doc_url"].apply(lambda x: urlparse(x).netloc).replace(document_url_mapping)
#Make unknown (from train set) urls a seperate index
for item in validation_documents.iterrows():
    if urlparse(item[1]["doc_url"]).netloc not in url_set:
        validation_documents.loc[item[0], "url"] = len(url_set)
        
for item in test_documents.iterrows():
    if urlparse(item[1]["doc_url"]).netloc not in url_set:
        test_documents.loc[item[0], "url"] = len(url_set)

# 3 Extract some features

In [9]:
#Extract document length
train_documents["text_length"] = train_documents["doc_text"].apply(len).apply(np.log) #the idea behind log is that the net can use it better (but maybe it is wrong?)
validation_documents["text_length"] = validation_documents["doc_text"].apply(len).apply(np.log)
test_documents["text_length"] = test_documents["doc_text"].apply(len).apply(np.log)

#Extract sentence count in document
train_documents["sentence_count"] = train_sentences.groupby(level="doc_id").size().apply(np.log)
validation_documents["sentence_count"] = validation_sentences.groupby(level="doc_id").size().apply(np.log)
test_documents["sentence_count"] = test_sentences.groupby(level="doc_id").size().apply(np.log)

#Extract Sentence Length
train_sentences["sentence_length"] = train_sentences["sentence_text"].apply(len).apply(np.log)
validation_sentences["sentence_length"] = validation_sentences["sentence_text"].apply(len).apply(np.log)
test_sentences["sentence_length"] = test_sentences["sentence_text"].apply(len).apply(np.log)

In [10]:
test_documents

Unnamed: 0_level_0,project_name,country_code,lang_code,doc_text,doc_url,url,text_length,sentence_count
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40328,2,3,en,Cox’s Bazar – The International Organization f...,https://reliefweb.int/report/bangladesh/iom-am...,57,7.983099,2.772589
39775,3,5,en,Introduction The continuation of conflict in N...,https://reliefweb.int/sites/reliefweb.int/file...,57,9.124456,3.496508
47025,0,0,en,"Damascus, SANA- Al Mujtahed Damascus Hospital...",http://sana.sy/en/?p=216501,124,7.271704,1.609438
44256,1,1,en,"Kongoussi, Burkina Faso —Editor's note: In a M...",https://allafrica.com/stories/202010130090.html,7,9.229456,4.110874
35135,0,0,en,Pandemic Also Opportunity for Business Elite t...,,0,8.122074,2.944439
...,...,...,...,...,...,...,...,...
44892,0,0,en,CCCM and Shelter/NFI Clusters’ Fire Prevention...,https://reliefweb.int/sites/reliefweb.int/file...,57,9.627009,4.624973
51667,2,3,en,Bangladesh’s daily infection rate fell slightl...,https://unb.com.bd/category/Bangladesh/covid-1...,96,8.129470,3.367296
40325,2,3,en,Bangladesh is hosting more than one million Ro...,https://reliefweb.int/report/bangladesh/bangla...,57,7.973500,2.995732
34596,0,0,en,July 2020 | Round 2 How COVID-19 compounds alr...,,0,10.176982,4.934474


- Maybe add target encoding (but I am not sure if it is so powerful since we do not have so many nominal classes)

# 4 Tokenize sentences (first 512 tokens) (lowercase) & remove sentence feature

In [11]:
train_sentences["tokenized_sentence"] = train_sentences["sentence_text"].apply(lambda x: tokenizer(x,  max_length=512, truncation="longest_first")["input_ids"])
validation_sentences["tokenized_sentence"] = validation_sentences["sentence_text"].apply(lambda x: tokenizer(x, max_length=512, truncation="longest_first")["input_ids"])
test_sentences["tokenized_sentence"] = test_sentences["sentence_text"].apply(lambda x: tokenizer(x, max_length=512, truncation="longest_first")["input_ids"])

train_sentences.drop("sentence_text", axis="columns", inplace= True)
validation_sentences.drop("sentence_text", axis="columns", inplace= True)
test_sentences.drop("sentence_text", axis="columns", inplace= True)

# 5 Remove unnecessary features

In [12]:
train_documents.drop("lang_code",axis="columns", inplace = True)
validation_documents.drop("lang_code",axis="columns", inplace = True)
test_documents.drop("lang_code",axis="columns", inplace = True)

In [13]:
train_documents.drop("doc_text",axis="columns", inplace = True)
validation_documents.drop("doc_text",axis="columns", inplace = True)
test_documents.drop("doc_text",axis="columns", inplace = True)

In [14]:
train_documents.drop("doc_url",axis="columns", inplace = True)
validation_documents.drop("doc_url",axis="columns", inplace = True)
test_documents.drop("doc_url",axis="columns", inplace = True)

In [15]:
display(HTML(train_documents.head(2).to_html()))
display(HTML(train_sentences.head(2).to_html()))

Unnamed: 0_level_0,project_name,country_code,url,text_length,sentence_count
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
48582,0,0,90,4.836282,0.693147
41032,0,0,90,5.09375,1.386294


Unnamed: 0_level_0,Unnamed: 1_level_0,is_relevant,sector_ids,sentence_position,sentence_length,tokenized_sentence
doc_id,sentence_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
51657,0,0,[],0.0,5.252273,"[101, 2047, 4341, 2937, 3360, 2415, 1999, 2474, 18459, 2212, 4240, 2062, 2084, 2531, 3360, 1006, 3260, 2638, 9333, 20357, 1007, 4341, 2937, 11743, 2031, 2441, 1037, 3360, 2415, 1999, 1996, 2474, 18459, 2212, 1997, 10110, 1010, 7379, 1012, 102]"
51657,1,0,[],0.693147,4.127134,"[101, 10110, 2003, 2012, 1996, 2415, 1997, 1996, 13598, 4157, 3652, 8123, 1012, 102]"


# 6 Normalize (Numerical) Features

In [18]:
train_document_text_length_mean = train_documents["text_length"].mean()
train_document_sentence_count_mean = train_documents["sentence_count"].mean()
train_sentence_position_mean = train_sentences["sentence_position"].mean()
train_sentence_length_mean = train_sentences["sentence_length"].mean()

train_document_text_length_std = train_documents["text_length"].std()
train_document_sentence_count_std = train_documents["sentence_count"].std()
train_sentence_position_std = train_sentences["sentence_position"].std()
train_sentence_length_std = train_sentences["sentence_length"].std()

In [19]:
train_documents["text_length"] = (train_documents["text_length"] - train_document_text_length_mean) / train_document_text_length_std
validation_documents["text_length"] = (validation_documents["text_length"] - train_document_text_length_mean) / train_document_text_length_std
test_documents["text_length"] = (test_documents["text_length"] - train_document_text_length_mean) / train_document_text_length_std

train_documents["sentence_count"] = (train_documents["sentence_count"] - train_document_sentence_count_mean) / train_document_sentence_count_std
validation_documents["sentence_count"] = (validation_documents["sentence_count"] - train_document_sentence_count_mean) / train_document_sentence_count_std
test_documents["sentence_count"] = (test_documents["sentence_count"] - train_document_sentence_count_mean) / train_document_sentence_count_std

train_sentences["sentence_position"] = (train_sentences["sentence_position"] - train_sentence_position_mean) / train_sentence_position_std
validation_sentences["sentence_position"] = (validation_sentences["sentence_position"] - train_sentence_position_mean) / train_sentence_position_std
test_sentences["sentence_position"] = (test_sentences["sentence_position"] - train_sentence_position_mean) / train_sentence_position_std

train_sentences["sentence_length"] = (train_sentences["sentence_length"] - train_sentence_length_mean) / train_sentence_length_std
validation_sentences["sentence_length"] = (validation_sentences["sentence_length"] - train_sentence_length_mean) / train_sentence_length_std
test_sentences["sentence_length"] = (test_sentences["sentence_length"] - train_sentence_length_mean) / train_sentence_length_std

# 7 Join together and save

In [20]:
train_joint = train_sentences.join(train_documents, on="doc_id")
validation_joint = validation_sentences.join(validation_documents, on="doc_id")
test_joint = test_sentences.join(test_documents, on="doc_id")
train_joint

Unnamed: 0_level_0,Unnamed: 1_level_0,is_relevant,sector_ids,sentence_position,sentence_length,tokenized_sentence,project_name,country_code,url,text_length,sentence_count
doc_id,sentence_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
51657,0,0,[],-2.396411,0.569648,"[101, 2047, 4341, 2937, 3360, 2415, 1999, 2474...",5,4,119,-0.605825,-0.295492
51657,1,0,[],-2.077704,-0.124752,"[101, 10110, 2003, 2012, 1996, 2415, 1997, 199...",5,4,119,-0.605825,-0.295492
51657,2,0,[],-1.891273,0.217774,"[101, 1996, 2047, 3360, 2415, 2038, 2366, 2062...",5,4,119,-0.605825,-0.295492
51657,3,0,[],-1.758998,0.336929,"[101, 1996, 4341, 2937, 3360, 2415, 2001, 2764...",5,4,119,-0.605825,-0.295492
51657,4,0,[],-1.656397,-0.076886,"[101, 2116, 1997, 2122, 3360, 2272, 2013, 3532...",5,4,119,-0.605825,-0.295492
...,...,...,...,...,...,...,...,...,...,...,...
34512,121,0,[],-0.187538,-0.058732,"[101, 2174, 1010, 11470, 19621, 2015, 2024, 20...",0,0,0,1.055759,1.123685
34512,122,0,[],-0.183785,0.424618,"[101, 1999, 5712, 1010, 2045, 2024, 4311, 1997...",0,0,0,1.055759,1.123685
34512,123,0,[],-0.180062,-0.058732,"[101, 1996, 9353, 9331, 2015, 2136, 2097, 2562...",0,0,0,1.055759,1.123685
34512,124,0,[],-0.176368,-0.095589,"[101, 2017, 2064, 2424, 2019, 19184, 1997, 203...",0,0,0,1.055759,1.123685


In [21]:
train_joint.to_hdf(os.path.join("preprocessed_data", "train_joint.h5"), key='s')
validation_joint.to_hdf(os.path.join("preprocessed_data", "validation_joint.h5"), key='s')
test_joint.to_hdf(os.path.join("preprocessed_data", "test_joint.h5"), key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['sector_ids', 'tokenized_sentence'], dtype='object')]

  pytables.to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->Index(['sector_ids', 'tokenized_sentence', 'url'], dtype='object')]

  pytables.to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->Index(['tokenized_sentence', 'url'], dtype='object')]

  pytables.to_hdf(


In [22]:
train_sentences.to_hdf(os.path.join("preprocessed_data", "train_sentences.h5"), key='s')
validation_sentences.to_hdf(os.path.join("preprocessed_data", "validation_sentences.h5"), key='s')
test_sentences.to_hdf(os.path.join("preprocessed_data", "test_sentences.h5"), key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['sector_ids', 'tokenized_sentence'], dtype='object')]

  pytables.to_hdf(
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['tokenized_sentence'], dtype='object')]

  pytables.to_hdf(


In [23]:
train_documents.to_hdf(os.path.join("preprocessed_data", "train_documents.h5"), key='s')
validation_documents.to_hdf(os.path.join("preprocessed_data", "validation_documents.h5"), key='s')
test_documents.to_hdf(os.path.join("preprocessed_data", "test_documents.h5"), key='s')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block1_values] [items->Index(['url'], dtype='object')]

  pytables.to_hdf(
