In [2]:
from transformers import AutoModel, AutoTokenizer
from hydra import compose, initialize
from omegaconf import DictConfig
from src.data.preprocess import remove_not_needed_elements_from_string
from transformers import pipeline
from tqdm.notebook import tqdm
import pandas as pd
from geopy.geocoders import Nominatim

2022-10-05 14:12:06.679652: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-05 14:12:06.715852: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-10-05 14:12:06.715872: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [4]:
# Compose the Hydra config found in ../conf (relative to this notebook).
with initialize(version_base=None, config_path="../conf"):
    cfg: DictConfig = compose(config_name="config")

# The configured path is joined onto "../" because the notebook lives one
# directory below the location the path is expressed against.
path_to_data = cfg.supervisor.processed
df = pd.read_csv("../" + path_to_data)

# Quick sanity check: how many tweets are labelled as mentioning a location?
n_location_tweets = int((df["mentions_location"] == 1).sum())
n_tweets = len(df)
print(f"Number of tweets that explicity mentions locations\n{n_location_tweets}/{n_tweets}")

Number of tweets that explicity mentions locations
591/5035


In [5]:
# Base Swedish BERT tokenizer and encoder.
# NOTE(review): `tok` and `model` are not referenced by any later cell visible
# in this notebook; kept in case other code relies on them.
tok = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased")
model = AutoModel.from_pretrained("KB/bert-base-swedish-cased")

# Token-level NER pipeline; the same checkpoint provides model and tokenizer.
ner_checkpoint = "KB/bert-base-swedish-cased-ner"
nlp = pipeline("ner", model=ner_checkpoint, tokenizer=ner_checkpoint)

Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def get_location_entities(text: str):
    """Run NER on ``text`` and return the entities tagged ``LOC``.

    The NER pipeline emits one item per WordPiece token. Continuation pieces
    (words starting with ``"##"``) are glued back onto the token they continue
    so that e.g. ``"Malm"`` + ``"##bäck"`` becomes ``"Malmbäck"``.

    A continuation piece is only merged when there is a previous kept token
    AND (when the pipeline provides ``index``) that token is the directly
    preceding WordPiece. Orphan pieces -- the pipeline sometimes returns
    ``"##..."`` items whose initial piece was not tagged -- are skipped
    instead of being attached to an unrelated earlier word.

    Args:
        text: Raw tweet text.

    Returns:
        A list of single-item dicts ``{word: score}``, one per location
        entity. The score is that of the entity's first WordPiece.
    """
    # NOTE(review): presumably this helper strips noise from the tweet; an
    # earlier comment here said "Remove stopwords", but no stopword removal
    # happens in this function -- confirm against src.data.preprocess.
    text = remove_not_needed_elements_from_string(text)
    tokens = nlp(text)

    merged_tokens = []
    last_piece_index = None  # pipeline `index` of the last piece merged/kept
    for token in tokens:
        word = token["word"]
        piece_index = token.get("index")
        if word.startswith("##"):
            # Without index information fall back to merging into the
            # previous kept token, as before.
            is_adjacent = (
                last_piece_index is None
                or piece_index is None
                or piece_index == last_piece_index + 1
            )
            if merged_tokens and is_adjacent:
                merged_tokens[-1]["word"] += word[2:]
                last_piece_index = piece_index
            # else: orphan continuation piece with nothing to attach to.
        else:
            # Copy so the pipeline's own result dicts are not mutated.
            merged_tokens.append(dict(token))
            last_piece_index = piece_index

    return [
        {token["word"]: token["score"]}
        for token in merged_tokens
        if token["entity"] == "LOC"
    ]


# Use NER on only relevant tweets.
# .copy() detaches the filtered frame from the original, so adding the
# "tokens" column below does not write into a slice of another DataFrame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df["relevant"] == 1].copy()
tqdm.pandas(desc="NER NLP")
df["tokens"] = df["raw_text"].progress_apply(get_location_entities)

NER NLP:   0%|          | 0/1493 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ÖSREGN: Översvämning i Malmbäck  
{'entity': 'LOC', 'score': 0.92924124, 'index': 2, 'word': '##SR', 'start': 1, 'end': 3}
[{'entity': 'LOC', 'score': 0.92924124, 'index': 2, 'word': '##SR', 'start': 1, 'end': 3}, {'entity': 'LOC', 'score': 0.976103, 'index': 3, 'word': '##EG', 'start': 3, 'end': 5}, {'entity': 'LOC', 'score': 0.9978521, 'index': 10, 'word': 'Malm', 'start': 23, 'end': 27}, {'entity': 'LOC', 'score': 0.9973863, 'index': 11, 'word': '##bäck', 'start': 27, 'end': 31}]
-----------------------------------
ÖSREGN: Översvämning i Malmbäck  
{'entity': 'LOC', 'score': 0.976103, 'index': 3, 'word': '##EG', 'start': 3, 'end': 5}
[{'entity': 'LOC', 'score': 0.92924124, 'index': 2, 'word': '##SR', 'start': 1, 'end': 3}, {'entity': 'LOC', 'score': 0.976103, 'index': 3, 'word': '##EG', 'start': 3, 'end': 5}, {'entity': 'LOC', 'score': 0.9978521, 'index': 10, 'word': 'Malm', 'start': 23, 'end': 27}, {'entity': 'LOC', 'score': 0.9973863, 'index': 11, 'word': '##bäck', 'start': 27, 'e

The NER pipeline sometimes returns continuation tokens (those starting with "##") without the initial token they belong to, so these pieces cannot be merged back into a complete word.

Now, let's filter out non-Swedish locations.

In [14]:
geolocator = Nominatim(user_agent="flood_detection")

# Geocoding results keyed by entity name. The same names (e.g. "Florida")
# occur in many tweets; caching avoids sending the identical request to the
# Nominatim service again for each of them.
_swedish_lookup_cache = {}


def is_swedish_geo(list_entities):
    """Return True if any entity name can be geocoded to a place in Sweden.

    Args:
        list_entities: List of single-item dicts ``{entity_name: score}``;
            only the key is used.

    Returns:
        True as soon as one entity name yields a Nominatim result when the
        search is restricted to country code ``se``; False otherwise
        (including for an empty list).
    """
    for geo in list_entities:
        entity_name = next(iter(geo))
        if entity_name not in _swedish_lookup_cache:
            _swedish_lookup_cache[entity_name] = geolocator.geocode(
                entity_name, country_codes="se", language="en"
            )
        if _swedish_lookup_cache[entity_name] is not None:
            return True
    return False


# Flag tweets for which NER found at least one location entity.
df['has_loc_entities'] = df['tokens'].map(len).gt(0)

# Geocode the entities of every tweet, with a progress bar.
tqdm.pandas(desc="Is Swidsh")
df['loc_ent_is_swedish'] = df['tokens'].progress_apply(is_swedish_geo)

NER NLP:   0%|          | 0/1493 [00:00<?, ?it/s]

2022-10-05T14:18:19.205173+0200 - Timed out waiting for syncing to complete.
2022-10-05T14:19:08.547628+0200 - Timed out waiting for syncing to complete.
2022-10-05T14:19:13.609385+0200 - Timed out waiting for syncing to complete.
2022-10-05T14:19:26.410623+0200 - Timed out waiting for syncing to complete.
2022-10-05T14:19:31.465171+0200 - Timed out waiting for syncing to complete.
2022-10-05T14:19:36.519399+0200 - Timed out waiting for syncing to complete.
2022-10-05T14:19:41.624314+0200 - Timed out waiting for syncing to complete.


In [16]:
# Rows: the `mentions_location` label; columns: the geocoder-based prediction.
confusion_matrix = pd.crosstab(
    index=df['mentions_location'],
    columns=df['loc_ent_is_swedish'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion_matrix

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,772,218
1.0,33,469


The precision does not seem to be very good (i.e., the model predicts that some locations are Swedish when they are not).

In [46]:
# Precision = TP / (TP + FP), read from the confusion matrix instead of
# hand-copying its numbers, so the value stays correct when the data changes.
predicted_swedish = confusion_matrix[True]  # column of predicted positives
predicted_swedish.loc[1.0] / predicted_swedish.sum()

0.6826783114992722

In [47]:
# False positives: labelled as not mentioning a location, yet predicted Swedish.
false_positive_mask = (df["mentions_location"] == 0) & df['loc_ent_is_swedish']
df.loc[false_positive_mask, ['id', 'tokens']].head(10)

Unnamed: 0,id,tokens
9,907315054151962629,"[{'Miami': 0.9995778}, {'Beach': 0.9994505}]"
28,907249999553159168,[{'Florida': 0.9995338}]
35,907216307321491456,[{'Florida': 0.9996426}]
37,907197471339794434,[{'Florida': 0.99937505}]
41,907169643898572800,[{'Trelleborg': 0.99761}]
42,907168859228135424,[{'Florida': 0.9993895}]
50,907128844431237120,[{'Florida': 0.99947363}]
51,907123196159242240,[{'Florida': 0.9991867}]
53,907116790093938688,[{'Florida': 0.9995834}]
56,907044532767936514,[{'Florida': 0.9992543}]


In [48]:
# Inspect what Nominatim returns for "Florida" when restricted to Sweden.
swedish_location = geolocator.geocode(
    "Florida",
    country_codes="se",
    language="en",
    extratags=True,
    addressdetails=True,
)
swedish_location.raw

{'place_id': 78229519,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 7466858592,
 'boundingbox': ['56.4729872', '56.4730872', '14.7061858', '14.7062858'],
 'lat': '56.4730372',
 'lon': '14.7062358',
 'display_name': 'Florida, Ryd, Tingsryds kommun, Kronoberg County, 360 10, Sweden',
 'class': 'place',
 'type': 'isolated_dwelling',
 'importance': 0.30999999999999994,
 'address': {'isolated_dwelling': 'Florida',
  'village': 'Ryd',
  'municipality': 'Tingsryds kommun',
  'county': 'Kronoberg County',
  'ISO3166-2-lvl4': 'SE-G',
  'postcode': '360 10',
  'country': 'Sweden',
  'country_code': 'se'},
 'extratags': {}}

In [56]:
# For comparison: the extra tags of a well-known Swedish city.
swedish_location = geolocator.geocode(
    "Stockholm",
    country_codes="se",
    language="en",
    extratags=True,
    addressdetails=True,
)
swedish_location.raw['extratags']

{'capital': 'yes',
 'wikidata': 'Q1754',
 'wikipedia': 'sv:Stockholm',
 'population': '829417',
 'ref:se:scb': '0336',
 'ref:se:pts:postort': 'STOCKHOLM'}

It seems that some places in Sweden have names like Miami and Florida.
Let's update the filter to narrow the matches further. Importance and population seem to be logical criteria.
The idea is to filter out places that have no population tag or a population below 1000. (Note: these thresholds are not applied in the run below, so the confusion matrix is unchanged.)

In [99]:
# Geocoding results (with extratags) keyed by entity name, so that a name
# occurring in many tweets is only requested from Nominatim once.
_swedish_raw_lookup_cache = {}


def is_swedish_geo(list_entities, min_importance=None, min_population=None):
    """Return the raw Nominatim record of the first entity found in Sweden.

    Note: this redefines the earlier ``is_swedish_geo`` in this notebook and,
    unlike it, returns the match's raw record rather than ``True``.

    Args:
        list_entities: List of single-item dicts ``{entity_name: score}``;
            only the key is used.
        min_importance: If given, a match is accepted only when its
            ``importance`` is strictly greater than this value. Default
            ``None`` disables the check.
        min_population: If given, a match is accepted only when it has a
            ``population`` extratag that parses as an integer strictly
            greater than this value. Default ``None`` disables the check.

    Returns:
        The ``raw`` dict of the first accepted match, or ``False`` when no
        entity is accepted (callers compare the result with ``!= False``).
    """
    for geo in list_entities:
        entity_name = next(iter(geo))
        if entity_name not in _swedish_raw_lookup_cache:
            _swedish_raw_lookup_cache[entity_name] = geolocator.geocode(
                entity_name, country_codes="se", language="en", extratags=True
            )
        swedish_location = _swedish_raw_lookup_cache[entity_name]
        if swedish_location is None:
            continue

        raw = swedish_location.raw
        if min_importance is not None and raw.get("importance", 0) <= min_importance:
            continue
        if min_population is not None:
            population = (raw.get("extratags") or {}).get("population")
            try:
                if population is None or int(population) <= min_population:
                    continue
            except ValueError:
                # Population tag present but not a plain integer: treat as unknown.
                continue
        return raw
    return False

# Flag tweets for which NER found at least one location entity.
df['has_loc_entities'] = df['tokens'].map(len).gt(0)

# Store, per tweet, the geocoder's result for its entities (or False).
tqdm.pandas(desc="Is Swidish location")
df['loc_ent_is_swedish'] = df['tokens'].progress_apply(is_swedish_geo)

Is Swidish location:   0%|          | 0/1493 [00:00<?, ?it/s]

In [107]:
# `loc_ent_is_swedish` now holds a raw record or False, so it is turned into
# a boolean prediction by comparing against False.
confusion_matrix = pd.crosstab(
    index=df['mentions_location'],
    columns=df['loc_ent_is_swedish'].ne(False),
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion_matrix

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,772,218
1.0,33,469


In [103]:
# Rows where the geocoder returned a record (i.e. the value is not False).
df.loc[df['loc_ent_is_swedish'].ne(False), 'loc_ent_is_swedish']

9       {'place_id': 78651618, 'licence': 'Data © Open...
28      {'place_id': 78229519, 'licence': 'Data © Open...
35      {'place_id': 78229519, 'licence': 'Data © Open...
37      {'place_id': 78229519, 'licence': 'Data © Open...
41      {'place_id': 138406, 'licence': 'Data © OpenSt...
                              ...                        
4910    {'place_id': 144499694, 'licence': 'Data © Ope...
4949    {'place_id': 156164308, 'licence': 'Data © Ope...
4967    {'place_id': 297450050, 'licence': 'Data © Ope...
5016    {'place_id': 158528919, 'licence': 'Data © Ope...
5032    {'place_id': 298284642, 'licence': 'Data © Ope...
Name: loc_ent_is_swedish, Length: 687, dtype: object

In [98]:
# Inspect the record of a small Swedish locality to see which tags it has.
swedish_location = geolocator.geocode(
    "Norrsundet",
    country_codes="se",
    language="en",
    extratags=True,
    addressdetails=True,
)
swedish_location.raw

{'place_id': 830929,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 286644597,
 'boundingbox': ['60.909974', '60.949974', '17.1206678', '17.1606678'],
 'lat': '60.929974',
 'lon': '17.1406678',
 'display_name': 'Norrsundet, Gävle kommun, Gävleborg County, 817 30, Sweden',
 'class': 'place',
 'type': 'village',
 'importance': 0.385,
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_place_village.p.20.png',
 'address': {'village': 'Norrsundet',
  'municipality': 'Gävle kommun',
  'county': 'Gävleborg County',
  'ISO3166-2-lvl4': 'SE-X',
  'postcode': '817 30',
  'country': 'Sweden',
  'country_code': 'se'},
 'extratags': {'ref:se:scb': '7228', 'ref:se:pts:postort': 'NORRSUNDET'}}