In [1]:
from src.data.preprocess import remove_not_needed_elements_from_string
from transformers import AutoModel, AutoTokenizer
from transformers import pipeline
import pandas as pd
from hydra import compose, initialize
from omegaconf import DictConfig
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim
import plotly.express as px
from shapely.geometry import Point
import geopandas as gpd
from urllib.request import urlopen
import json
import ast

2022-10-07 09:11:36.803761: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-07 09:11:36.896727: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-10-07 09:11:36.896743: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [4]:
with initialize(version_base=None, config_path="../conf"):
    cfg: DictConfig = compose(config_name="config")

path_to_data = cfg.supervisor.processed

df = pd.read_csv("../" + path_to_data)

print(
    f"Number of tweets that explicity mentions Swedish locations\n{len(df[df['mentions_location'] == 1])}/{len(df)}"
)

Number of tweets that explicity mentions locations
591/5035


In [5]:
tok = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased")
model = AutoModel.from_pretrained("KB/bert-base-swedish-cased")
nlp = pipeline(
    "ner",
    model="KB/bert-base-swedish-cased-ner",
    tokenizer="KB/bert-base-swedish-cased-ner",
)

Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def get_location_entities(text: str):
    text = remove_not_needed_elements_from_string(text)
    # Remove stopwords
    tokens = nlp(text)
    updated_tokens = []
    for token in tokens:
        if token["word"].startswith("##"):
            try:
                updated_tokens[-1]["word"] += token["word"][2:]
            except Exception:
                continue
        else:
            updated_tokens += [token]
    loc_entities = list(filter(lambda x: x["entity"] == "LOC", updated_tokens))
    return [{entity["word"]: entity["score"]} for entity in loc_entities]


# Use NER on only relevant tweets
df = df[df["relevant"] == 1]
tqdm.pandas(desc="NER NLP")
df["tokens"] = df["raw_text"].progress_apply(get_location_entities)

NER NLP:   0%|          | 0/1493 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ÖSREGN: Översvämning i Malmbäck  
{'entity': 'LOC', 'score': 0.92924124, 'index': 2, 'word': '##SR', 'start': 1, 'end': 3}
[{'entity': 'LOC', 'score': 0.92924124, 'index': 2, 'word': '##SR', 'start': 1, 'end': 3}, {'entity': 'LOC', 'score': 0.976103, 'index': 3, 'word': '##EG', 'start': 3, 'end': 5}, {'entity': 'LOC', 'score': 0.9978521, 'index': 10, 'word': 'Malm', 'start': 23, 'end': 27}, {'entity': 'LOC', 'score': 0.9973863, 'index': 11, 'word': '##bäck', 'start': 27, 'end': 31}]
-----------------------------------
ÖSREGN: Översvämning i Malmbäck  
{'entity': 'LOC', 'score': 0.976103, 'index': 3, 'word': '##EG', 'start': 3, 'end': 5}
[{'entity': 'LOC', 'score': 0.92924124, 'index': 2, 'word': '##SR', 'start': 1, 'end': 3}, {'entity': 'LOC', 'score': 0.976103, 'index': 3, 'word': '##EG', 'start': 3, 'end': 5}, {'entity': 'LOC', 'score': 0.9978521, 'index': 10, 'word': 'Malm', 'start': 23, 'end': 27}, {'entity': 'LOC', 'score': 0.9973863, 'index': 11, 'word': '##bäck', 'start': 27, 'e

The NLP pipeline seems to generate tokens missing it's initial part that's needed for the subsequents that contains "##" at the start

Now, let's filter out non-swedish locations

In [92]:
geolocater = Nominatim(user_agent="flood_detection")

In [4]:
def is_swedish_geo(list_entities):
    for geo in list_entities:
        entity_name = list(geo.keys())[0]
        swedish_location = geolocator.geocode(
            entity_name, country_codes="se", language="en", extratags=True
        )
        if swedish_location is not None:
            #             if swedish_location.raw['importance'] > 0.5:
            # if 'population' in swedish_location.raw['extratags'] and \
            #     int(swedish_location.raw['extratags']['population']) > 1000:
            return swedish_location.raw
    return False


df["has_loc_entities"] = df["tokens"].apply(lambda x: len(x) > 0)
tqdm.pandas(desc="Is Swidish location")
df["loc_ent_is_swedish"] = df["tokens"].progress_apply(is_swedish_geo)
df.to_csv("file1.csv")

Is Swidish location:   0%|          | 0/1493 [00:00<?, ?it/s]

AttributeError: 'str' object has no attribute 'keys'

In [2]:
df = pd.read_csv("file1.csv")

confusion_matrix = pd.crosstab(
    df["mentions_location"],
    df["loc_ent_is_swedish"] != "False",
    rownames=["Actual"],
    colnames=["Predicted"],
)
confusion_matrix

Predicted,False,True
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,772,218
1.0,33,469


## Error analysis

In [94]:
# False negatives
needed_columns = ['id', 'tokens', 'raw_text', 'text']
false_neg = df[(df["mentions_location"] == 1) & (df["loc_ent_is_swedish"] == "False")][needed_columns]

In [52]:
# Locations extracted by NER, not recognized by geocoders
false_neg[false_neg['tokens'].apply(lambda x: len(ast.literal_eval(x)) > 0)]

Unnamed: 0,id,tokens,raw_text,text
261,893482866151231488,[{'Färgargårdens': 0.9966882}],Regnet vräker ner och innehåller hagel. Det mu...,the rain pours contains hail its rumbling writ...
314,1148141876035428352,[{'Kullabygden': 0.98074657}],Skyfall över Kullabygden – många översvämninga...,heavy rain kullabygden floods
532,1404035141140467715,[{'Lapplandsfjällen': 0.999564}],Lapplandsfjällen (Väder) SMHI varning klass 1 ...,lapland mountains weather smhi warning class h...
551,1403759207103541252,[{'FarstaO': 0.9812517}],"Jösses vad det regnar, var i Farsta-hOoden, åt...",jeez rain farstahooden roads blocked floods ca...
752,808624344381485056,[{'Växjöhotell': 0.8389809}],Översvämning på Växjöhotell https://t.co/Bza2T...,flooding växjöhotell
1089,1323375668059688960,[{'Gumbodahamn': 0.99864274}],Lv 738 Vippersrönningen–Gumbodahamn (Trafikstö...,lv vippersrönningen–gumbodahamn traffic distur...
1164,833576058058371072,[{'Sydkusten': 0.99791116}],Översvämningar och jordskred på kusten - Sydku...,floods landslides coast sydkustenes
1226,1035449418542006272,[{'Sydsverige': 0.99941397}],E22 översvämmad i intensiva skyfallet SMHI var...,e flooded intense downpour smhi warns heavy do...
1492,618744167938818048,[{'Stockholmsområdet': 0.9994622}],Kraftigt regn på väg in över Stockholmsområdet...,heavy rain moving stockholm area risk floodin...


In [68]:
# Another geocoder
from geopy.geocoders import GeoNames
geonames_geolocater = GeoNames("yasser_kaddoura")
print(geonames_geolocater.geocode("Skånska", country="SE", country_bias="SE"))
print(geonames_geolocater.geocode("Kullabygden", country="SE", country_bias="SE"))
print(geonames_geolocater.geocode("Sydkusten", country="SE", country_bias="SE"))

None
None
None


In [53]:
# Locations not extracted by NER
# Gavla wsn't recognized
# E18 E6 wasn't recognized (Are these places?)
false_neg[false_neg['tokens'].apply(lambda x: len(ast.literal_eval(x)) == 0)]

Unnamed: 0,id,tokens,raw_text,text
59,1438895332784582663,[],@JakopDalunde fast översvämningar i gävle kan ...,floods gävle hardly attributed climate change ...
265,893393481602826240,[],Stopp på E6 efter översvämning är en illusion+...,stop e flood illusion
266,893390880526262273,[],JUST NU: Stopp på väg på grund av översvämning...,right now road stoppage flooding illusion
269,893385509795266560,[],Stopp på E6 efter översvämning https://t.co/Xk...,stop e flooding
277,1028405310069846017,[],"Ojojoj... alla Halmstadbors samlade böner, dan...",whoops halmstad residents collected prayers da...
327,639080984290308096,[],Gamla ledningar bakom översvämningar på E18 ht...,old wiring flooding e
356,898747159423492096,[],Mycket vatten på R40 https://t.co/A68M6yGvVV\n...,lots water r borastidning nyheter översvämning...
367,767730451767300096,[],#Vattenföring'en har ökat i #Närpes å pga regn...,vattenföring increased närpes å rain lower wee...
477,644473393262297088,[],"Lv 957 Bänteby-Hallerud (SOS): Vägskador, över...",lv bäntebyhallerud sos road damage flooding di...
484,1026840737013485568,[],Översvämning återigen https://t.co/ouAxRWgjGD ...,flooding karlskoga


In [78]:
# Geocoder seems to identify some of the locations not extracted by NER
# To increase recall, use geocders on text 
print(geolocator.geocode("Skånska", country_codes="se", language="en", extratags=True ))
print(geolocator.geocode("E6", country_codes="se", language="en", extratags=True ))
print(geolocator.geocode("Höglandet", country_codes="se", language="en", extratags=True ))

Skanska, Mälarvägen, Runby, Smedby, Upplands Väsby, Upplands Väsby kommun, Stockholm County, 194 04, Sweden
167, Grinneröd, Uddevalla kommun, Västra Götaland County, 444 97, Sweden
Bromma stadsdelsområde, Stockholms kommun, Stockholm County, Sweden


In [84]:
# False positives
false_pos = df[(df["mentions_location"] == 0) & (df["loc_ent_is_swedish"] != "False")][needed_columns]
false_pos.head(50)
# Sweden has locations that contains florida and miami
# Idea: Increase precision by: 
# - Make more restrictive classification using data extracted from geocoders regarding location (e.g. population, type, popularity)
# - Check non-swedish enteries for that location and if they are (e.g. more popular) filter the swedish entry out

Unnamed: 0,id,tokens,raw_text,text
1,907315054151962629,"[{'Miami': 0.9995778}, {'Beach': 0.9994505}]",Inga översvämningar eller strukturella skador ...,no flooding structural damage miami beach obam...
8,907249999553159168,[{'Florida': 0.9995338}],Stormflod hotar Florida i Irmas spår\nhttps://...,storm surge threatens florida irmas wake
11,907216307321491456,[{'Florida': 0.9996426}],Nu stiger vattennivåerna – varning för översvä...,now water levels rising flood warning florida...
13,907197471339794434,[{'Florida': 0.99937505}],Nu stiger vattennivåerna – varning för översvä...,now water levels rising florida flood warning...
14,907169643898572800,[{'Trelleborg': 0.99761}],Nu laddar vi inför #Kustmöte2017 i #Trelleborg...,now charging kustmöte trelleborg the interest ...
15,907168859228135424,[{'Florida': 0.9993895}],Meteorologer varnar för enorma översvämningar ...,meteorologists warn massive flooding florida e...
19,907128844431237120,[{'Florida': 0.99947363}],Orkanen Irma härjar Florida – livsfarliga över...,hurricane irma ravages florida lifethreatenin...
20,907123196159242240,[{'Florida': 0.9991867}],Nu stiger vattennivåerna – varning för översvä...,now water levels rising florida flood warning
21,907116790093938688,[{'Florida': 0.9995834}],Meteorologer varnar för enorma översvämningar ...,meteorologists warn huge floods florida afton...
24,907044532767936514,[{'Florida': 0.9992543}],Nu stiger vattennivåerna – varning för översvä...,now water levels rising florida flood warning...


In [3]:
def parse_raw_text(raw_json):
    if raw_json == "False":
        return pd.Series(5 * [False])
    return pd.Series(
        [
            ast.literal_eval(raw_json)["class"],
            ast.literal_eval(raw_json)["importance"],
            ast.literal_eval(raw_json)["type"],
            ast.literal_eval(raw_json)["display_name"],
            float(ast.literal_eval(raw_json)["lat"]),
            float(ast.literal_eval(raw_json)["lon"]),
        ]
    )


df[
    [
        "loc_class",
        "loc_importance",
        "loc_type",
        "loc_display_name",
        "loc_lat",
        "loc_lon",
    ]
] = df["loc_ent_is_swedish"].apply(parse_raw_text)

In [10]:
def get_color(row):
    if (row["mentions_location"] == 1 and row["loc_ent_is_swedish"] != "False"):
        return "blue"
    elif row["mentions_location"] == 0 and row["loc_ent_is_swedish"] != "False":
        return "red"

df["color"] = df.apply(get_color, axis=1,)

In [11]:
# Get the geojson file and load it as a geopandas dataframe
with urlopen(
    "https://raw.githubusercontent.com/ostropunk/swegeojson/master/geodata/kommun/Kommun_RT90_region.json"
) as response:
    sweden_json = json.load(response)

sweden_geo_df = gpd.GeoDataFrame.from_features(sweden_json["features"])
geometry = [Point(x, y) for x, y in zip(df["loc_lon"], df["loc_lat"])]

In [12]:
fig = px.scatter_mapbox(
    df,
    lat="loc_lat",
    lon="loc_lon",
    hover_name=df["loc_display_name"],
    hover_data=["id"],
    color_discrete_map={"blue": "blue", "red": "red"},
    color="color",
    mapbox_style="carto-positron",
    height=600,
    zoom=3,
    center={"lat": 63.333112, "lon": 16.007205},
)

fig.show()