In [1]:
from flood_detection.data.preprocess import remove_not_needed_elements_from_string
from transformers import AutoModel, AutoTokenizer
from transformers import pipeline
import pandas as pd
from hydra import compose, initialize
from omegaconf import DictConfig
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim, GeoNames
import plotly.express as px
from shapely.geometry import Point
import ast

2022-10-11 10:35:52.977170: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-11 10:35:53.064654: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-10-11 10:35:53.064677: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
with initialize(version_base=None, config_path="../conf"):
    cfg: DictConfig = compose(config_name="config")

path_to_data = cfg.supervisor.processed

df = pd.read_csv("../" + path_to_data)
# Use NER on only relevant tweets
df = df[df["relevant"] == 1]

print(

    f"Number of tweets that explicity mentions Swedish locations\n{len(df[df['mentions_location'] == 1])}/{len(df)}"
)

transformer = "KB/bert-base-swedish-cased-ner"
transformer = "KBLab/bert-base-swedish-cased-ner"

Number of tweets that explicity mentions Swedish locations
493/1402


In [3]:
tok = AutoTokenizer.from_pretrained(transformer)
model = AutoModel.from_pretrained(transformer)
nlp = pipeline(
    "ner",
    model=transformer,
    tokenizer=transformer,
    # Default ignore list is ["O"] which are tokens needed to extract locations
    ignore_labels=[]
)

geolocater: Nominatim = Nominatim(user_agent="flood_detection")
# geonames_geolocater = GeoNames("yasser_kaddoura")

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased-ner were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def get_location_entities(text: str):
    text = remove_not_needed_elements_from_string(text, remove_numbers=False)
    tokens = nlp(text)
    merged_tokens = []
    for token in tokens:
        if token["word"].startswith("##"):
            merged_tokens[-1]["word"] += token["word"][2:]
        else:
            merged_tokens += [token]
    # Remove not needed tokens
    return list(filter(lambda x: x["entity"] != "O", merged_tokens))


tqdm.pandas(desc="NER NLP")
df["tokens"] = df["raw_text"].progress_apply(get_location_entities)

NER NLP:   0%|          | 0/1402 [00:00<?, ?it/s]

2022-10-11T10:44:38.719068+0200 - Timed out waiting for syncing to complete.
2022-10-11T10:44:43.994932+0200 - Timed out waiting for syncing to complete.
2022-10-11T10:44:49.223514+0200 - Timed out waiting for syncing to complete.
2022-10-11T10:44:54.413098+0200 - Timed out waiting for syncing to complete.
2022-10-11T10:44:59.755686+0200 - Timed out waiting for syncing to complete.


In [15]:
def get_location_tokens(tokens: list[dict]) -> dict[str, dict[str, float]]:
    loc_tokens: dict[str, dict[str, float]] = {}
    for token in tokens:
        if token['entity'] == "LOC":
            loc_tokens[token['word']] = {'ner_score': token['score']}
    return loc_tokens


df['locations'] = df["tokens"].apply(get_location_tokens)

In [16]:
def is_swedish_geo(locations: dict[str, dict]) -> dict[str, dict]:
    swedish_locations = locations.copy()
    for name in locations:
        swedish_location = geolocater.geocode(
            name, country_codes="se", language="en", extratags=True  # pyright: ignore
        )

        # swedish_location = geonames_geolocater.geocode(name, country="SE", country_bias="SE")

        if swedish_location is not None:
            swedish_locations[name]["swedish_loc_info"] = swedish_location.raw
        else:
            swedish_locations[name]["swedish_loc_info"] = {}

    return swedish_locations


tqdm.pandas(desc="Is Swidish location")
df["locations"] = df["locations"].progress_apply(is_swedish_geo)
df.to_csv("file1.csv")

Is Swidish location:   0%|          | 0/1402 [00:00<?, ?it/s]

In [19]:
# Convert locations to json
df = pd.read_csv("file1.csv", converters={'locations': ast.literal_eval})

In [20]:
def has_swedish_loc(location_row) -> bool:
    for loc in location_row:
        if len(location_row[loc]['swedish_loc_info']) > 0:
            return True
    return False


TP = df[(df["locations"].apply(has_swedish_loc)) & (df["mentions_location"] == 1)]
TN = df[(~df["locations"].apply(has_swedish_loc)) & (df["mentions_location"] == 0)]
FP = df[(df["locations"].apply(has_swedish_loc)) & (df["mentions_location"] == 0)]
FN = df[(~df["locations"].apply(has_swedish_loc)) & (df["mentions_location"] == 1)]

confusion_matrix = pd.crosstab(
    df["mentions_location"],
    df["locations"].apply(has_swedish_loc),
    rownames=["Actual"],
    colnames=["Predicted"],
)
print(confusion_matrix)

print(f"precision {len(TP)/(len(TP)+len(FP))}")
print(f"recall {len(TP)/(len(TP)+len(FN))}")
print(f"f1 {len(TP)/(len(TP)+0.5*(len(FP)+len(FN)))}")

Predicted  False  True 
Actual                 
0            725    184
1             28    465
precision 0.7164869029275809
recall 0.9432048681541582
f1 0.8143607705779334


## Error analysis

In [21]:
# False negatives
needed_columns = ["id", "locations", "raw_text", "text"]

# Locations extracted by NER, not recognized by geocoders
FN[FN["locations"].apply(lambda x: len(x) > 0)][needed_columns]

Unnamed: 0,id,locations,raw_text,text
171,893482866151231488,"{'Färgargårdens': {'ner_score': 0.9966882, 'sw...",Regnet vräker ner och innehåller hagel. Det mu...,the rain pours contains hail its rumbling writ...
224,1148141876035428352,"{'Kullabygden': {'ner_score': 0.98074657, 'swe...",Skyfall över Kullabygden – många översvämninga...,heavy rain kullabygden floods
442,1404035141140467715,"{'Lapplandsfjällen': {'ner_score': 0.9995497, ...",Lapplandsfjällen (Väder) SMHI varning klass 1 ...,lapland mountains weather smhi warning class h...
662,808624344381485056,"{'Växjöhotell': {'ner_score': 0.8389809, 'swed...",Översvämning på Växjöhotell https://t.co/Bza2T...,flooding växjöhotell
999,1323375668059688960,"{'Gumbodahamn': {'ner_score': 0.99914753, 'swe...",Lv 738 Vippersrönningen–Gumbodahamn (Trafikstö...,lv vippersrönningen–gumbodahamn traffic distur...
1074,833576058058371072,"{'Sydkusten': {'ner_score': 0.99791116, 'swedi...",Översvämningar och jordskred på kusten - Sydku...,floods landslides coast sydkustenes
1136,1035449418542006272,"{'Sydsverige': {'ner_score': 0.9994123, 'swedi...",E22 översvämmad i intensiva skyfallet SMHI var...,e flooded intense downpour smhi warns heavy do...
1401,618744167938818048,"{'Stockholmsområdet': {'ner_score': 0.9994622,...",Kraftigt regn på väg in över Stockholmsområdet...,heavy rain moving stockholm area risk floodin...


In [22]:
# Locations not extracted by NER
FN[FN["locations"].apply(lambda x: len(x) == 0)][needed_columns]

Unnamed: 0,id,locations,raw_text,text
176,893390880526262273,{},JUST NU: Stopp på väg på grund av översvämning...,right now road stoppage flooding illusion
187,1028405310069846017,{},"Ojojoj... alla Halmstadbors samlade böner, dan...",whoops halmstad residents collected prayers da...
266,898747159423492096,{},Mycket vatten på R40 https://t.co/A68M6yGvVV\n...,lots water r borastidning nyheter översvämning...
277,767730451767300096,{},#Vattenföring'en har ökat i #Närpes å pga regn...,vattenföring increased närpes å rain lower wee...
387,644473393262297088,{},"Lv 957 Bänteby-Hallerud (SOS): Vägskador, över...",lv bäntebyhallerud sos road damage flooding di...
394,1026840737013485568,{},Översvämning återigen https://t.co/ouAxRWgjGD ...,flooding karlskoga
582,735835585307369472,{},Kmr bli översvämning i äby,kmr flood äby
696,880474759796068353,{},Det ska bli oväder här i syd. Bäst att vara fö...,there storms south best prepared ramen middang...
734,1397938929182167056,{},Lv 617 Bo Kyrka–Brevens bruk (Trafikstörning) ...,lv bo kyrka–brevens bruk traffic disruption ro...
848,640964992318185472,{},Översvämning i kumla💦 http://t.co/K89QShnE8C,flood kumla


In [23]:
# Geocoder seems to identify some of the locations not extracted by NER
# To increase recall, use geocders on text directly
print(geolocater.geocode("Åker–Nyhom", country_codes="se", language="en", extratags=True))
print(
    geolocater.geocode("Höglandet", country_codes="se", language="en", extratags=True)
)

None
Höglandet, Bromma stadsdelsområde, Stockholm, Stockholms kommun, Stockholm County, 167 71, Sweden


In [24]:
# False positives
def func(row):
    return [key if len(value['swedish_loc_info']) > 0 else None for key, value in row.items()]


x = FP.copy()
x['loc_names'] = FP['locations'].apply(func)
x[['loc_names', 'id']]
# Sweden has locations that contains florida and miami
# Idea: Increase precision by:
# - Make more restrictive classification using data extracted from geocoders regarding location (e.g. population, type, popularity)
# - Check non-swedish enteries for that location and if they are (e.g. more popular) filter the swedish entry out

Unnamed: 0,loc_names,id
3,"[Vita, huset]",902255491753156610
34,"[None, Vita, huset]",902052364831641600
62,[Alperna],1421583982454616071
72,"[Europa, Tyskland]",1421520893738639365
88,"[Tyskland, Belgien]",1421381143312707585
...,...,...
1372,"[Grönland, None, Indonesiens, None, None]",1158012438610108416
1385,[Ica],747032950013825024
1391,"[Ryssland, Isis, Grekland]",619230729562648577
1392,[sverige],619137827700363264


In [25]:
x = geolocater.geocode("Vita", country_codes="se", language="en", extratags=True).raw
print(x)

{'place_id': 19718459, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'node', 'osm_id': 2235762052, 'boundingbox': ['65.9068121', '65.9468121', '22.4066368', '22.4466368'], 'lat': '65.9268121', 'lon': '22.4266368', 'display_name': 'Vitå, Luleå kommun, Norrbotten County, Sweden', 'class': 'place', 'type': 'village', 'importance': 0.275, 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/poi_place_village.p.20.png', 'extratags': {'ref:se:pts:postort': 'VITÅ'}}


In [40]:
data_needed = ["class", "importance", "type", "display_name", "lat", "lon"]

pd.set_option('max_colwidth', 800)
def get_from_raw_loc(row):
    locations = {}
    for name, value in row.items():
        extracted_data = {}
        for data in data_needed:
            if data in value['swedish_loc_info']:
                extracted_data[data] = value['swedish_loc_info'][data]
            else:
                extracted_data[data] = None
        locations[name] = extracted_data
    return locations


df['locations_info'] = df['locations'].apply(get_from_raw_loc)

In [41]:
# Create a row for each location
df['locations_info'] = df['locations_info'].apply(lambda x: list(x.items()))
df_exploded = df.explode('locations_info')

df_exploded = df_exploded[df_exploded['locations_info'].notna()]
# Seperate each data in column
df_exploded[['loc_name', 'raw_data']] = df_exploded['locations_info'].to_list()
df_exploded[["class", "importance", "type", "display_name", "lat", "lon"]] = df_exploded['raw_data'].apply(lambda x: list(x.values())).to_list()
df_exploded = df_exploded.astype({"lon": "float", "lat": "float"})


def get_color(row):
    if row["mentions_location"] == 1 and has_swedish_loc(row['locations']):
        return "blue"
    elif row["mentions_location"] == 0 and has_swedish_loc(row['locations']):
        return "red"


df_exploded["color"] = df_exploded.apply(get_color, axis=1)


geometry = [Point(x, y) for x, y in zip(df_exploded["lon"], df_exploded["lat"])]

In [44]:
df_exploded['count'] = 1
df_exploded = df_exploded.groupby(['lon', 'lat'], as_index=False) \
                         .agg({'count': 'sum', 'color': 'first', 'id': 'first',
                               'loc_name': 'first'})

In [49]:
df[df['loc_name'] == 'län']

KeyError: 'loc_name'

In [45]:
fig = px.scatter_mapbox(
    df_exploded,
    lat="lat",
    lon="lon",
    size='count',
    hover_name=df_exploded["loc_name"],
    hover_data=["id"],
    color_discrete_map={"blue": "blue", "red": "red"},
    color="color",
    mapbox_style="carto-positron",
    height=600,
    zoom=3,
    center={"lat": 63.333112, "lon": 16.007205},
)

fig.show()