In [16]:
from sklearn.neighbors import KernelDensity
from shapely.geometry import Point, Polygon
import geopandas
from sklearn.cluster import DBSCAN
import pandas as pd
import torch
import numpy as np

In [17]:
from transformers import DistilBertTokenizerFast
# from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaForSequenceClassification , RobertaTokenizerFast, RobertaConfig

BASE_MODEL = './Downloads/2021-11-14_model-distilroberta-base_loss-huber_epoch-6/'

TOKEN_MODEL = 'distilroberta-base'
# tokenizer = DistilBertTokenizerFast.from_pretrained(TOKEN_MODEL)
# model = DistilBertForSequenceClassification.from_pretrained(BASE_MODEL)
tokenizer = RobertaTokenizerFast.from_pretrained(TOKEN_MODEL)
model = RobertaForSequenceClassification.from_pretrained(BASE_MODEL)

In [18]:
text = '''Since the beginning of November, there have been thousands of recorded border crossing attempts, according to Polish authorities.
A spokesperson for Poland's border guards told CNN on Monday that there had been "forced mass attempts to cross the border" in Kuznica area by a group of people over the weekend and that the situation was "very tense and very dangerous."
Last week, Polish border guard representatives told CNN that some of the migrants had been pushed toward the barriers by Belarusian services.
Speaking on Monday, Belarusian President Alexander Lukashenko said Belarus was doing everything to prevent people from accumulating at the border.
Polish authorities have detained small numbers of people, allowing only small numbers of people the option of applying for asylum in Poland. Others have been immediately sent back to Belarus.
Access to the border area is tightly restricted. Journalists and aid workers have been blocked from traveling to the area by an exclusion zone."'''

text = text.replace('\n', ' ').lower()
text_split = text.split()
n_steps = 20
step_size = int(len(text_split)/n_steps)
variations = [" ".join(text_split[:i]) + " " + " ".join(text_split[i+step_size:]) for i in range(n_steps)] 

In [19]:
tok = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
out = model.forward(input_ids = tok['input_ids'], attention_mask=tok['attention_mask'], output_hidden_states=True)

In [20]:
last_hidden_state = out['hidden_states'][-1]

In [21]:
rand_locs = []
for i in range(500):
    ones = torch.ones_like(last_hidden_state)
    dim1 = ones.shape[1]
    dim2 = ones.shape[2]
    rand_dim1 = np.random.randint(0, dim1)
    rand_dim2 = np.random.randint(0, dim2)
    ones[0, :, rand_dim2] = 0
    masked = ones * last_hidden_state
    rand_loc = model.classifier(masked).cpu().detach().numpy().squeeze()
    rand_locs.append(rand_loc)

In [22]:
out_df = pd.DataFrame(rand_locs, columns=['lat', 'lon'])

In [23]:
kd = KernelDensity(kernel='gaussian', metric='haversine')

In [24]:
kd.fit(out_df)

KernelDensity(metric='haversine')

In [25]:
conf_sample = kd.sample(1000)
conf_probas = np.exp(kd.score_samples(conf_sample))
conf_probas /= conf_probas.sum()
conf_df = pd.Series(conf_probas, name='proba').to_frame()

In [26]:
conf_df['point'] = geopandas.GeoSeries([Point(point[::-1]) for point in conf_sample])
conf_df.loc[:,['lat', 'lon']] = conf_sample

In [27]:
conf_df['cdf'] = conf_df.sort_values('proba', ascending=False).proba.cumsum()

In [28]:
levels = 10

In [29]:
conf_df['conf_level'] = pd.qcut(conf_df.cdf, q=levels,labels=False)

In [30]:
conf_layers = []
dbscan = DBSCAN()
for level in range(levels):
    layer_points = conf_df[conf_df.conf_level >= level][['lat', 'lon']]
    pred = dbscan.fit_predict(layer_points)
    clusters = np.unique(pred)[1:]
    layer_cluster_areas = []
    for cluster in clusters:
        layer_cluster_points = conf_df.loc[layer_points[pred == cluster].index]
        conf_area_poly = Polygon(layer_cluster_points.point.values)
        smoothed_area = conf_area_poly.convex_hull.buffer(.1,0,3,3, single_sided=True)
        conf_layers.append(smoothed_area)

IllegalArgumentException: Shell is not a LinearRing


ValueError: Null geometry supports no operations

In [31]:

# conf_areas = []
# for level in range(levels):
#     conf_area_points = conf_df[conf_df.conf_level == level].point
#     conf_area_poly = Polygon(conf_area_points.values)
#     smoothed_area = conf_area_poly.convex_hull.buffer(.1,0,3,3, single_sided=True)
#     conf_areas.append(smoothed_area)

In [None]:
conf_area_series = geopandas.GeoSeries(conf_layers)

In [None]:
conf_area_series

In [None]:
import folium
from folium.plugins import HeatMap

In [None]:
maploc = folium.Map(
#     location=[conf_df.lon.mean(), conf_df.lat.mean()],
    zoom_start=11,
    tiles="Stamen Toner", 
    min_lat=conf_df['point'].map(lambda x: x.x).min(), 
    max_lat=conf_df['point'].map(lambda x: x.x).max(),
    min_lon=conf_df['point'].map(lambda x: x.y).min(),
    max_lon=conf_df['point'].map(lambda x: x.y).max())
maploc.add_child(folium.GeoJson(
    data=conf_area_series.to_json(), 
    style_function=lambda x: {'fillColor': 'blue', 'stroke': False, 'fillOpacity': 0.1}))
maploc.add_child(HeatMap(conf_sample))
maploc

In [34]:
out_df.to_xml('test.xml')

In [33]:
pd.__version__

'1.3.4'

In [1476]:
!pip install -U pandas

