# prepare for semantic analysis

In [1]:
import pandas as pd
from pathlib import Path

from asreview.data import load_data

In [2]:
# Fetch the preprocessed dataset and keep only the title and abstract columns.
preprocessed_url = "https://raw.githubusercontent.com/asreview-ftm-hackathon/Data/main/data/preprocessed_data.csv"
df = pd.read_csv(preprocessed_url)[["title", "abstract"]]

In [3]:
# Flatten every abstract onto one line by turning newlines into spaces.
df = df.assign(abstract=df["abstract"].str.replace("\n", " "))

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Each match is an ASCII letter followed by one or more word characters, so
# punctuation, leading digits/underscores and one-character words are left out.
tokenizer = RegexpTokenizer(r'[a-zA-Z]\w+')
# Build the detokenizer once instead of once per row.
detokenizer = TreebankWordDetokenizer()


def clean_abstract(text):
    """Return `text` reduced to its word-like tokens, re-joined into one string.

    Non-string values (e.g. NaN for a missing abstract) yield an empty string,
    so a single missing abstract does not abort the whole `apply`.
    """
    if not isinstance(text, str):
        return ""
    return detokenizer.detokenize(tokenizer.tokenize(text))


df["abstract"] = df["abstract"].apply(clean_abstract)

In [5]:
# Display the frame (title and abstract columns) after the cleaning steps above.
df

Unnamed: 0,title,abstract
0,Verzoek_regulier__facultatief_advies_uitgebr_p...,Ons briefkenmerk Geachte heer mevrouw Dn heeft...
1,Toezicht_ Correspondentie n.a.v. Toezicht en H...,STADSBALIE ARCHIEF GEMEENTE ASSNT Va TEM Neder...
2,FW Draaiboek demonstratie NAM AZC 24 sept...,Date To Marco Out assen nl Subject FW Draaiboe...
3,Uitgaande mail mededeling akkoord bijdrage aan...,bijdrage gemeente Assen Pagina Van Aan energyv...
4,Rapportage n.a.v. vergunningen-meldingen.pdf_j...,cle Size Date Time MD5 checksum ec8b73ddddb116...
...,...,...
2504,BenW besluit BB54473 inzake deelname gem Assen...,fifi Gemeente Assen Voorstel ter besluitvormin...
2505,Persbericht Energy Valley_,Gemeente Assen GEMEENIE Afdeling Communicatie ...
2506,Melding betoging 24 september NAM,Date nitie ucefe nsic nl To po ii nl Subject M...
2507,Locaties in Groningen_,Centrale meldkamer NAM Assen CMK Beveiliging S...


In [6]:
# save file as cleaned.csv
cleaned_path = Path("..", "temp", "cleaned.csv")
# Create ../temp when it is missing; to_csv does not create parent directories.
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_path, index=False)

# perform semantic analysis

In [7]:
from asreviewcontrib.semantic_clustering.semantic_clustering import run_clustering_steps

In [8]:
# Read cleaned.csv back in through the ASReview data loader.
data = load_data(Path("..") / "temp" / "cleaned.csv")

In [9]:
# Model identifier handed to run_clustering_steps as `transformer` below.
# NOTE(review): presumably a Dutch-language model, going by its name — confirm.
transformer = "pdelobelle/robbert-v2-dutch-base"

In [10]:
# Run the clustering on the loaded records. The second argument is the result
# file (../output/clustered.csv), which is read back later in the notebook.
# Path() only takes path segments: the `index=False` keyword previously passed
# to it was never a CSV option here and is deprecated since Python 3.12.
run_clustering_steps(
    data,
    Path("..", "output", "clustered.csv"),
    transformer=transformer,
)

Loading data...
Loading tokenizer and model pdelobelle/robbert-v2-dutch-base...
Tokenizing abstracts...


 40%|████      | 1014/2509 [00:07<00:14, 105.99it/s]

# restore original abstracts

In [None]:
# Read the clustering result back in. Path() takes only path segments, so the
# stray `index=False` keyword (deprecated since Python 3.12) is dropped.
data = pd.read_csv(Path("..", "output", "clustered.csv"))
# Abstracts as published in the preprocessed file, i.e. before the newline
# removal and tokenization applied earlier in this notebook.
old_abstracts = pd.read_csv("https://raw.githubusercontent.com/asreview-ftm-hackathon/Data/main/data/preprocessed_data.csv")['abstract']

In [None]:
# Swap the cleaned abstracts for the untouched ones. Values are matched on the
# row index, not on position.
# NOTE(review): assumes clustered.csv holds the same rows in the same order as
# the preprocessed file — confirm.
data = data.assign(abstract=old_abstracts)

In [None]:
# Overwrite ../output/clustered.csv with the version holding the original abstracts.
# NOTE(review): the previous comment named the target clustered_original_abstracts.csv,
# but the code has always written clustered.csv — confirm the intended file name.
# index=False belongs to to_csv (it was mistakenly passed to Path), so no index
# column is written, matching how cleaned.csv is saved.
data.to_csv(Path("..", "output", "clustered.csv"), index=False)