# prepare for semantic analysis

In [3]:
import pandas as pd
from asreview.data import load_data

In [4]:
# Load the preprocessed documents from preprocessed.xlsx.
# Forward slashes are used because the backslash form contained the invalid
# escape sequence "\p" and only resolved on Windows; this form works on
# Windows and POSIX alike.
df = pd.read_excel("../pre-processing/preprocessed.xlsx")

In [5]:
# Inspect the frame exactly as it was read from the Excel file.
df

Unnamed: 0.1,Unnamed: 0,id,type,date,betterDate,title,abstract
0,18,13.16,Overig,2015,NaT,Verzoek_regulier__facultatief_advies_uitgebr_p...,\n\n\n\n\n13.16 \n \n\nOns briefkenmerk: ...
1,29,8.17,Correspondentie,2007-2013,NaT,Toezicht_ Correspondentie n.a.v. Toezicht en H...,\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
2,31,22.11,Mail,??,NaT,FW Draaiboek demonstratie NAM AZC 24 sept...,"\n\n\nDate : 1-1-0001 00:00:00 \nTo : ""Marco O..."
3,32,32.3,Correspondentie,,NaT,Uitgaande mail mededeling akkoord bijdrage aan...,\n\n\n\n\n| bijdrage gemeente Assen Pagina 1 \...
4,35,8.10,Vergunning,1997,NaT,Rapportage n.a.v. vergunningen-meldingen.pdf_j...,\n\n\n\n\n \ncle) \n \n\n\n\n\n\n\n\n\n\n\n...
...,...,...,...,...,...,...,...
2504,2537,32.1,Bestuurlijk besluit,2005-04-26 00:00:00,NaT,BenW besluit BB54473 inzake deelname gem Assen...,\n\n\nÀ \n\nfifi Gemeente Assen \n\n\n\n \n...
2505,2538,32.5,Document,2005-04-01 00:00:00,NaT,Persbericht Energy Valley_,\n\n\n\n\n32.5 \n \n\nÓ Gemeente Assen \n...
2506,2539,3.35,Mail,2004-12-13 00:00:00,2016-09-09 10:47:53,Melding betoging 24 september NAM,\n\n\n\n\n\n\nDate : 9-9-2016 10:47:53 \nF B n...
2507,2540,3.40,Document,2004-12-13 00:00:00,NaT,Locaties in Groningen_,\n\n\nCentrale meldkamer NAM Assen (CMK) \n\nB...


In [6]:
# Drop the "id" and "Unnamed: 0" columns, then expose the document "type"
# under the column name "keywords".
df = (
    df
    .drop(columns=["id", "Unnamed: 0"])
    .rename(columns={"type": "keywords"})
)


In [7]:
# Replace every newline character in the abstracts with a single space.
# regex=False states explicitly that "\n" is a literal, not a pattern.
abstract_one_line = df["abstract"].str.replace("\n", " ", regex=False)
df["abstract"] = abstract_one_line

In [8]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Keep only tokens made of an ASCII letter followed by at least one more word
# character; everything the pattern does not match is discarded.
tokenizer = RegexpTokenizer(r'[a-zA-Z]\w+')

# Build the detokenizer once instead of once per row inside the lambda.
detokenizer = TreebankWordDetokenizer()

# Tokenise each abstract and join the surviving tokens back into one string.
df["abstract"] = df["abstract"].apply(
    lambda text: detokenizer.detokenize(tokenizer.tokenize(text))
)

In [9]:
# Inspect the frame after the column cleanup and abstract tokenisation.
df

Unnamed: 0,keywords,date,betterDate,title,abstract
0,Overig,2015,NaT,Verzoek_regulier__facultatief_advies_uitgebr_p...,Ons briefkenmerk Geachte heer mevrouw Dn heeft...
1,Correspondentie,2007-2013,NaT,Toezicht_ Correspondentie n.a.v. Toezicht en H...,STADSBALIE ARCHIEF GEMEENTE ASSNT Va TEM Neder...
2,Mail,??,NaT,FW Draaiboek demonstratie NAM AZC 24 sept...,Date To Marco Out assen nl Subject FW Draaiboe...
3,Correspondentie,,NaT,Uitgaande mail mededeling akkoord bijdrage aan...,bijdrage gemeente Assen Pagina Van Aan energyv...
4,Vergunning,1997,NaT,Rapportage n.a.v. vergunningen-meldingen.pdf_j...,cle Size Date Time MD5 checksum ec8b73ddddb116...
...,...,...,...,...,...
2504,Bestuurlijk besluit,2005-04-26 00:00:00,NaT,BenW besluit BB54473 inzake deelname gem Assen...,fifi Gemeente Assen Voorstel ter besluitvormin...
2505,Document,2005-04-01 00:00:00,NaT,Persbericht Energy Valley_,Gemeente Assen GEMEENIE Afdeling Communicatie ...
2506,Mail,2004-12-13 00:00:00,2016-09-09 10:47:53,Melding betoging 24 september NAM,Date nitie ucefe nsic nl To po ii nl Subject M...
2507,Document,2004-12-13 00:00:00,NaT,Locaties in Groningen_,Centrale meldkamer NAM Assen CMK Beveiliging S...


In [10]:
# Write the cleaned frame to cleaned.csv, leaving out the row index.
df.to_csv(path_or_buf="cleaned.csv", index=False)

# perform semantic analysis

In [11]:
from asreviewcontrib.semantic_clustering.semantic_clustering import run_clustering_steps

In [12]:
# Load cleaned.csv through ASReview's load_data; the resulting object is what
# run_clustering_steps receives as its first argument.
data = load_data('cleaned.csv')

In [13]:
# Model identifier handed to run_clustering_steps as its `transformer` argument.
transformer = "pdelobelle/robbert-v2-dutch-base"

In [14]:
# Cluster the loaded records and write the result to clustered.csv.
# NOTE: the recorded run spent about 34 minutes generating embeddings.
run_clustering_steps(data, 'clustered.csv', transformer=transformer)

Loading data...
Loading tokenizer and model pdelobelle/robbert-v2-dutch-base...


Downloading: 100%|██████████| 446M/446M [00:40<00:00, 11.6MB/s]


Tokenizing abstracts...


100%|██████████| 2509/2509 [00:07<00:00, 331.69it/s]


Generating embeddings...


100%|██████████| 2509/2509 [33:50<00:00,  1.24it/s]


Running PCA...
Running t-SNE...




Calculating optimal number of clusters...
Optimal number of clusters:  6
Running k-means...
Creating file clustered.csv...


# restore original abstracts

In [16]:
# Read the clustering output back in as a plain DataFrame.
data = pd.read_csv('clustered.csv')

# Fetch the untouched abstracts from the preprocessed Excel file.
# Forward slashes replace the backslash path, which contained the invalid
# escape sequence "\p" and only resolved on Windows.
old_abstracts = pd.read_excel("../pre-processing/preprocessed.xlsx")['abstract']

In [17]:
# Put the original abstracts back in place of the tokenised ones.
# The assignment aligns on the index, so a row-count mismatch would silently
# leave NaN or truncated abstracts; fail loudly instead.
if len(data) != len(old_abstracts):
    raise ValueError(
        f"Cannot restore abstracts: clustered data has {len(data)} rows "
        f"but the preprocessed file has {len(old_abstracts)}."
    )
data['abstract'] = old_abstracts

In [26]:
# Overwrite clustered.csv so that it carries the original abstracts
# (the row index is not written).
data.to_csv(path_or_buf='clustered.csv', index=False)