In [None]:
import warnings

import gdown
import pandas as pd

# Filmstarts dataset: reviews and their labels live in two separate
# tab-separated files on Google Drive.

reviews_file_id = "17l-LzyTfB_1FlIpQDAlV2a0Yestyu-CI"
labeled_file_id = "12t-EeIEeMq1tdQRkYbP6XFohhdAHIh0o"

reviews_file_path = "reviews.csv"
labeled_file_path = "labels.csv"

gdown.download(f"https://drive.google.com/uc?id={reviews_file_id}", reviews_file_path, quiet=False)
gdown.download(f"https://drive.google.com/uc?id={labeled_file_id}", labeled_file_path, quiet=False)

reviews_df = pd.read_csv(reviews_file_path, sep='\t', header=None, names=['URL', 'Rating', 'Review'], on_bad_lines='skip')
labeled_df = pd.read_csv(labeled_file_path, sep='\t', header=None, names=['Label'])

# The two frames are joined purely by row position further down. Because
# malformed review lines are skipped on read, a dropped line would shift every
# later review against its label, so make a length mismatch visible.
if len(reviews_df) != len(labeled_df):
    warnings.warn(
        f"Row count mismatch: {len(reviews_df)} reviews vs {len(labeled_df)} labels; "
        "the positional join below may pair reviews with the wrong labels."
    )

# Each label line looks like "__label__neutral 5.0": split it into the
# sentiment tag and a numeric score (non-numeric scores become NaN).
labeled_df[['Sentiment', 'Score']] = labeled_df['Label'].str.split(' ', n=1, expand=True)
labeled_df['Score'] = pd.to_numeric(labeled_df['Score'], errors='coerce')

# The 'Rating' column of the reviews file is not used downstream.
reviews_df = reviews_df.drop(columns=['Rating'])

# Side-by-side (positional) join, then keep only rows that have both a review
# text and a parsed score.
combined_df = pd.concat([reviews_df, labeled_df], axis=1)
combined_df = combined_df.dropna(subset=['Review', 'Score'])
combined_df.head()


Downloading...
From: https://drive.google.com/uc?id=17l-LzyTfB_1FlIpQDAlV2a0Yestyu-CI
To: /content/reviews.csv
100%|██████████| 63.3M/63.3M [00:00<00:00, 86.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=12t-EeIEeMq1tdQRkYbP6XFohhdAHIh0o
To: /content/labels.csv
100%|██████████| 1.56M/1.56M [00:00<00:00, 94.6MB/s]


Unnamed: 0,URL,Review,Label,Sentiment,Score
0,http://www.filmstarts.de/kritiken/27070.html,Der Herr der Ringe - Die Gefährten ist für mic...,__label__neutral 5.0,__label__neutral,5.0
1,http://www.filmstarts.de/kritiken/27070.html,"Ein Ring sie zu knechten, Sie alle zu finden, ...",__label__neutral 5.0,__label__neutral,5.0
2,http://www.filmstarts.de/kritiken/27070.html,Der Herr der Ringe - Die Gefährten <> der best...,__label__neutral 5.0,__label__neutral,5.0
3,http://www.filmstarts.de/kritiken/27070.html,Was Peter Jackson hier erschaffen hat..ist ein...,__label__neutral 5.0,__label__neutral,5.0
4,http://www.filmstarts.de/kritiken/27070.html,ich habe es schon so oft gesehen ich gucke es ...,__label__neutral 5.0,__label__neutral,5.0


In [None]:
# Candidate German modal particles (lower-case) that the detector looks for.
# NOTE(review): the detector compares these against single spaCy tokens, so the
# two-word entry "nun einmal" presumably never matches - confirm whether it
# should be handled separately.
MODAL_PARTICLES = [
    "halt",
    "eben",
    "eigentlich",
    "wirklich",
    "mal",
    "ja",
    "ruhig",
    "nun einmal",
    "doch",
    "schon",
]


In [None]:
# Install the small German spaCy pipeline that spacy.load("de_core_news_sm")
# needs. As the installer output notes, the runtime may have to be restarted
# before the package can be loaded.
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# German spaCy pipeline (tokens, POS tags, sentence boundaries).
nlp = spacy.load("de_core_news_sm")

# Tokenizer and encoder are loaded from the same pretrained German BERT
# checkpoint so that their vocabularies agree.
BERT_CHECKPOINT = "bert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
def _first_subword_index(encoding, input_ids, token):
    """Locate the first BERT sub-word of a spaCy ``token`` in the encoded input.

    Returns the position in the encoded sequence, or ``None`` when the token is
    not present (for example because it was cut off by truncation).
    """
    if getattr(encoding, "is_fast", False):
        # Character offsets identify this exact occurrence, even when the same
        # word (or the same sub-word id) appears earlier in the sentence.
        return encoding.char_to_token(0, token.idx)

    # Slow tokenizers carry no offsets: fall back to searching for the id of
    # the token's first sub-word, which finds the first occurrence only.
    piece_ids = tokenizer.encode(token.text, add_special_tokens=False)
    if not piece_ids or piece_ids[0] not in input_ids:
        return None
    return input_ids.index(piece_ids[0])


def classify_modal_particle_bert_grammar(sentence):
    """Return the modal-particle candidates in ``sentence`` that pass all checks.

    A spaCy token is reported (lower-cased) when all of the following hold:

    * its lower-cased text is listed in ``MODAL_PARTICLES``;
    * spaCy tags it as ``PART``, ``ADV`` or ``INTJ``;
    * the cosine similarity between the token's last-layer BERT vector and the
      mean of the vectors in a window reaching two sub-word positions to either
      side (the token itself included) is greater than 0.5;
    * re-parsing the text with that token removed yields the same number of
      sentences as the original parse.

    Args:
        sentence: Text to analyse. Blank or non-string input yields ``[]``.

    Returns:
        list[str]: One lower-cased entry per accepted token, in order of
        appearance; the same particle may occur several times.

    Note:
        Matching is done per spaCy token, so multi-word entries of
        ``MODAL_PARTICLES`` are not detected. Tokens that fall outside the
        tokenizer's truncation limit are skipped.
    """
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-1]
    input_ids = inputs["input_ids"][0].tolist()

    doc = nlp(sentence)
    sentence_count = len(list(doc.sents))
    modal_particles = []

    for token in doc:
        particle = token.text.lower()
        if particle not in MODAL_PARTICLES:
            continue
        # Cheapest grammatical filter first, before any vector work or re-parsing.
        if token.pos_ not in ("PART", "ADV", "INTJ"):
            continue

        token_index = _first_subword_index(inputs, input_ids, token)
        if token_index is None:
            continue

        # Window of up to two sub-word positions on each side, clipped to the sequence.
        start_idx = max(token_index - 2, 0)
        end_idx = min(token_index + 3, hidden_states.shape[1])

        token_vector = hidden_states[0][token_index].unsqueeze(0)
        context_mean = hidden_states[0][start_idx:end_idx].mean(dim=0).unsqueeze(0)
        if torch.isnan(context_mean).any() or torch.isnan(token_vector).any():
            continue

        similarity = cosine_similarity(token_vector, context_mean)[0][0]
        if not similarity > 0.5:
            continue

        # Deletion test: dropping the token must not change the sentence count.
        remainder = " ".join(t.text for t in doc if t.i != token.i)
        if len(list(nlp(remainder).sents)) == sentence_count:
            modal_particles.append(particle)

    return modal_particles


In [None]:
# Divide the dataset into 4 contiguous chunks of (almost) equal size.
# Plain positional slicing is used so the cell needs no numpy import and does
# not route a DataFrame through np.array_split; when the row count is not
# divisible by 4, the first chunks receive one extra row each.
N_CHUNKS = 4
base_size, extra_rows = divmod(len(combined_df), N_CHUNKS)

chunks = []
start = 0
for i in range(N_CHUNKS):
    stop = start + base_size + (1 if i < extra_rows else 0)
    chunks.append(combined_df.iloc[start:stop])
    start = stop

# Save each chunk as its own CSV file.
for i, chunk in enumerate(chunks):
    chunk.to_csv(f"combined_chunk_{i}.csv", index=False)


  return bound(*args, **kwds)


In [None]:
from tqdm.notebook import tqdm
import json

CHUNK_ID = 0             # which combined_chunk_<n>.csv this run processes
CHECKPOINT_EVERY = 100   # write a progress file every this many reviews


def _save_particles(records, path):
    """Write the collected records to ``path`` as UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


# One record per detected particle: {'Review': <text>, 'Particle': <particle>}.
particle_sentences = []

chunk_df = pd.read_csv(f"combined_chunk_{CHUNK_ID}.csv")

# Iterate over the review texts directly; idx is the running row position.
for idx, text in tqdm(enumerate(chunk_df['Review']), total=len(chunk_df)):
    try:
        detected_particles = classify_modal_particle_bert_grammar(text)
    except Exception as e:
        # Best effort: report the failing row and carry on with the next one.
        print(f"Error at index {idx}: {e}")
    else:
        particle_sentences.extend(
            {'Review': text, 'Particle': particle} for particle in detected_particles
        )

    # Periodic checkpoint; it also runs when the current row failed, so a
    # failure on a checkpoint row does not skip the save.
    if idx % CHECKPOINT_EVERY == 0:
        _save_particles(particle_sentences, f'modal_particle_progress_chunk{CHUNK_ID}.json')

# Final save.
_save_particles(particle_sentences, f'modal_particle_final_chunk{CHUNK_ID}.json')


  0%|          | 0/17768 [00:00<?, ?it/s]

In [None]:
# Offer the final results file as a browser download when running in Google
# Colab. The google.colab package only exists inside Colab, so elsewhere the
# download is skipped instead of aborting the notebook with an ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('modal_particle_final_chunk0.json')
else:
    print("Not running in Colab; 'modal_particle_final_chunk0.json' was written to the working directory.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the saved detections (one record per review/particle pair) back into a
# DataFrame and preview the first rows.
df_json = pd.read_json("modal_particle_final_chunk0.json", encoding="utf-8")
df_json.head()

Unnamed: 0,Review,Particle
0,Was Peter Jackson hier erschaffen hat..ist ein...,doch
1,ich habe es schon so oft gesehen ich gucke es ...,schon
2,ich habe es schon so oft gesehen ich gucke es ...,mal
3,Eigentlich nur ganz kurz: wer Herr der Ringe n...,eigentlich
4,Es ist halt ein Film. In der extended Edition ...,halt


In [None]:
# Number of detections per particle, most frequent first.
df_json['Particle'].value_counts()


Unnamed: 0_level_0,count
Particle,Unnamed: 1_level_1
schon,3539
wirklich,3529
mal,2841
doch,1893
ja,1661
eigentlich,1324
eben,762
halt,216
ruhig,138


In [None]:
# The number of detections per particle in chunk 0 is sufficient for this study.

# Draw up to 100 detections per particle with a fixed seed.
# Each group is sampled explicitly rather than through groupby.apply: this
# keeps the 'Particle' column in the result, avoids the pandas warning emitted
# for apply operating on the grouping column, and caps the sample size at the
# group size so a rare particle cannot make the cell fail.
SAMPLES_PER_PARTICLE = 100

particle_samples = [
    group.sample(n=min(len(group), SAMPLES_PER_PARTICLE), random_state=42)
    for _, group in df_json.groupby('Particle')
]

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
sampled_df = (
    pd.concat(particle_samples) if particle_samples else df_json.iloc[0:0]
).reset_index(drop=True)


  sampled_df = df_json.groupby('Particle', group_keys=False).apply(lambda x: x.sample(n=100, random_state=42)).reset_index(drop=True)
