In [1]:
# Attach Google Drive to the Colab filesystem so the cached data below can be read and written
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Imports
import os
import json
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import re
from transformers import BertModel, BertTokenizer
import torch

In [3]:
# Read (or prepare and write) the data
#   mode == 'write': parse every raw JSON file under `path_to_data`, strip the HTML markup
#                    from its 'data' field and cache the resulting plain texts as a pickle.
#   any other mode : load the previously cached list of plain texts from the pickle.
# Either way the cell leaves the list of document texts in `data`.
mode = 'read' # 'write' or 'read'
path_to_pkl = '/content/drive/MyDrive/data_legal/data.pkl' # path to the ready-to-use pkl data
if mode == 'write':
    # To write the data
    path_to_data = 'data/'
    # os.listdir returns entries in arbitrary order; sort them so the document order
    # (and therefore every downstream row index) is reproducible between runs.
    file_names = sorted(os.listdir(path_to_data))
    print('Number of files: ', len(file_names))
    data = []
    for file_name in tqdm(file_names):
        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
        # on the platform default, which would garble the Turkish characters elsewhere.
        with open(os.path.join(path_to_data, file_name), 'r', encoding='utf-8') as f:
            raw_html = json.load(f)['data']
        # Keep only the visible text of the HTML payload
        data.append(BeautifulSoup(raw_html, 'html.parser').get_text())
    with open(path_to_pkl, 'wb') as f:
        pickle.dump(data, f)
else:
    # To read the data
    # NOTE: unpickling can execute arbitrary code - only load a pickle you produced yourself.
    with open(path_to_pkl, 'rb') as f:
        data = pickle.load(f)

In [4]:
# Extract the queries
# For every document, look for the passage that starts with "Dava dilekçesinde", contains a
# ';' and runs up to a lowercase letter followed by '.', an optional space and an uppercase
# letter (the first letter of the next sentence). The text between the first ';' and the
# next ';' (or the end of the passage) becomes the query; documents without such a passage
# get a None query. The matched passages are then cut out of the document text.
# NOTE: this cell rewrites `data` in place, so it must run exactly once on freshly loaded
# `data` - a second run no longer finds the passages it already removed.
pattern = re.compile(r"[Dd]ava dilekçesinde.*?;.*?[qwertyuıopğüasdfghjklşizxcvbnmöç]\.[ ]?[QWERTYUIOPĞÜASDFGHJKLŞİZXCVBNMÖÇ]")
queries = []
for i in tqdm(range(len(data))):
    match = pattern.search(data[i])
    if match is None:
        queries.append(None)
        continue
    # The last matched character is the first letter of the *following* sentence, not part
    # of the passage itself, so drop it before slicing out the query.
    passage = match.group(0)[:-1]
    queries.append(passage.split(';')[1].strip())
    # Remove every matching passage, giving each one back its *own* trailing uppercase
    # letter so the sentence that follows it stays intact.
    data[i] = pattern.sub(lambda m: m.group(0)[-1], data[i])

100%|██████████| 446094/446094 [01:26<00:00, 5167.50it/s]


In [5]:
# Successfully extracted query-doc pairs
# Report how many documents produced no query and which share of documents did produce one.
total_count = len(queries)
null_count = queries.count(None)
print(f"Number of null elements in queries: {null_count}")
non_null_count = total_count - null_count
# Guard the division so an empty `queries` list reports 0% instead of raising ZeroDivisionError.
ratio = round(non_null_count / total_count, 4) if total_count else 0.0
# Use the percent format spec instead of printing `ratio*100`: the multiplication can expose
# binary floating-point noise (e.g. 0.07 * 100 == 7.000000000000001).
print(f"Successful pair rate: {ratio:.2%}")

Number of null elements in queries: 158625
Successful pair rate: 64.44%


In [6]:
# Collect the aligned (document, query) pairs into column lists,
# skipping every position where either side is missing
dataset_dict = {'doc': [], 'query': []}
for idx in tqdm(range(len(data))):
    query_text = queries[idx]
    if query_text is None:
        continue
    doc_text = data[idx]
    if doc_text is None:
        continue
    dataset_dict['doc'].append(doc_text)
    dataset_dict['query'].append(query_text)

100%|██████████| 446094/446094 [00:00<00:00, 1301597.24it/s]


In [7]:
# The raw lists have been copied into dataset_dict; drop their names
del data, queries

In [8]:
# Turn the 'doc' / 'query' column lists into a DataFrame and display it
dataset = pd.DataFrame(data=dataset_dict)
dataset

Unnamed: 0,doc,query
0,T.C.İSTANBUL5. ASLİYE TİCARET MAHKEMESİ ESAS ...,Taraflar ve tarafların dışındaki kişilerle yap...
1,T.C.İSTANBUL7. ASLİYE TİCARET MAHKEMESİESAS N...,müvekkilinin ... Bankası A.Ş. ... Şubesi'ne ai...
2,T.C.ANTALYA3. ASLİYE TİCARET MAHKEMESİESAS NO...,müvekkili ile davalı şirket arasında ticari il...
3,T.C. ...,Müvekkili şirket ile davalı arasında icra dosy...
4,T.C. İstanbul Anadolu 8. ASLİYE TİCARET MAHK...,Müvekkilinin ---------pay sahiplerinden olduğu...
...,...,...
287464,....TÜRK MİLLETİ ADINA\tT.C.\tBURSA1. ASLİYE...,davacı ve davalı arasında olan ticari ilişki n...
287465,T.C. İstanbul Anadolu 1. ASLİYE TİCARET MAHK...,davacının belirtilen tarihler arasında çalıştı...
287466,T.C. İstanbul Anadolu 3. ASLİYE TİCARET MAHK...,Müvekkili ----- başlangıç tarihli---- poliçesi...
287467,T.C.İSTANBULBÖLGE ADLİYE MAHKEMESİ14. HUKUK D...,müvekkilinin davalıya satmış olduğu mal ve hiz...


In [9]:
# The doc/query pairs now live in the `dataset` DataFrame, so drop the intermediate dict of lists
del dataset_dict

In [10]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Fetch the pre-trained Turkish BERT tokenizer and encoder,
# placing the encoder on the selected device right away
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [11]:
# Pull the two DataFrame columns out as plain Python lists
queries, documents = (dataset[column].tolist() for column in ('query', 'doc'))

In [13]:
# Find BERT representation vectors for the queries in batches
# Each query is represented by the mean of its final-layer token vectors, one row per query
# in `embeddings_arr_queries` (row order follows `queries`).
batch_size = 128
# Start from NaN rather than uninitialised memory: if the loop is interrupted, the rows that
# were never computed are recognisable instead of holding arbitrary values.
embeddings_arr_queries = np.full((len(queries), model.config.hidden_size), np.nan, dtype=np.float32)
for i in tqdm(range(0, len(queries), batch_size)):
    batch_queries = queries[i:i + batch_size]

    # Tokenize and convert to tensors. Shorter texts are padded to the longest text in the
    # batch; longer ones are cut at the model's position limit (stated explicitly so the
    # cut does not depend on the tokenizer's own configuration).
    inputs = tokenizer(
        batch_queries,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
        padding=True,
    )

    # Move the inputs to the GPU if available
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass through the BERT model
    with torch.no_grad():
        outputs = model(**inputs)

    # Average only over real tokens. A plain mean over the sequence axis would also average
    # the padding positions, making a text's vector depend on what else is in its batch.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
    token_sum = (outputs.last_hidden_state * token_mask).sum(dim=1)
    token_count = token_mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for an all-padding row
    embeddings = (token_sum / token_count).cpu().numpy()

    # Fill the batch embeddings in the result array
    embeddings_arr_queries[i:i + len(batch_queries)] = embeddings

  0%|          | 8/2246 [00:46<3:38:57,  5.87s/it]


KeyboardInterrupt: ignored

In [None]:
# Persist the query embedding matrix to Drive as a pickle
query_embeddings_path = '/content/drive/MyDrive/data_legal/query_embeddings.pkl'
with open(query_embeddings_path, 'wb') as out_file:
    pickle.dump(embeddings_arr_queries, out_file)

In [None]:
# Find BERT representation vectors for the documents in batches
# Each document is represented by the mean of its final-layer token vectors, one row per
# document in `embeddings_arr_documents` (row order follows `documents`).
# NOTE: texts longer than the model's position limit are truncated, so only the beginning
# of a long document contributes to its vector.
batch_size = 128
# Start from NaN rather than uninitialised memory: if the loop is interrupted, the rows that
# were never computed are recognisable instead of holding arbitrary values.
embeddings_arr_documents = np.full((len(documents), model.config.hidden_size), np.nan, dtype=np.float32)
# Progress bar added for parity with the query loop - this is the longer of the two runs.
for i in tqdm(range(0, len(documents), batch_size)):
    batch_documents = documents[i:i + batch_size]

    # Tokenize and convert to tensors. Shorter texts are padded to the longest text in the
    # batch; longer ones are cut at the model's position limit (stated explicitly so the
    # cut does not depend on the tokenizer's own configuration).
    inputs = tokenizer(
        batch_documents,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
        padding=True,
    )

    # Move the inputs to the GPU if available
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass through the BERT model
    with torch.no_grad():
        outputs = model(**inputs)

    # Average only over real tokens. A plain mean over the sequence axis would also average
    # the padding positions, making a text's vector depend on what else is in its batch.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
    token_sum = (outputs.last_hidden_state * token_mask).sum(dim=1)
    token_count = token_mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for an all-padding row
    embeddings = (token_sum / token_count).cpu().numpy()

    # Fill the batch embeddings in the result array
    embeddings_arr_documents[i:i + len(batch_documents)] = embeddings

In [None]:
# Persist the document embedding matrix to Drive as a pickle
document_embeddings_path = '/content/drive/MyDrive/data_legal/document_embeddings.pkl'
with open(document_embeddings_path, 'wb') as out_file:
    pickle.dump(embeddings_arr_documents, out_file)

In [None]:
# Print the dimensions of both embedding matrices (queries first, then documents)
for embedding_matrix in (embeddings_arr_queries, embeddings_arr_documents):
    print(embedding_matrix.shape)