In [5]:
from pathlib import Path
import os

# Directory that holds the dataset parquet (.pq) files. The path is relative,
# so it is resolved against the notebook's current working directory.
dataset_path = Path("../../dataset/")

In [6]:
# List the parquet files available in the dataset directory.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# displayed listing stable across runs and filesystems.
sorted(name for name in os.listdir(dataset_path) if name.endswith('.pq'))

['lemmatized_test_df_dataset_1.pq',
 'lemmatized_train_df_dataset_1.pq',
 'lemmatized_validation_df_dataset_1.pq',
 'outliers_df.pq',
 'test_df_dataset_1.pq',
 'test_df_dataset_2.pq',
 'test_df_dataset_3.pq',
 'test_df_dataset_4.pq',
 'test_df_dataset_5.pq',
 'train_df_dataset_1.pq',
 'train_df_dataset_2.pq',
 'train_df_dataset_3.pq',
 'train_df_dataset_4.pq',
 'train_df_dataset_5.pq',
 'validation_df_dataset_1.pq',
 'validation_df_dataset_2.pq',
 'validation_df_dataset_3.pq',
 'validation_df_dataset_4.pq',
 'validation_df_dataset_5.pq']

In [3]:
import spacy

# Load the spaCy `en_core_web_lg` pipeline with the "parser" and "ner"
# components disabled; the lemmatization code in this notebook only reads
# `token.lemma_` from the processed documents.
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy-cuda11x, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



In [4]:
# Split names and dataset numbers; together they form the parquet file names
# of the shape "<split>_df_dataset_<index>.pq".
splits = ['train', 'test', 'validation']
dataset_indices = [1,2,3,4,5]

In [8]:
import gc
import pandas as pd
from logging import Logger, StreamHandler

# Stand-alone logger whose records are emitted through a StreamHandler
# (which writes to stderr by default).
# NOTE(review): the logging docs recommend logging.getLogger(name) instead of
# instantiating Logger directly; kept as-is so the emitted levels do not change.
logger = Logger("dummy")
logger.addHandler(StreamHandler())

# Use all but one CPU core, and never fewer than one worker.
# os.cpu_count() returns None when the core count cannot be determined, which
# would make the subtraction raise TypeError, so fall back to 1 in that case.
n_workers = max(1, (os.cpu_count() or 1) - 1)

In [5]:
batch_size = 384

def lemmatize_sentences(sentences, join=True):
    """Lemmatize every sentence with the module-level spaCy pipeline `nlp`.

    Args:
        sentences: iterable of sentences. An item that is a `list` is joined
            with single spaces first; any other item is passed to spaCy as-is.
        join: when True each result is the lemmas joined by single spaces;
            when False each result is the list of lemma strings.

    Returns:
        A list with one entry per input sentence, in input order.
    """
    # TODO: inspecting only the first element would be enough to decide
    # whether the inputs are pre-tokenized lists.
    texts = []
    for sentence in sentences:
        if isinstance(sentence, list):
            sentence = ' '.join(sentence)
        texts.append(sentence)

    docs = nlp.pipe(
        texts,
        batch_size=batch_size,
        disable=["parser", "ner"],
        n_process=n_workers,
    )

    def _lemmas(doc):
        # Collect the lemma of every token, optionally as one string.
        lemmas = [token.lemma_ for token in doc]
        return ' '.join(lemmas) if join else lemmas

    return [_lemmas(doc) for doc in docs]


# Lemmatize the 'abstract' column of every split of dataset 1 and write each
# result next to its source file with a "lemmatized_" prefix.
for dataset_index in (1,):
    logger.info(f"Processing dataset - {dataset_index}")

    for split in splits:
        logger.info(f"Processing split {split}")

        # Resolve both file locations up front; they differ only by the prefix.
        source_file = dataset_path / f"{split}_df_dataset_{dataset_index}.pq"
        target_name = dataset_path / f"lemmatized_{split}_df_dataset_{dataset_index}.pq"

        df = pd.read_parquet(source_file)
        logger.info(f"Read df of shape {df.shape}")

        # The lemmatized text replaces the original column in place.
        raw_abstracts = df['abstract'].tolist()
        df['abstract'] = lemmatize_sentences(sentences=raw_abstracts)
        logger.info(f"Finished Processing df")

        df.to_parquet(target_name)
        logger.info(f"Saved processed df to {target_name}")

Processing dataset - 1
Processing split train
Read df of shape (903265, 6)
Finished Processing df
Saved processed df to ../../dataset/lemmatized_train_df_dataset_1.pq
Processing split test
Read df of shape (903489, 6)
Finished Processing df
Saved processed df to ../../dataset/lemmatized_test_df_dataset_1.pq
Processing split validation
Read df of shape (451593, 6)
Finished Processing df
Saved processed df to ../../dataset/lemmatized_validation_df_dataset_1.pq
Processing dataset - 2
Processing split train
Read df of shape (1354937, 6)


KeyboardInterrupt: 

In [9]:
# Read the three lemmatized splits of dataset 1 (test, train, validation, in
# that order) and stack them into one frame; the last line shows its shape.
df1, df2, df3 = [
    pd.read_parquet(dataset_path / file_name)
    for file_name in (
        'lemmatized_test_df_dataset_1.pq',
        'lemmatized_train_df_dataset_1.pq',
        'lemmatized_validation_df_dataset_1.pq',
    )
]
df = pd.concat((df1, df2, df3))
df.shape

(2258347, 6)

In [18]:
# Build the lemmatized files for datasets 2-5 by looking up each row's
# abstract in `df` by index label instead of running the lemmatizer again.
# NOTE(review): `df` is expected to be the concatenated lemmatized dataset-1
# frame, and the dataset variants are assumed to share index labels - confirm.
splits = ['train', 'test', 'validation']
dataset_indices = [2,3,4,5]

# The lookup needs exactly one abstract per index label; duplicate labels
# would make the left join below emit extra rows.
if not df.index.is_unique:
    raise ValueError("Lemmatized lookup frame has duplicate index labels; cannot join one-to-one")

for dataset_index in dataset_indices:
    print(f"Processing index: {dataset_index}")
    for split in splits:
        print(f"Processing split {split}")
        source_df = pd.read_parquet(dataset_path / f"{split}_df_dataset_{dataset_index}.pq")
        print(f"source shape: {source_df.shape}")

        # A row whose index label is absent from the lookup would silently get
        # a null abstract from the left join; fail before writing such a file.
        has_lemmatized = source_df.index.isin(df.index)
        if not has_lemmatized.all():
            n_missing = int((~has_lemmatized).sum())
            raise ValueError(
                f"{n_missing} rows of {split}_df_dataset_{dataset_index} have no lemmatized abstract"
            )

        # Join on the index, overwrite the original column in place (which
        # keeps the column order), then drop the helper column.
        source_df = source_df.join(df[['abstract']], rsuffix='_right')
        source_df['abstract'] = source_df['abstract_right']
        source_df = source_df.drop(columns=['abstract_right'])

        target_name = dataset_path / f"lemmatized_{split}_df_dataset_{dataset_index}.pq"
        print(f"new source shape: {source_df.shape}")
        source_df.to_parquet(target_name)

Processing index: 2
Processing split train
source shape: (1354937, 6)
new source shape: (1354937, 6)
Processing split test
source shape: (677658, 6)
new source shape: (677658, 6)
Processing split validation
source shape: (225752, 6)
new source shape: (225752, 6)
Processing index: 3
Processing split train
source shape: (1354937, 6)
new source shape: (1354937, 6)
Processing split test
source shape: (451817, 6)
new source shape: (451817, 6)
Processing split validation
source shape: (451593, 6)
new source shape: (451593, 6)
Processing index: 4
Processing split train
source shape: (1580762, 6)
new source shape: (1580762, 6)
Processing split test
source shape: (451833, 6)
new source shape: (451833, 6)
Processing split validation
source shape: (225752, 6)
new source shape: (225752, 6)
Processing index: 5
Processing split train
source shape: (112830, 6)
new source shape: (112830, 6)
Processing split test
source shape: (112830, 6)
new source shape: (112830, 6)
Processing split validation
source