# Preprocessing

In [1]:
from sklearn.preprocessing import FunctionTransformer

In [2]:
def lowercase_transformer(X):
    """Lower-case every string element of ``X``.

    Parameters
    ----------
    X : object exposing the pandas ``.str`` accessor (e.g. a Series of text).

    Returns
    -------
    The result of ``X.str.lower()``; the index of ``X`` is preserved.
    """
    lowered = X.str.lower()
    return lowered


# Rebind the name to a scikit-learn transformer wrapping the function above,
# so it can be used directly as a Pipeline step.
lowercase_transformer = FunctionTransformer(func=lowercase_transformer)

In [3]:
def remove_numbers_transformer(X):
    """Delete every run of digits from the elements of ``X``.

    The input is first cast to ``str`` (so non-string values are converted to
    their string form), then each match of ``\\d+`` is replaced by the empty
    string. Surrounding whitespace is left untouched.

    Parameters
    ----------
    X : pandas Series (or similar) supporting ``astype`` and ``.str``.

    Returns
    -------
    Series of strings with digit runs removed.
    """
    as_text = X.astype(str)
    without_digits = as_text.str.replace(r'\d+', '', regex=True)
    return without_digits


# Rebind the name to a scikit-learn transformer wrapping the function above,
# so it can be used directly as a Pipeline step.
remove_numbers_transformer = FunctionTransformer(func=remove_numbers_transformer)

In [4]:
def remove_punctuation_transformer(X):
    """Strip every character that is neither a word character nor whitespace.

    Uses the pattern ``[^\\w\\s]``, so letters, digits, underscores and
    whitespace are kept; matched characters are deleted (not replaced by a
    space), which joins the text on either side of them.

    Parameters
    ----------
    X : object exposing the pandas ``.str`` accessor.

    Returns
    -------
    The result of the ``.str.replace`` call, index preserved.
    """
    cleaned = X.str.replace(r'[^\w\s]', '', regex=True)
    return cleaned


# Rebind the name to a scikit-learn transformer wrapping the function above,
# so it can be used directly as a Pipeline step.
remove_punctuation_transformer = FunctionTransformer(func=remove_punctuation_transformer)

In [5]:
def remove_whitespace_transformer(X):
    """Trim leading and trailing whitespace from every element of ``X``.

    Only the ends of each string are stripped; whitespace between words is
    not collapsed here.

    Parameters
    ----------
    X : object exposing the pandas ``.str`` accessor.

    Returns
    -------
    The result of ``X.str.strip()``, index preserved.
    """
    trimmed = X.str.strip()
    return trimmed


# Rebind the name to a scikit-learn transformer wrapping the function above,
# so it can be used directly as a Pipeline step.
remove_whitespace_transformer = FunctionTransformer(func=remove_whitespace_transformer)

In [6]:
import nltk
import os

# Keep NLTK corpora in a dedicated per-user directory and make sure it exists.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
nltk_data_dir = os.path.expanduser('~/Tools/nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the directory with NLTK's search path; the guard keeps re-runs of
# this cell from appending the same entry over and over.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# The stopwords corpus is required further down. Download it only when it
# cannot be found, so a fresh environment works without a manual step and
# later runs do not touch the network.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', download_dir=nltk_data_dir)


def load_custom_stopwords(file_path):
    """Load a stopword list from a text file with one word per line.

    Each line is stripped of surrounding whitespace and lower-cased. Blank
    lines are skipped so the empty string never ends up in the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the stopword file; it is read as UTF-8.

    Returns
    -------
    set of str
        The unique, normalized stopwords.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, which
    # would mis-decode (or fail on) non-ASCII entries.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        stopwords = {word.lower() for word in stripped_lines if word}
    return stopwords


# Stopwords come from two sources: NLTK's 'indonesian' list and the project's
# own file; the set used for filtering is their union.
nltk_stopwords = set(nltk.corpus.stopwords.words('indonesian'))
custom_stopwords = load_custom_stopwords('../utils/stopwords-id.txt')
all_stopwords = nltk_stopwords | custom_stopwords


def remove_stopwords(X):
    """Drop stopwords from every text in ``X``.

    Each element is split on whitespace; a token is discarded when its
    lower-cased form is in the module-level ``all_stopwords`` set. Surviving
    tokens are re-joined with single spaces.

    Parameters
    ----------
    X : pandas Series (or similar with ``apply``) whose elements are strings.

    Returns
    -------
    The result of ``X.apply`` with the filtered text for each element.
    """
    def _strip_stopwords(text):
        kept = [token for token in text.split() if token.lower() not in all_stopwords]
        return ' '.join(kept)

    return X.apply(_strip_stopwords)


# Expose the stopword filter as a scikit-learn transformer for the pipeline.
remove_stopwords_transformer = FunctionTransformer(func=remove_stopwords)

In [7]:
def remove_short_words_transformer(min_length):
    """Build a function that removes short words from each text.

    Parameters
    ----------
    min_length : int
        Length threshold. A word is kept only when ``len(word) > min_length``,
        i.e. words of exactly ``min_length`` characters are removed as well.

    Returns
    -------
    callable
        ``remove_short_words(X)``, which applies the filter to every element
        of ``X`` (split on whitespace, re-joined with single spaces).
    """
    def _keep_long_words(text):
        kept = []
        for token in text.split():
            if len(token) > min_length:
                kept.append(token)
        return ' '.join(kept)

    def remove_short_words(X):
        return X.apply(_keep_long_words)

    return remove_short_words


# Call the factory with a threshold of 3 (only words longer than 3 characters
# survive) and rebind the name to the resulting scikit-learn transformer.
remove_short_words_transformer = FunctionTransformer(
    func=remove_short_words_transformer(3)
)

In [8]:
from nlp_id.lemmatizer import Lemmatizer

# A single Lemmatizer instance is created once and shared by every call below.
lemmatizer = Lemmatizer()


def lemmatization_transformer(X):
    """Lemmatize every whitespace-separated word of every text in ``X``.

    Each element is split on whitespace, each word is passed individually to
    ``lemmatizer.lemmatize``, and the results are re-joined with single spaces.

    Parameters
    ----------
    X : pandas Series (or similar with ``apply``) whose elements are strings.

    Returns
    -------
    The result of ``X.apply`` with the lemmatized text for each element.
    """
    def _lemmatize_text(text):
        lemmas = [lemmatizer.lemmatize(token) for token in text.split()]
        return ' '.join(lemmas)

    return X.apply(_lemmatize_text)


# Rebind the name to a scikit-learn transformer wrapping the function above.
lemmatization_transformer = FunctionTransformer(func=lemmatization_transformer)

In [9]:
from sklearn.pipeline import Pipeline

# Steps run in the listed order; each one receives the previous step's output.
preprocessing_steps = [
    ('lowercase', lowercase_transformer),
    ('remove_punctuation', remove_punctuation_transformer),
    ('remove_numbers', remove_numbers_transformer),
    ('remove_whitespace', remove_whitespace_transformer),
    ('remove_short_words', remove_short_words_transformer),
    ('lemmatization', lemmatization_transformer),
    ('stopwords_remover', remove_stopwords_transformer),
]

preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

In [10]:
from tqdm import tqdm
import pandas as pd

# The distinct study programs drive which per-program raw files are processed.
# Missing values are dropped because the slug below calls .lower() on each one.
study_programs = (
    pd.read_csv('../data/raw/02082024.csv')['study_program'].dropna().unique()
)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('../data/preprocessed', exist_ok=True)

# Keep a handle on the bar instance: its description can only be changed
# through the instance (assigning to `tqdm.desc` sets a class attribute and
# leaves the displayed bar untouched).
progress_bar = tqdm(study_programs, desc="Processing files")

for study_program in progress_bar:
    # File names use the lower-cased program name with spaces turned into dashes.
    program_slug = study_program.lower().replace(' ', '-')
    file_name = f"../data/raw/{program_slug}-raw.csv"

    if not os.path.exists(file_name):
        # tqdm.write prints without breaking the progress bar rendering.
        tqdm.write(f"File {file_name} does not exist.")
        continue

    df = pd.read_csv(file_name)

    abstracts = df['abstract']
    abstracts_processed = preprocessing_pipeline.fit_transform(abstracts)

    output_file_name = f"../data/preprocessed/{program_slug}-preprocessed.csv"
    abstracts_processed.to_csv(output_file_name)

    progress_bar.set_description(f"Processed {study_program}")

Processing files: 100%|██████████| 11/11 [00:05<00:00,  2.05it/s]
