In [1]:
import os.path
import pandas as pd
import spacy

from tqdm import tqdm
from time import time
from os import path
from stempel import StempelStemmer
from lib.preprocessing import lower_text
from lib.preprocessing import remove_numbers_punctuation_whitespaces
from lib.preprocessing import remove_stopwords
from lib.preprocessing import apply_stempel_stemmer
from lib.preprocessing import apply_spacy_lemmatize
from lib.preprocessing import remove_empty_documents

In [2]:
tqdm.pandas()

with open('data/stopwords', 'r', encoding='UTF-8') as f:
    STOPWORDS = [line.replace('\n','') for line in f.readlines()]

In [3]:
min_word_length = 2
min_count_of_words = 10

dset_path = 'data/limit_5K_per_type_order_by_id_desc'

df_path = f'{dset_path}/00_chosen_types/chosen_types_df.csv'

df_clean_path = f'{dset_path}/01_processed/cleaned.csv'
df_stemm_path = f'{dset_path}/01_processed/stemmed.csv'
df_lemma_path = f'{dset_path}/01_processed/lemmatized.csv'

if not path.exists(f'{dset_path}/01_processed'):
    os.mkdir(f'{dset_path}/01_processed')

# Leaving id, doc_type and text cols

In [5]:
c_final_names = {'id_dokument': 'id', 'text': 'text', 'id_typ_dokument_true': 'type'}

df = pd.read_csv(df_path, sep=';')

df = df[c_final_names.keys()]
df = df.rename(columns=c_final_names)

# DATA CLEANING

## Data cleaning

In [7]:
start_time = time()

print('Lowering...')
df['text'] = df['text'].progress_apply(lower_text)

print(f'Serching 4 all words with length > {min_word_length} && removing numbers, punctuation and whitespaces...')
df['text'] = df['text'].progress_apply(remove_numbers_punctuation_whitespaces, args=[min_word_length])

print('Removing stopwords...')
df['text'] = df['text'].progress_apply(remove_stopwords, args=[STOPWORDS])

print('Arr to str...')
df['text'] = df['text'].progress_apply(lambda words: ' '.join(words))

print(f'Process ENDED in {(time()-start_time)/60}min')

print(f'Removing documents having fewer then {min_count_of_words} words...',
            f"\n\tBEFORE : count of documents = {df.shape[0]}")
df = remove_empty_documents(df, min_count_of_words)
print(f"\tAFTER : count of documents = {df.shape[0]}")

print(f'Writing to {df_clean_path}...')
df.to_csv(df_clean_path, index=False, sep=';')

Lowering...


100%|██████████| 274966/274966 [00:10<00:00, 26835.45it/s]


Serching 4 all words with length > 2 && removing numbers, punctuation and whitespaces...


100%|██████████| 274966/274966 [02:42<00:00, 1693.31it/s]


Removing stopwords...


100%|██████████| 274966/274966 [21:58<00:00, 208.54it/s]


Arr to str...


100%|██████████| 274966/274966 [01:28<00:00, 3121.93it/s]


Process ENDED in 28.230589834849038min
Removing documents having fewer then 10 words... 
	BEFORE : count of documents = 274966
	AFTER : count of documents = 271885
Writing to data/limit_5K_per_type_order_by_id_desc/01_processed/cleaned.csv...


## Data cleaning using stemming

In [8]:
print(f'Reading from {df_clean_path}...')
df = pd.read_csv(df_clean_path, sep=';')

start_time = time()

print('Loading stemmer...')
stemmer = StempelStemmer.polimorf()

print('Stemming...')
if type(df['text'][0]) == str:
    df['text'] = df['text'].apply(lambda text: str(text).split())
df['text'] = df['text'].progress_apply(apply_stempel_stemmer, args=[stemmer, min_word_length])

print('Arr to str...')
df['text'] = df['text'].progress_apply(lambda words: ' '.join(words))

print(f'Searching 4 all words with length > {min_word_length} && removing numbers, punctuation and whitespaces...')
df['text'] = df['text'].progress_apply(remove_numbers_punctuation_whitespaces, args=[min_word_length])

print('Removing stopwords...')
df['text'] = df['text'].progress_apply(remove_stopwords, args=[STOPWORDS])

print('Arr to str...')
df['text'] = df['text'].progress_apply(lambda words: ' '.join(words))

print(f'Process ENDED in {(time()-start_time)/60}min')

print(f'Writing to {df_stemm_path}...')
df.to_csv(df_stemm_path, index=False, sep=';')

Reading from data/limit_5K_per_type_order_by_id_desc/01_processed/cleaned.csv...
Loading stemmer...


Loading: 100%|██████████| 11368252/11368252 [00:09<00:00, 1202604.44bytes/s]


Stemming...


100%|██████████| 271885/271885 [1:01:04<00:00, 74.20it/s] 


Arr to str...


100%|██████████| 271885/271885 [01:46<00:00, 2542.33it/s]


Searching 4 all words with length > 2 && removing numbers, punctuation and whitespaces...


100%|██████████| 271885/271885 [01:11<00:00, 3827.71it/s]


Removing stopwords...


100%|██████████| 271885/271885 [16:35<00:00, 273.10it/s]


Arr to str...


100%|██████████| 271885/271885 [01:18<00:00, 3466.92it/s]


Process ENDED in 85.49818791945775min
Writing to data/limit_5K_per_type_order_by_id_desc/01_processed/stemmed.csv...


## Data cleaning using lemmatizing

In [5]:
print(f'Reading from {df_clean_path}...')
df = pd.read_csv(df_clean_path, sep=';')

start_time = time()

print('Loading stemmer...')
stemmer = StempelStemmer.polimorf()

print('Loading lemmatizer...')
lemmatizer = spacy.load("pl_core_news_md")

print('Lemmatizing...')
if type(df['text'][0]) != str:
    df['text'] = df['text'].apply(lambda words: ' '.join(words))
df['text'] = df['text'].progress_apply(apply_spacy_lemmatize, args=[lemmatizer])

print('Arr to str...')
df['text'] = df['text'].progress_apply(lambda words: ' '.join(words))

print(f'Searching 4 all words with length > {min_word_length} && removing numbers, punctuation and whitespaces...')
df['text'] = df['text'].progress_apply(remove_numbers_punctuation_whitespaces, args=[min_word_length])

print('Removing stopwords...')
df['text'] = df['text'].progress_apply(remove_stopwords, args=[STOPWORDS])

print('Arr to str...')
df['text'] = df['text'].progress_apply(lambda words: ' '.join(words))

print(f'Process ENDED in {(time()-start_time)/60}min')

print(f'Writing to {df_lemma_path}...')
df.to_csv(df_lemma_path, index=False, sep=';')

Reading from data/limit_5K_per_type_order_by_id_desc/01_processed/cleaned.csv...
Loading stemmer...


Loading: 100%|██████████| 11368252/11368252 [00:09<00:00, 1182653.45bytes/s]


Loading lemmatizer...
Lemmatizing...


100%|██████████| 271885/271885 [6:30:26<00:00, 11.61it/s]


Arr to str...


100%|██████████| 271885/271885 [02:00<00:00, 2247.64it/s]


Searching 4 all words with length > 2 && removing numbers, punctuation and whitespaces...


100%|██████████| 271885/271885 [01:35<00:00, 2852.93it/s]


Removing stopwords...


100%|██████████| 271885/271885 [14:54<00:00, 304.00it/s]


Arr to str...


100%|██████████| 271885/271885 [00:59<00:00, 4550.65it/s]


Process ENDED in 412.9527033408483min
Writing to data/limit_5K_per_type_order_by_id_desc/01_processed/lemmatized.csv...
